diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,215072 @@ +{ + "best_global_step": 107424, + "best_metric": 0.7989339828491211, + "best_model_checkpoint": "saves_multiple/prompt-tuning/llama-3-8b-instruct/train_math_qa_123_1760637721/checkpoint-107424", + "epoch": 20.0, + "eval_steps": 6714, + "global_step": 134280, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007447125409591898, + "grad_norm": 16.875, + "learning_rate": 8.936550491510277e-06, + "loss": 2.0541, + "num_input_tokens_seen": 2976, + "step": 5 + }, + { + "epoch": 0.0014894250819183796, + "grad_norm": 11.4375, + "learning_rate": 2.0107238605898124e-05, + "loss": 2.1465, + "num_input_tokens_seen": 5920, + "step": 10 + }, + { + "epoch": 0.002234137622877569, + "grad_norm": 7.3125, + "learning_rate": 3.1277926720285965e-05, + "loss": 1.6496, + "num_input_tokens_seen": 8896, + "step": 15 + }, + { + "epoch": 0.002978850163836759, + "grad_norm": 7.28125, + "learning_rate": 4.244861483467382e-05, + "loss": 1.4091, + "num_input_tokens_seen": 11744, + "step": 20 + }, + { + "epoch": 0.0037235627047959487, + "grad_norm": 1.6484375, + "learning_rate": 5.361930294906166e-05, + "loss": 0.6603, + "num_input_tokens_seen": 14496, + "step": 25 + }, + { + "epoch": 0.004468275245755138, + "grad_norm": 6.59375, + "learning_rate": 6.478999106344951e-05, + "loss": 0.8271, + "num_input_tokens_seen": 17440, + "step": 30 + }, + { + "epoch": 0.005212987786714328, + "grad_norm": 13.4375, + "learning_rate": 7.596067917783735e-05, + "loss": 0.8999, + "num_input_tokens_seen": 20352, + "step": 35 + }, + { + "epoch": 0.005957700327673518, + "grad_norm": 6.1875, + "learning_rate": 8.71313672922252e-05, + "loss": 0.9275, + "num_input_tokens_seen": 23328, + "step": 40 + }, + { + "epoch": 0.006702412868632708, + "grad_norm": 7.125, + "learning_rate": 9.830205540661304e-05, + "loss": 0.7633, + "num_input_tokens_seen": 26304, + "step": 45 + }, + { + "epoch": 0.0074471254095918975, + "grad_norm": 17.375, + "learning_rate": 0.0001094727435210009, + "loss": 0.9321, + "num_input_tokens_seen": 29184, + "step": 50 + }, + { + "epoch": 0.008191837950551088, + "grad_norm": 9.8125, + "learning_rate": 0.00012064343163538874, + "loss": 0.8288, + "num_input_tokens_seen": 32544, + "step": 55 + }, + { + "epoch": 0.008936550491510277, + "grad_norm": 11.875, + "learning_rate": 0.0001318141197497766, + "loss": 0.7911, + "num_input_tokens_seen": 35520, + "step": 60 + }, + { + "epoch": 0.009681263032469467, + "grad_norm": 31.125, + "learning_rate": 0.00014298480786416443, + "loss": 0.9028, + "num_input_tokens_seen": 38400, + "step": 65 + }, + { + "epoch": 0.010425975573428656, + "grad_norm": 7.65625, + "learning_rate": 0.00015415549597855229, + "loss": 0.7724, + "num_input_tokens_seen": 41408, + "step": 70 + }, + { + "epoch": 0.011170688114387846, + "grad_norm": 5.875, + "learning_rate": 0.00016532618409294012, + "loss": 0.7992, + "num_input_tokens_seen": 44352, + "step": 75 + }, + { + "epoch": 0.011915400655347037, + "grad_norm": 4.78125, + "learning_rate": 0.00017649687220732795, + "loss": 0.7498, + "num_input_tokens_seen": 46912, + "step": 80 + }, + { + "epoch": 0.012660113196306225, + "grad_norm": 13.625, + "learning_rate": 0.00018766756032171581, + "loss": 0.8477, + "num_input_tokens_seen": 49472, + "step": 85 + }, + { + "epoch": 0.013404825737265416, + "grad_norm": 5.78125, + "learning_rate": 0.00019883824843610365, + "loss": 0.8396, + "num_input_tokens_seen": 52096, + "step": 90 + }, + { + "epoch": 0.014149538278224605, + "grad_norm": 5.8125, + "learning_rate": 0.0002100089365504915, + "loss": 0.82, + "num_input_tokens_seen": 54944, + "step": 95 + }, + { + "epoch": 0.014894250819183795, + "grad_norm": 6.875, + "learning_rate": 0.00022117962466487934, + "loss": 0.7666, + "num_input_tokens_seen": 57664, + "step": 100 + }, + { + "epoch": 0.015638963360142984, + "grad_norm": 7.9375, + "learning_rate": 0.0002323503127792672, + "loss": 0.8512, + "num_input_tokens_seen": 60448, + "step": 105 + }, + { + "epoch": 0.016383675901102176, + "grad_norm": 5.5, + "learning_rate": 0.00024352100089365507, + "loss": 0.8179, + "num_input_tokens_seen": 63808, + "step": 110 + }, + { + "epoch": 0.017128388442061365, + "grad_norm": 7.125, + "learning_rate": 0.00025469168900804287, + "loss": 0.7675, + "num_input_tokens_seen": 66464, + "step": 115 + }, + { + "epoch": 0.017873100983020553, + "grad_norm": 7.09375, + "learning_rate": 0.00026586237712243073, + "loss": 0.7479, + "num_input_tokens_seen": 69856, + "step": 120 + }, + { + "epoch": 0.018617813523979745, + "grad_norm": 9.375, + "learning_rate": 0.0002770330652368186, + "loss": 0.8035, + "num_input_tokens_seen": 72768, + "step": 125 + }, + { + "epoch": 0.019362526064938934, + "grad_norm": 9.375, + "learning_rate": 0.0002882037533512064, + "loss": 0.7734, + "num_input_tokens_seen": 75616, + "step": 130 + }, + { + "epoch": 0.020107238605898123, + "grad_norm": 10.5625, + "learning_rate": 0.00029937444146559426, + "loss": 0.7944, + "num_input_tokens_seen": 78656, + "step": 135 + }, + { + "epoch": 0.02085195114685731, + "grad_norm": 4.75, + "learning_rate": 0.0003105451295799821, + "loss": 0.8096, + "num_input_tokens_seen": 81760, + "step": 140 + }, + { + "epoch": 0.021596663687816504, + "grad_norm": 5.03125, + "learning_rate": 0.00032171581769437, + "loss": 0.7637, + "num_input_tokens_seen": 84928, + "step": 145 + }, + { + "epoch": 0.022341376228775692, + "grad_norm": 2.421875, + "learning_rate": 0.0003328865058087578, + "loss": 0.845, + "num_input_tokens_seen": 88096, + "step": 150 + }, + { + "epoch": 0.02308608876973488, + "grad_norm": 5.96875, + "learning_rate": 0.00034405719392314565, + "loss": 0.7076, + "num_input_tokens_seen": 90912, + "step": 155 + }, + { + "epoch": 0.023830801310694073, + "grad_norm": 20.0, + "learning_rate": 0.0003552278820375335, + "loss": 0.8118, + "num_input_tokens_seen": 93696, + "step": 160 + }, + { + "epoch": 0.024575513851653262, + "grad_norm": 44.5, + "learning_rate": 0.0003663985701519213, + "loss": 0.9098, + "num_input_tokens_seen": 96992, + "step": 165 + }, + { + "epoch": 0.02532022639261245, + "grad_norm": 6.65625, + "learning_rate": 0.0003775692582663092, + "loss": 0.7708, + "num_input_tokens_seen": 99744, + "step": 170 + }, + { + "epoch": 0.026064938933571643, + "grad_norm": 5.9375, + "learning_rate": 0.00038873994638069704, + "loss": 0.767, + "num_input_tokens_seen": 102432, + "step": 175 + }, + { + "epoch": 0.02680965147453083, + "grad_norm": 27.125, + "learning_rate": 0.0003999106344950849, + "loss": 0.7739, + "num_input_tokens_seen": 105440, + "step": 180 + }, + { + "epoch": 0.02755436401549002, + "grad_norm": 9.3125, + "learning_rate": 0.0004110813226094727, + "loss": 0.8483, + "num_input_tokens_seen": 108320, + "step": 185 + }, + { + "epoch": 0.02829907655644921, + "grad_norm": 7.71875, + "learning_rate": 0.00042225201072386057, + "loss": 0.7432, + "num_input_tokens_seen": 111264, + "step": 190 + }, + { + "epoch": 0.0290437890974084, + "grad_norm": 6.5, + "learning_rate": 0.00043342269883824843, + "loss": 0.6927, + "num_input_tokens_seen": 114432, + "step": 195 + }, + { + "epoch": 0.02978850163836759, + "grad_norm": 5.71875, + "learning_rate": 0.00044459338695263624, + "loss": 1.0435, + "num_input_tokens_seen": 117248, + "step": 200 + }, + { + "epoch": 0.03053321417932678, + "grad_norm": 14.125, + "learning_rate": 0.0004557640750670241, + "loss": 0.8675, + "num_input_tokens_seen": 120512, + "step": 205 + }, + { + "epoch": 0.03127792672028597, + "grad_norm": 12.125, + "learning_rate": 0.00046693476318141196, + "loss": 0.7546, + "num_input_tokens_seen": 123776, + "step": 210 + }, + { + "epoch": 0.032022639261245156, + "grad_norm": 5.09375, + "learning_rate": 0.0004781054512957998, + "loss": 0.7107, + "num_input_tokens_seen": 126880, + "step": 215 + }, + { + "epoch": 0.03276735180220435, + "grad_norm": 9.5625, + "learning_rate": 0.0004892761394101877, + "loss": 0.9462, + "num_input_tokens_seen": 130112, + "step": 220 + }, + { + "epoch": 0.03351206434316354, + "grad_norm": 15.375, + "learning_rate": 0.0005004468275245754, + "loss": 0.8655, + "num_input_tokens_seen": 132704, + "step": 225 + }, + { + "epoch": 0.03425677688412273, + "grad_norm": 12.6875, + "learning_rate": 0.0005116175156389634, + "loss": 0.8188, + "num_input_tokens_seen": 135968, + "step": 230 + }, + { + "epoch": 0.03500148942508192, + "grad_norm": 4.59375, + "learning_rate": 0.0005227882037533513, + "loss": 0.7882, + "num_input_tokens_seen": 138752, + "step": 235 + }, + { + "epoch": 0.035746201966041107, + "grad_norm": 9.6875, + "learning_rate": 0.000533958891867739, + "loss": 0.8318, + "num_input_tokens_seen": 141792, + "step": 240 + }, + { + "epoch": 0.036490914507000295, + "grad_norm": 5.4375, + "learning_rate": 0.0005451295799821269, + "loss": 0.7506, + "num_input_tokens_seen": 144640, + "step": 245 + }, + { + "epoch": 0.03723562704795949, + "grad_norm": 7.96875, + "learning_rate": 0.0005563002680965147, + "loss": 0.7676, + "num_input_tokens_seen": 147648, + "step": 250 + }, + { + "epoch": 0.03798033958891868, + "grad_norm": 188.0, + "learning_rate": 0.0005674709562109025, + "loss": 0.8602, + "num_input_tokens_seen": 150432, + "step": 255 + }, + { + "epoch": 0.03872505212987787, + "grad_norm": 7.3125, + "learning_rate": 0.0005786416443252904, + "loss": 0.7636, + "num_input_tokens_seen": 153312, + "step": 260 + }, + { + "epoch": 0.03946976467083706, + "grad_norm": 7.6875, + "learning_rate": 0.0005898123324396783, + "loss": 0.8312, + "num_input_tokens_seen": 156288, + "step": 265 + }, + { + "epoch": 0.040214477211796246, + "grad_norm": 106.0, + "learning_rate": 0.0006009830205540662, + "loss": 1.1462, + "num_input_tokens_seen": 159136, + "step": 270 + }, + { + "epoch": 0.040959189752755434, + "grad_norm": 15.5, + "learning_rate": 0.0006121537086684539, + "loss": 1.0654, + "num_input_tokens_seen": 162144, + "step": 275 + }, + { + "epoch": 0.04170390229371462, + "grad_norm": 15.375, + "learning_rate": 0.0006233243967828418, + "loss": 0.8154, + "num_input_tokens_seen": 164832, + "step": 280 + }, + { + "epoch": 0.04244861483467382, + "grad_norm": 13.125, + "learning_rate": 0.0006344950848972297, + "loss": 1.1192, + "num_input_tokens_seen": 167744, + "step": 285 + }, + { + "epoch": 0.04319332737563301, + "grad_norm": 12.5625, + "learning_rate": 0.0006456657730116174, + "loss": 0.8284, + "num_input_tokens_seen": 170528, + "step": 290 + }, + { + "epoch": 0.043938039916592196, + "grad_norm": 93.0, + "learning_rate": 0.0006568364611260053, + "loss": 1.1555, + "num_input_tokens_seen": 173440, + "step": 295 + }, + { + "epoch": 0.044682752457551385, + "grad_norm": 16.0, + "learning_rate": 0.0006680071492403932, + "loss": 0.7943, + "num_input_tokens_seen": 176160, + "step": 300 + }, + { + "epoch": 0.045427464998510574, + "grad_norm": 217.0, + "learning_rate": 0.0006791778373547811, + "loss": 2.7377, + "num_input_tokens_seen": 178816, + "step": 305 + }, + { + "epoch": 0.04617217753946976, + "grad_norm": 187.0, + "learning_rate": 0.0006903485254691689, + "loss": 2.2127, + "num_input_tokens_seen": 181664, + "step": 310 + }, + { + "epoch": 0.04691689008042895, + "grad_norm": 72.0, + "learning_rate": 0.0007015192135835567, + "loss": 1.075, + "num_input_tokens_seen": 184384, + "step": 315 + }, + { + "epoch": 0.04766160262138815, + "grad_norm": 18.125, + "learning_rate": 0.0007126899016979446, + "loss": 0.8721, + "num_input_tokens_seen": 187072, + "step": 320 + }, + { + "epoch": 0.048406315162347335, + "grad_norm": 17.25, + "learning_rate": 0.0007238605898123323, + "loss": 0.8984, + "num_input_tokens_seen": 189920, + "step": 325 + }, + { + "epoch": 0.049151027703306524, + "grad_norm": 27.375, + "learning_rate": 0.0007350312779267203, + "loss": 0.9418, + "num_input_tokens_seen": 192608, + "step": 330 + }, + { + "epoch": 0.04989574024426571, + "grad_norm": 15.25, + "learning_rate": 0.0007462019660411082, + "loss": 0.8673, + "num_input_tokens_seen": 195744, + "step": 335 + }, + { + "epoch": 0.0506404527852249, + "grad_norm": 37.25, + "learning_rate": 0.000757372654155496, + "loss": 0.9487, + "num_input_tokens_seen": 198272, + "step": 340 + }, + { + "epoch": 0.05138516532618409, + "grad_norm": 6.75, + "learning_rate": 0.0007685433422698838, + "loss": 0.9024, + "num_input_tokens_seen": 201152, + "step": 345 + }, + { + "epoch": 0.052129877867143286, + "grad_norm": 14.125, + "learning_rate": 0.0007797140303842716, + "loss": 0.8935, + "num_input_tokens_seen": 203936, + "step": 350 + }, + { + "epoch": 0.052874590408102475, + "grad_norm": 4.1875, + "learning_rate": 0.0007908847184986595, + "loss": 0.8749, + "num_input_tokens_seen": 206976, + "step": 355 + }, + { + "epoch": 0.05361930294906166, + "grad_norm": 17.875, + "learning_rate": 0.0008020554066130472, + "loss": 0.9481, + "num_input_tokens_seen": 210048, + "step": 360 + }, + { + "epoch": 0.05436401549002085, + "grad_norm": 11.25, + "learning_rate": 0.0008132260947274352, + "loss": 0.9795, + "num_input_tokens_seen": 212896, + "step": 365 + }, + { + "epoch": 0.05510872803098004, + "grad_norm": 2.609375, + "learning_rate": 0.0008243967828418231, + "loss": 0.9713, + "num_input_tokens_seen": 215872, + "step": 370 + }, + { + "epoch": 0.05585344057193923, + "grad_norm": 10.625, + "learning_rate": 0.0008355674709562109, + "loss": 0.7804, + "num_input_tokens_seen": 218624, + "step": 375 + }, + { + "epoch": 0.05659815311289842, + "grad_norm": 23.25, + "learning_rate": 0.0008467381590705987, + "loss": 0.9118, + "num_input_tokens_seen": 221408, + "step": 380 + }, + { + "epoch": 0.057342865653857614, + "grad_norm": 6.625, + "learning_rate": 0.0008579088471849866, + "loss": 0.8948, + "num_input_tokens_seen": 224224, + "step": 385 + }, + { + "epoch": 0.0580875781948168, + "grad_norm": 2.375, + "learning_rate": 0.0008690795352993744, + "loss": 0.8262, + "num_input_tokens_seen": 227136, + "step": 390 + }, + { + "epoch": 0.05883229073577599, + "grad_norm": 5.1875, + "learning_rate": 0.0008802502234137622, + "loss": 0.7863, + "num_input_tokens_seen": 230272, + "step": 395 + }, + { + "epoch": 0.05957700327673518, + "grad_norm": 2.515625, + "learning_rate": 0.0008914209115281501, + "loss": 0.9703, + "num_input_tokens_seen": 233568, + "step": 400 + }, + { + "epoch": 0.06032171581769437, + "grad_norm": 14.375, + "learning_rate": 0.000902591599642538, + "loss": 0.7932, + "num_input_tokens_seen": 236384, + "step": 405 + }, + { + "epoch": 0.06106642835865356, + "grad_norm": 5.90625, + "learning_rate": 0.0009137622877569257, + "loss": 0.9386, + "num_input_tokens_seen": 239424, + "step": 410 + }, + { + "epoch": 0.06181114089961275, + "grad_norm": 3.28125, + "learning_rate": 0.0009249329758713136, + "loss": 0.8366, + "num_input_tokens_seen": 242240, + "step": 415 + }, + { + "epoch": 0.06255585344057193, + "grad_norm": 356.0, + "learning_rate": 0.0009361036639857015, + "loss": 3.4608, + "num_input_tokens_seen": 244800, + "step": 420 + }, + { + "epoch": 0.06330056598153112, + "grad_norm": 181.0, + "learning_rate": 0.0009472743521000893, + "loss": 7.805, + "num_input_tokens_seen": 247968, + "step": 425 + }, + { + "epoch": 0.06404527852249031, + "grad_norm": 4.03125, + "learning_rate": 0.0009584450402144771, + "loss": 5.2863, + "num_input_tokens_seen": 250720, + "step": 430 + }, + { + "epoch": 0.06478999106344951, + "grad_norm": 3.96875, + "learning_rate": 0.0009696157283288649, + "loss": 0.8318, + "num_input_tokens_seen": 253952, + "step": 435 + }, + { + "epoch": 0.0655347036044087, + "grad_norm": 63.0, + "learning_rate": 0.0009807864164432527, + "loss": 1.1154, + "num_input_tokens_seen": 256576, + "step": 440 + }, + { + "epoch": 0.06627941614536789, + "grad_norm": 4.15625, + "learning_rate": 0.0009919571045576407, + "loss": 1.9513, + "num_input_tokens_seen": 259488, + "step": 445 + }, + { + "epoch": 0.06702412868632708, + "grad_norm": 20.375, + "learning_rate": 0.0010031277926720286, + "loss": 3.3339, + "num_input_tokens_seen": 262112, + "step": 450 + }, + { + "epoch": 0.06776884122728627, + "grad_norm": 14.625, + "learning_rate": 0.0010142984807864164, + "loss": 3.7901, + "num_input_tokens_seen": 265120, + "step": 455 + }, + { + "epoch": 0.06851355376824546, + "grad_norm": 5.5625, + "learning_rate": 0.0010254691689008044, + "loss": 2.2158, + "num_input_tokens_seen": 267872, + "step": 460 + }, + { + "epoch": 0.06925826630920465, + "grad_norm": 13.0625, + "learning_rate": 0.0010366398570151921, + "loss": 1.3114, + "num_input_tokens_seen": 270464, + "step": 465 + }, + { + "epoch": 0.07000297885016384, + "grad_norm": 25.625, + "learning_rate": 0.00104781054512958, + "loss": 1.6267, + "num_input_tokens_seen": 273152, + "step": 470 + }, + { + "epoch": 0.07074769139112302, + "grad_norm": 8.0625, + "learning_rate": 0.0010589812332439678, + "loss": 0.9056, + "num_input_tokens_seen": 275968, + "step": 475 + }, + { + "epoch": 0.07149240393208221, + "grad_norm": 10.3125, + "learning_rate": 0.0010701519213583556, + "loss": 1.0221, + "num_input_tokens_seen": 278848, + "step": 480 + }, + { + "epoch": 0.0722371164730414, + "grad_norm": 3.921875, + "learning_rate": 0.0010813226094727436, + "loss": 1.0297, + "num_input_tokens_seen": 281664, + "step": 485 + }, + { + "epoch": 0.07298182901400059, + "grad_norm": 3.90625, + "learning_rate": 0.0010924932975871313, + "loss": 1.0363, + "num_input_tokens_seen": 285056, + "step": 490 + }, + { + "epoch": 0.07372654155495978, + "grad_norm": 5.375, + "learning_rate": 0.001103663985701519, + "loss": 0.9797, + "num_input_tokens_seen": 287872, + "step": 495 + }, + { + "epoch": 0.07447125409591898, + "grad_norm": 5.5, + "learning_rate": 0.001114834673815907, + "loss": 0.9748, + "num_input_tokens_seen": 290752, + "step": 500 + }, + { + "epoch": 0.07521596663687817, + "grad_norm": 3.359375, + "learning_rate": 0.0011260053619302948, + "loss": 0.8764, + "num_input_tokens_seen": 293824, + "step": 505 + }, + { + "epoch": 0.07596067917783736, + "grad_norm": 4.6875, + "learning_rate": 0.0011371760500446825, + "loss": 0.9853, + "num_input_tokens_seen": 296672, + "step": 510 + }, + { + "epoch": 0.07670539171879655, + "grad_norm": 8.375, + "learning_rate": 0.0011483467381590707, + "loss": 0.9858, + "num_input_tokens_seen": 299776, + "step": 515 + }, + { + "epoch": 0.07745010425975574, + "grad_norm": 4.15625, + "learning_rate": 0.0011595174262734585, + "loss": 1.0254, + "num_input_tokens_seen": 302720, + "step": 520 + }, + { + "epoch": 0.07819481680071493, + "grad_norm": 13.875, + "learning_rate": 0.0011706881143878462, + "loss": 0.9879, + "num_input_tokens_seen": 305472, + "step": 525 + }, + { + "epoch": 0.07893952934167411, + "grad_norm": 2.453125, + "learning_rate": 0.0011818588025022342, + "loss": 0.7589, + "num_input_tokens_seen": 308672, + "step": 530 + }, + { + "epoch": 0.0796842418826333, + "grad_norm": 3.109375, + "learning_rate": 0.001193029490616622, + "loss": 1.1814, + "num_input_tokens_seen": 311904, + "step": 535 + }, + { + "epoch": 0.08042895442359249, + "grad_norm": 50.75, + "learning_rate": 0.0012042001787310097, + "loss": 1.0363, + "num_input_tokens_seen": 314688, + "step": 540 + }, + { + "epoch": 0.08117366696455168, + "grad_norm": 25.25, + "learning_rate": 0.0012153708668453977, + "loss": 2.9199, + "num_input_tokens_seen": 317472, + "step": 545 + }, + { + "epoch": 0.08191837950551087, + "grad_norm": 1128.0, + "learning_rate": 0.0012265415549597854, + "loss": 1.3287, + "num_input_tokens_seen": 320128, + "step": 550 + }, + { + "epoch": 0.08266309204647006, + "grad_norm": 16.5, + "learning_rate": 0.0012377122430741734, + "loss": 1.1768, + "num_input_tokens_seen": 322944, + "step": 555 + }, + { + "epoch": 0.08340780458742925, + "grad_norm": 5.59375, + "learning_rate": 0.0012488829311885611, + "loss": 0.9197, + "num_input_tokens_seen": 325856, + "step": 560 + }, + { + "epoch": 0.08415251712838845, + "grad_norm": 3.296875, + "learning_rate": 0.001260053619302949, + "loss": 1.0059, + "num_input_tokens_seen": 328800, + "step": 565 + }, + { + "epoch": 0.08489722966934764, + "grad_norm": 1.546875, + "learning_rate": 0.0012712243074173369, + "loss": 0.8212, + "num_input_tokens_seen": 331424, + "step": 570 + }, + { + "epoch": 0.08564194221030683, + "grad_norm": 3.046875, + "learning_rate": 0.0012823949955317246, + "loss": 0.8438, + "num_input_tokens_seen": 334112, + "step": 575 + }, + { + "epoch": 0.08638665475126601, + "grad_norm": 2.6875, + "learning_rate": 0.0012935656836461126, + "loss": 0.9073, + "num_input_tokens_seen": 336992, + "step": 580 + }, + { + "epoch": 0.0871313672922252, + "grad_norm": 3.03125, + "learning_rate": 0.0013047363717605006, + "loss": 0.9097, + "num_input_tokens_seen": 340224, + "step": 585 + }, + { + "epoch": 0.08787607983318439, + "grad_norm": 1.421875, + "learning_rate": 0.0013159070598748883, + "loss": 0.9657, + "num_input_tokens_seen": 343040, + "step": 590 + }, + { + "epoch": 0.08862079237414358, + "grad_norm": 13.125, + "learning_rate": 0.001327077747989276, + "loss": 0.7222, + "num_input_tokens_seen": 345696, + "step": 595 + }, + { + "epoch": 0.08936550491510277, + "grad_norm": 5.1875, + "learning_rate": 0.001338248436103664, + "loss": 0.9915, + "num_input_tokens_seen": 348992, + "step": 600 + }, + { + "epoch": 0.09011021745606196, + "grad_norm": 2.796875, + "learning_rate": 0.0013494191242180518, + "loss": 0.9974, + "num_input_tokens_seen": 352032, + "step": 605 + }, + { + "epoch": 0.09085492999702115, + "grad_norm": 1.4375, + "learning_rate": 0.0013605898123324395, + "loss": 0.7878, + "num_input_tokens_seen": 354656, + "step": 610 + }, + { + "epoch": 0.09159964253798034, + "grad_norm": 2.359375, + "learning_rate": 0.0013717605004468275, + "loss": 0.8325, + "num_input_tokens_seen": 357504, + "step": 615 + }, + { + "epoch": 0.09234435507893952, + "grad_norm": 1.4609375, + "learning_rate": 0.0013829311885612153, + "loss": 0.849, + "num_input_tokens_seen": 360256, + "step": 620 + }, + { + "epoch": 0.09308906761989871, + "grad_norm": 0.7734375, + "learning_rate": 0.0013941018766756032, + "loss": 0.9104, + "num_input_tokens_seen": 363072, + "step": 625 + }, + { + "epoch": 0.0938337801608579, + "grad_norm": 2.1875, + "learning_rate": 0.001405272564789991, + "loss": 0.8422, + "num_input_tokens_seen": 366272, + "step": 630 + }, + { + "epoch": 0.0945784927018171, + "grad_norm": 1.0625, + "learning_rate": 0.0014164432529043787, + "loss": 0.8472, + "num_input_tokens_seen": 369376, + "step": 635 + }, + { + "epoch": 0.0953232052427763, + "grad_norm": 2.796875, + "learning_rate": 0.0014276139410187667, + "loss": 1.0073, + "num_input_tokens_seen": 372224, + "step": 640 + }, + { + "epoch": 0.09606791778373548, + "grad_norm": 1.765625, + "learning_rate": 0.0014387846291331547, + "loss": 0.8092, + "num_input_tokens_seen": 375232, + "step": 645 + }, + { + "epoch": 0.09681263032469467, + "grad_norm": 1.4140625, + "learning_rate": 0.0014499553172475424, + "loss": 0.9883, + "num_input_tokens_seen": 378272, + "step": 650 + }, + { + "epoch": 0.09755734286565386, + "grad_norm": 1.53125, + "learning_rate": 0.0014611260053619304, + "loss": 0.8369, + "num_input_tokens_seen": 381152, + "step": 655 + }, + { + "epoch": 0.09830205540661305, + "grad_norm": 1.078125, + "learning_rate": 0.0014722966934763181, + "loss": 0.8597, + "num_input_tokens_seen": 384096, + "step": 660 + }, + { + "epoch": 0.09904676794757224, + "grad_norm": 0.6484375, + "learning_rate": 0.001483467381590706, + "loss": 0.8207, + "num_input_tokens_seen": 387296, + "step": 665 + }, + { + "epoch": 0.09979148048853143, + "grad_norm": 0.474609375, + "learning_rate": 0.0014946380697050939, + "loss": 0.7681, + "num_input_tokens_seen": 390656, + "step": 670 + }, + { + "epoch": 0.10053619302949061, + "grad_norm": 3.890625, + "learning_rate": 0.0015058087578194816, + "loss": 1.0177, + "num_input_tokens_seen": 393568, + "step": 675 + }, + { + "epoch": 0.1012809055704498, + "grad_norm": 0.609375, + "learning_rate": 0.0015169794459338694, + "loss": 0.9052, + "num_input_tokens_seen": 396320, + "step": 680 + }, + { + "epoch": 0.10202561811140899, + "grad_norm": 0.78125, + "learning_rate": 0.0015281501340482573, + "loss": 0.8006, + "num_input_tokens_seen": 399328, + "step": 685 + }, + { + "epoch": 0.10277033065236818, + "grad_norm": 3.046875, + "learning_rate": 0.001539320822162645, + "loss": 0.8001, + "num_input_tokens_seen": 402176, + "step": 690 + }, + { + "epoch": 0.10351504319332737, + "grad_norm": 2.125, + "learning_rate": 0.0015504915102770328, + "loss": 0.8743, + "num_input_tokens_seen": 405088, + "step": 695 + }, + { + "epoch": 0.10425975573428657, + "grad_norm": 0.462890625, + "learning_rate": 0.0015616621983914208, + "loss": 0.8391, + "num_input_tokens_seen": 408288, + "step": 700 + }, + { + "epoch": 0.10500446827524576, + "grad_norm": 0.39453125, + "learning_rate": 0.0015728328865058086, + "loss": 0.8356, + "num_input_tokens_seen": 411488, + "step": 705 + }, + { + "epoch": 0.10574918081620495, + "grad_norm": 0.59375, + "learning_rate": 0.0015840035746201965, + "loss": 0.801, + "num_input_tokens_seen": 414272, + "step": 710 + }, + { + "epoch": 0.10649389335716414, + "grad_norm": 0.87890625, + "learning_rate": 0.0015951742627345845, + "loss": 2.2374, + "num_input_tokens_seen": 417184, + "step": 715 + }, + { + "epoch": 0.10723860589812333, + "grad_norm": 0.380859375, + "learning_rate": 0.0016063449508489723, + "loss": 0.8344, + "num_input_tokens_seen": 420096, + "step": 720 + }, + { + "epoch": 0.10798331843908252, + "grad_norm": 0.50390625, + "learning_rate": 0.0016175156389633602, + "loss": 0.7791, + "num_input_tokens_seen": 422944, + "step": 725 + }, + { + "epoch": 0.1087280309800417, + "grad_norm": 0.62890625, + "learning_rate": 0.001628686327077748, + "loss": 0.9282, + "num_input_tokens_seen": 425824, + "step": 730 + }, + { + "epoch": 0.10947274352100089, + "grad_norm": 0.52734375, + "learning_rate": 0.0016398570151921357, + "loss": 0.7881, + "num_input_tokens_seen": 428832, + "step": 735 + }, + { + "epoch": 0.11021745606196008, + "grad_norm": 0.3046875, + "learning_rate": 0.0016510277033065237, + "loss": 0.8267, + "num_input_tokens_seen": 432032, + "step": 740 + }, + { + "epoch": 0.11096216860291927, + "grad_norm": 1.109375, + "learning_rate": 0.0016621983914209115, + "loss": 0.8291, + "num_input_tokens_seen": 434752, + "step": 745 + }, + { + "epoch": 0.11170688114387846, + "grad_norm": 0.251953125, + "learning_rate": 0.0016733690795352992, + "loss": 0.7834, + "num_input_tokens_seen": 437568, + "step": 750 + }, + { + "epoch": 0.11245159368483765, + "grad_norm": 0.26171875, + "learning_rate": 0.0016845397676496872, + "loss": 0.8327, + "num_input_tokens_seen": 440736, + "step": 755 + }, + { + "epoch": 0.11319630622579684, + "grad_norm": 0.306640625, + "learning_rate": 0.001695710455764075, + "loss": 0.7988, + "num_input_tokens_seen": 443456, + "step": 760 + }, + { + "epoch": 0.11394101876675604, + "grad_norm": 0.2490234375, + "learning_rate": 0.0017068811438784627, + "loss": 0.7976, + "num_input_tokens_seen": 446464, + "step": 765 + }, + { + "epoch": 0.11468573130771523, + "grad_norm": 0.6640625, + "learning_rate": 0.0017180518319928507, + "loss": 0.8453, + "num_input_tokens_seen": 449216, + "step": 770 + }, + { + "epoch": 0.11543044384867442, + "grad_norm": 0.361328125, + "learning_rate": 0.0017292225201072384, + "loss": 0.7951, + "num_input_tokens_seen": 452064, + "step": 775 + }, + { + "epoch": 0.1161751563896336, + "grad_norm": 0.279296875, + "learning_rate": 0.0017403932082216266, + "loss": 0.8149, + "num_input_tokens_seen": 454912, + "step": 780 + }, + { + "epoch": 0.1169198689305928, + "grad_norm": 0.6171875, + "learning_rate": 0.0017515638963360143, + "loss": 0.8142, + "num_input_tokens_seen": 457984, + "step": 785 + }, + { + "epoch": 0.11766458147155198, + "grad_norm": 0.33203125, + "learning_rate": 0.001762734584450402, + "loss": 0.8235, + "num_input_tokens_seen": 460704, + "step": 790 + }, + { + "epoch": 0.11840929401251117, + "grad_norm": 0.408203125, + "learning_rate": 0.00177390527256479, + "loss": 0.8177, + "num_input_tokens_seen": 463552, + "step": 795 + }, + { + "epoch": 0.11915400655347036, + "grad_norm": 0.314453125, + "learning_rate": 0.0017850759606791778, + "loss": 0.8046, + "num_input_tokens_seen": 466400, + "step": 800 + }, + { + "epoch": 0.11989871909442955, + "grad_norm": 0.2099609375, + "learning_rate": 0.0017962466487935656, + "loss": 0.806, + "num_input_tokens_seen": 469312, + "step": 805 + }, + { + "epoch": 0.12064343163538874, + "grad_norm": 0.166015625, + "learning_rate": 0.0018074173369079535, + "loss": 0.8303, + "num_input_tokens_seen": 472256, + "step": 810 + }, + { + "epoch": 0.12138814417634793, + "grad_norm": 0.2265625, + "learning_rate": 0.0018185880250223413, + "loss": 0.788, + "num_input_tokens_seen": 475104, + "step": 815 + }, + { + "epoch": 0.12213285671730711, + "grad_norm": 0.28515625, + "learning_rate": 0.001829758713136729, + "loss": 0.8167, + "num_input_tokens_seen": 477952, + "step": 820 + }, + { + "epoch": 0.1228775692582663, + "grad_norm": 0.291015625, + "learning_rate": 0.001840929401251117, + "loss": 0.8106, + "num_input_tokens_seen": 480704, + "step": 825 + }, + { + "epoch": 0.1236222817992255, + "grad_norm": 0.267578125, + "learning_rate": 0.0018521000893655048, + "loss": 0.8154, + "num_input_tokens_seen": 483680, + "step": 830 + }, + { + "epoch": 0.1243669943401847, + "grad_norm": 0.1650390625, + "learning_rate": 0.0018632707774798925, + "loss": 0.8137, + "num_input_tokens_seen": 486720, + "step": 835 + }, + { + "epoch": 0.12511170688114387, + "grad_norm": 0.2021484375, + "learning_rate": 0.0018744414655942805, + "loss": 0.8004, + "num_input_tokens_seen": 489536, + "step": 840 + }, + { + "epoch": 0.12585641942210307, + "grad_norm": 0.333984375, + "learning_rate": 0.0018856121537086685, + "loss": 0.7934, + "num_input_tokens_seen": 492256, + "step": 845 + }, + { + "epoch": 0.12660113196306225, + "grad_norm": 0.365234375, + "learning_rate": 0.0018967828418230562, + "loss": 0.7985, + "num_input_tokens_seen": 495168, + "step": 850 + }, + { + "epoch": 0.12734584450402145, + "grad_norm": 0.2392578125, + "learning_rate": 0.0019079535299374442, + "loss": 0.8383, + "num_input_tokens_seen": 498240, + "step": 855 + }, + { + "epoch": 0.12809055704498062, + "grad_norm": 0.2421875, + "learning_rate": 0.0019191242180518317, + "loss": 0.7888, + "num_input_tokens_seen": 501216, + "step": 860 + }, + { + "epoch": 0.12883526958593983, + "grad_norm": 0.419921875, + "learning_rate": 0.00193029490616622, + "loss": 0.8048, + "num_input_tokens_seen": 504352, + "step": 865 + }, + { + "epoch": 0.12957998212689903, + "grad_norm": 0.23828125, + "learning_rate": 0.0019414655942806074, + "loss": 0.7945, + "num_input_tokens_seen": 507168, + "step": 870 + }, + { + "epoch": 0.1303246946678582, + "grad_norm": 0.21484375, + "learning_rate": 0.0019526362823949954, + "loss": 0.8243, + "num_input_tokens_seen": 510112, + "step": 875 + }, + { + "epoch": 0.1310694072088174, + "grad_norm": 0.173828125, + "learning_rate": 0.0019638069705093836, + "loss": 0.8002, + "num_input_tokens_seen": 512896, + "step": 880 + }, + { + "epoch": 0.13181411974977658, + "grad_norm": 0.130859375, + "learning_rate": 0.001974977658623771, + "loss": 0.8292, + "num_input_tokens_seen": 515744, + "step": 885 + }, + { + "epoch": 0.13255883229073578, + "grad_norm": 0.203125, + "learning_rate": 0.001986148346738159, + "loss": 0.8324, + "num_input_tokens_seen": 518624, + "step": 890 + }, + { + "epoch": 0.13330354483169496, + "grad_norm": 0.1767578125, + "learning_rate": 0.0019973190348525466, + "loss": 0.797, + "num_input_tokens_seen": 521824, + "step": 895 + }, + { + "epoch": 0.13404825737265416, + "grad_norm": 0.1298828125, + "learning_rate": 0.002008489722966935, + "loss": 0.8214, + "num_input_tokens_seen": 524800, + "step": 900 + }, + { + "epoch": 0.13479296991361334, + "grad_norm": 0.181640625, + "learning_rate": 0.0020196604110813226, + "loss": 0.8126, + "num_input_tokens_seen": 527712, + "step": 905 + }, + { + "epoch": 0.13553768245457254, + "grad_norm": 0.1513671875, + "learning_rate": 0.0020308310991957105, + "loss": 0.8118, + "num_input_tokens_seen": 531200, + "step": 910 + }, + { + "epoch": 0.1362823949955317, + "grad_norm": 0.734375, + "learning_rate": 0.002042001787310098, + "loss": 0.7957, + "num_input_tokens_seen": 534080, + "step": 915 + }, + { + "epoch": 0.13702710753649092, + "grad_norm": 0.23828125, + "learning_rate": 0.002053172475424486, + "loss": 0.8202, + "num_input_tokens_seen": 536800, + "step": 920 + }, + { + "epoch": 0.1377718200774501, + "grad_norm": 0.271484375, + "learning_rate": 0.0020643431635388736, + "loss": 0.8412, + "num_input_tokens_seen": 539904, + "step": 925 + }, + { + "epoch": 0.1385165326184093, + "grad_norm": 0.4453125, + "learning_rate": 0.002075513851653262, + "loss": 0.794, + "num_input_tokens_seen": 542560, + "step": 930 + }, + { + "epoch": 0.1392612451593685, + "grad_norm": 3.25, + "learning_rate": 0.0020866845397676495, + "loss": 0.9754, + "num_input_tokens_seen": 545696, + "step": 935 + }, + { + "epoch": 0.14000595770032767, + "grad_norm": 0.4765625, + "learning_rate": 0.0020978552278820375, + "loss": 0.8658, + "num_input_tokens_seen": 548640, + "step": 940 + }, + { + "epoch": 0.14075067024128687, + "grad_norm": 0.431640625, + "learning_rate": 0.0021090259159964255, + "loss": 0.801, + "num_input_tokens_seen": 551296, + "step": 945 + }, + { + "epoch": 0.14149538278224605, + "grad_norm": 0.310546875, + "learning_rate": 0.002120196604110813, + "loss": 0.7974, + "num_input_tokens_seen": 554176, + "step": 950 + }, + { + "epoch": 0.14224009532320525, + "grad_norm": 0.27734375, + "learning_rate": 0.002131367292225201, + "loss": 0.8134, + "num_input_tokens_seen": 556928, + "step": 955 + }, + { + "epoch": 0.14298480786416443, + "grad_norm": 0.5234375, + "learning_rate": 0.002142537980339589, + "loss": 0.7909, + "num_input_tokens_seen": 560096, + "step": 960 + }, + { + "epoch": 0.14372952040512363, + "grad_norm": 0.5546875, + "learning_rate": 0.002153708668453977, + "loss": 0.8371, + "num_input_tokens_seen": 563104, + "step": 965 + }, + { + "epoch": 0.1444742329460828, + "grad_norm": 1.0, + "learning_rate": 0.0021648793565683644, + "loss": 0.7764, + "num_input_tokens_seen": 566048, + "step": 970 + }, + { + "epoch": 0.145218945487042, + "grad_norm": 0.3203125, + "learning_rate": 0.0021760500446827524, + "loss": 0.9572, + "num_input_tokens_seen": 568544, + "step": 975 + }, + { + "epoch": 0.14596365802800118, + "grad_norm": 0.341796875, + "learning_rate": 0.00218722073279714, + "loss": 0.7995, + "num_input_tokens_seen": 571616, + "step": 980 + }, + { + "epoch": 0.14670837056896038, + "grad_norm": 0.2412109375, + "learning_rate": 0.0021983914209115283, + "loss": 0.811, + "num_input_tokens_seen": 574496, + "step": 985 + }, + { + "epoch": 0.14745308310991956, + "grad_norm": 0.34765625, + "learning_rate": 0.002209562109025916, + "loss": 0.8106, + "num_input_tokens_seen": 577280, + "step": 990 + }, + { + "epoch": 0.14819779565087876, + "grad_norm": 0.2060546875, + "learning_rate": 0.002220732797140304, + "loss": 0.8042, + "num_input_tokens_seen": 580096, + "step": 995 + }, + { + "epoch": 0.14894250819183796, + "grad_norm": 0.251953125, + "learning_rate": 0.0022319034852546914, + "loss": 0.7844, + "num_input_tokens_seen": 583040, + "step": 1000 + }, + { + "epoch": 0.14968722073279714, + "grad_norm": 0.1591796875, + "learning_rate": 0.0022430741733690794, + "loss": 0.7881, + "num_input_tokens_seen": 586016, + "step": 1005 + }, + { + "epoch": 0.15043193327375634, + "grad_norm": 0.171875, + "learning_rate": 0.0022542448614834673, + "loss": 0.841, + "num_input_tokens_seen": 589056, + "step": 1010 + }, + { + "epoch": 0.15117664581471552, + "grad_norm": 0.1337890625, + "learning_rate": 0.0022654155495978553, + "loss": 0.79, + "num_input_tokens_seen": 591936, + "step": 1015 + }, + { + "epoch": 0.15192135835567472, + "grad_norm": 0.365234375, + "learning_rate": 0.0022765862377122433, + "loss": 0.7988, + "num_input_tokens_seen": 594848, + "step": 1020 + }, + { + "epoch": 0.1526660708966339, + "grad_norm": 0.220703125, + "learning_rate": 0.002287756925826631, + "loss": 0.8258, + "num_input_tokens_seen": 597792, + "step": 1025 + }, + { + "epoch": 0.1534107834375931, + "grad_norm": 0.1767578125, + "learning_rate": 0.0022989276139410188, + "loss": 0.8321, + "num_input_tokens_seen": 600768, + "step": 1030 + }, + { + "epoch": 0.15415549597855227, + "grad_norm": 0.2578125, + "learning_rate": 0.0023100983020554063, + "loss": 0.793, + "num_input_tokens_seen": 603616, + "step": 1035 + }, + { + "epoch": 0.15490020851951147, + "grad_norm": 0.1923828125, + "learning_rate": 0.0023212689901697943, + "loss": 0.7979, + "num_input_tokens_seen": 606432, + "step": 1040 + }, + { + "epoch": 0.15564492106047065, + "grad_norm": 0.251953125, + "learning_rate": 0.0023324396782841822, + "loss": 0.8154, + "num_input_tokens_seen": 609536, + "step": 1045 + }, + { + "epoch": 0.15638963360142985, + "grad_norm": 0.171875, + "learning_rate": 0.00234361036639857, + "loss": 0.803, + "num_input_tokens_seen": 612256, + "step": 1050 + }, + { + "epoch": 0.15713434614238903, + "grad_norm": 0.1181640625, + "learning_rate": 0.0023547810545129578, + "loss": 0.7985, + "num_input_tokens_seen": 615136, + "step": 1055 + }, + { + "epoch": 0.15787905868334823, + "grad_norm": 0.244140625, + "learning_rate": 0.0023659517426273457, + "loss": 0.8253, + "num_input_tokens_seen": 618144, + "step": 1060 + }, + { + "epoch": 0.15862377122430743, + "grad_norm": 0.2001953125, + "learning_rate": 0.0023771224307417333, + "loss": 0.8004, + "num_input_tokens_seen": 621440, + "step": 1065 + }, + { + "epoch": 0.1593684837652666, + "grad_norm": 0.1806640625, + "learning_rate": 0.0023882931188561217, + "loss": 0.821, + "num_input_tokens_seen": 624352, + "step": 1070 + }, + { + "epoch": 0.1601131963062258, + "grad_norm": 0.19921875, + "learning_rate": 0.002399463806970509, + "loss": 0.789, + "num_input_tokens_seen": 627392, + "step": 1075 + }, + { + "epoch": 0.16085790884718498, + "grad_norm": 0.24609375, + "learning_rate": 0.002410634495084897, + "loss": 0.7933, + "num_input_tokens_seen": 630368, + "step": 1080 + }, + { + "epoch": 0.16160262138814419, + "grad_norm": 0.13671875, + "learning_rate": 0.002421805183199285, + "loss": 0.8103, + "num_input_tokens_seen": 633504, + "step": 1085 + }, + { + "epoch": 0.16234733392910336, + "grad_norm": 0.1904296875, + "learning_rate": 0.0024329758713136727, + "loss": 0.828, + "num_input_tokens_seen": 636640, + "step": 1090 + }, + { + "epoch": 0.16309204647006256, + "grad_norm": 0.1591796875, + "learning_rate": 0.0024441465594280606, + "loss": 0.781, + "num_input_tokens_seen": 639744, + "step": 1095 + }, + { + "epoch": 0.16383675901102174, + "grad_norm": 0.197265625, + "learning_rate": 0.0024553172475424486, + "loss": 0.8001, + "num_input_tokens_seen": 642464, + "step": 1100 + }, + { + "epoch": 0.16458147155198094, + "grad_norm": 0.2021484375, + "learning_rate": 0.0024664879356568366, + "loss": 0.798, + "num_input_tokens_seen": 645472, + "step": 1105 + }, + { + "epoch": 0.16532618409294011, + "grad_norm": 0.1708984375, + "learning_rate": 0.002477658623771224, + "loss": 0.7871, + "num_input_tokens_seen": 648320, + "step": 1110 + }, + { + "epoch": 0.16607089663389932, + "grad_norm": 0.5703125, + "learning_rate": 0.002488829311885612, + "loss": 0.7895, + "num_input_tokens_seen": 651168, + "step": 1115 + }, + { + "epoch": 0.1668156091748585, + "grad_norm": 0.25, + "learning_rate": 0.0024999999999999996, + "loss": 0.7921, + "num_input_tokens_seen": 654144, + "step": 1120 + }, + { + "epoch": 0.1675603217158177, + "grad_norm": 0.1826171875, + "learning_rate": 0.002511170688114388, + "loss": 0.8011, + "num_input_tokens_seen": 656928, + "step": 1125 + }, + { + "epoch": 0.1683050342567769, + "grad_norm": 0.34765625, + "learning_rate": 0.0025223413762287756, + "loss": 0.8233, + "num_input_tokens_seen": 659392, + "step": 1130 + }, + { + "epoch": 0.16904974679773607, + "grad_norm": 0.150390625, + "learning_rate": 0.0025335120643431635, + "loss": 0.8304, + "num_input_tokens_seen": 662464, + "step": 1135 + }, + { + "epoch": 0.16979445933869527, + "grad_norm": 0.1533203125, + "learning_rate": 0.002544682752457551, + "loss": 0.8154, + "num_input_tokens_seen": 665408, + "step": 1140 + }, + { + "epoch": 0.17053917187965445, + "grad_norm": 0.12255859375, + "learning_rate": 0.002555853440571939, + "loss": 0.8151, + "num_input_tokens_seen": 668320, + "step": 1145 + }, + { + "epoch": 0.17128388442061365, + "grad_norm": 0.1904296875, + "learning_rate": 0.002567024128686327, + "loss": 0.8244, + "num_input_tokens_seen": 671168, + "step": 1150 + }, + { + "epoch": 0.17202859696157283, + "grad_norm": 0.10498046875, + "learning_rate": 0.002578194816800715, + "loss": 0.8037, + "num_input_tokens_seen": 674368, + "step": 1155 + }, + { + "epoch": 0.17277330950253203, + "grad_norm": 0.1572265625, + "learning_rate": 0.002589365504915103, + "loss": 0.8035, + "num_input_tokens_seen": 677056, + "step": 1160 + }, + { + "epoch": 0.1735180220434912, + "grad_norm": 0.208984375, + "learning_rate": 0.0026005361930294905, + "loss": 0.8164, + "num_input_tokens_seen": 680000, + "step": 1165 + }, + { + "epoch": 0.1742627345844504, + "grad_norm": 0.185546875, + "learning_rate": 0.0026117068811438784, + "loss": 0.7988, + "num_input_tokens_seen": 682752, + "step": 1170 + }, + { + "epoch": 0.17500744712540958, + "grad_norm": 0.171875, + "learning_rate": 0.002622877569258266, + "loss": 0.8057, + "num_input_tokens_seen": 685600, + "step": 1175 + }, + { + "epoch": 0.17575215966636878, + "grad_norm": 0.08447265625, + "learning_rate": 0.002634048257372654, + "loss": 0.8157, + "num_input_tokens_seen": 688448, + "step": 1180 + }, + { + "epoch": 0.17649687220732796, + "grad_norm": 0.08837890625, + "learning_rate": 0.002645218945487042, + "loss": 0.8022, + "num_input_tokens_seen": 691328, + "step": 1185 + }, + { + "epoch": 0.17724158474828716, + "grad_norm": 0.1591796875, + "learning_rate": 0.00265638963360143, + "loss": 0.7937, + "num_input_tokens_seen": 694208, + "step": 1190 + }, + { + "epoch": 0.17798629728924636, + "grad_norm": 0.09619140625, + "learning_rate": 0.0026675603217158174, + "loss": 0.8153, + "num_input_tokens_seen": 696928, + "step": 1195 + }, + { + "epoch": 0.17873100983020554, + "grad_norm": 0.1484375, + "learning_rate": 0.0026787310098302054, + "loss": 0.7932, + "num_input_tokens_seen": 699776, + "step": 1200 + }, + { + "epoch": 0.17947572237116474, + "grad_norm": 0.2119140625, + "learning_rate": 0.002689901697944593, + "loss": 0.7866, + "num_input_tokens_seen": 702688, + "step": 1205 + }, + { + "epoch": 0.18022043491212392, + "grad_norm": 0.234375, + "learning_rate": 0.0027010723860589813, + "loss": 0.759, + "num_input_tokens_seen": 705536, + "step": 1210 + }, + { + "epoch": 0.18096514745308312, + "grad_norm": 0.37109375, + "learning_rate": 0.0027122430741733693, + "loss": 0.8443, + "num_input_tokens_seen": 708544, + "step": 1215 + }, + { + "epoch": 0.1817098599940423, + "grad_norm": 0.1337890625, + "learning_rate": 0.002723413762287757, + "loss": 0.7938, + "num_input_tokens_seen": 711200, + "step": 1220 + }, + { + "epoch": 0.1824545725350015, + "grad_norm": 0.2451171875, + "learning_rate": 0.002734584450402145, + "loss": 0.8195, + "num_input_tokens_seen": 714176, + "step": 1225 + }, + { + "epoch": 0.18319928507596067, + "grad_norm": 0.08984375, + "learning_rate": 0.0027457551385165323, + "loss": 0.802, + "num_input_tokens_seen": 716896, + "step": 1230 + }, + { + "epoch": 0.18394399761691987, + "grad_norm": 0.1630859375, + "learning_rate": 0.0027569258266309203, + "loss": 0.7978, + "num_input_tokens_seen": 719680, + "step": 1235 + }, + { + "epoch": 0.18468871015787905, + "grad_norm": 0.2236328125, + "learning_rate": 0.0027680965147453083, + "loss": 0.8123, + "num_input_tokens_seen": 722336, + "step": 1240 + }, + { + "epoch": 0.18543342269883825, + "grad_norm": 0.1826171875, + "learning_rate": 0.0027792672028596963, + "loss": 0.8264, + "num_input_tokens_seen": 725472, + "step": 1245 + }, + { + "epoch": 0.18617813523979743, + "grad_norm": 0.1025390625, + "learning_rate": 0.002790437890974084, + "loss": 0.8186, + "num_input_tokens_seen": 728512, + "step": 1250 + }, + { + "epoch": 0.18692284778075663, + "grad_norm": 0.177734375, + "learning_rate": 0.0028016085790884718, + "loss": 0.8023, + "num_input_tokens_seen": 731424, + "step": 1255 + }, + { + "epoch": 0.1876675603217158, + "grad_norm": 0.255859375, + "learning_rate": 0.0028127792672028593, + "loss": 0.8065, + "num_input_tokens_seen": 734368, + "step": 1260 + }, + { + "epoch": 0.188412272862675, + "grad_norm": 0.142578125, + "learning_rate": 0.0028239499553172473, + "loss": 0.7825, + "num_input_tokens_seen": 737408, + "step": 1265 + }, + { + "epoch": 0.1891569854036342, + "grad_norm": 0.1494140625, + "learning_rate": 0.0028351206434316352, + "loss": 0.7678, + "num_input_tokens_seen": 740576, + "step": 1270 + }, + { + "epoch": 0.18990169794459338, + "grad_norm": 0.173828125, + "learning_rate": 0.002846291331546023, + "loss": 0.8316, + "num_input_tokens_seen": 743200, + "step": 1275 + }, + { + "epoch": 0.1906464104855526, + "grad_norm": 0.1279296875, + "learning_rate": 0.002857462019660411, + "loss": 0.8145, + "num_input_tokens_seen": 746176, + "step": 1280 + }, + { + "epoch": 0.19139112302651176, + "grad_norm": 0.1904296875, + "learning_rate": 0.0028686327077747987, + "loss": 0.8149, + "num_input_tokens_seen": 749152, + "step": 1285 + }, + { + "epoch": 0.19213583556747096, + "grad_norm": 0.275390625, + "learning_rate": 0.0028798033958891867, + "loss": 0.8109, + "num_input_tokens_seen": 752128, + "step": 1290 + }, + { + "epoch": 0.19288054810843014, + "grad_norm": 0.416015625, + "learning_rate": 0.0028909740840035746, + "loss": 0.7829, + "num_input_tokens_seen": 755072, + "step": 1295 + }, + { + "epoch": 0.19362526064938934, + "grad_norm": 0.1572265625, + "learning_rate": 0.0029021447721179626, + "loss": 0.7677, + "num_input_tokens_seen": 758016, + "step": 1300 + }, + { + "epoch": 0.19436997319034852, + "grad_norm": 0.1533203125, + "learning_rate": 0.00291331546023235, + "loss": 0.7986, + "num_input_tokens_seen": 760928, + "step": 1305 + }, + { + "epoch": 0.19511468573130772, + "grad_norm": 0.134765625, + "learning_rate": 0.002924486148346738, + "loss": 0.8195, + "num_input_tokens_seen": 763776, + "step": 1310 + }, + { + "epoch": 0.1958593982722669, + "grad_norm": 0.1748046875, + "learning_rate": 0.0029356568364611257, + "loss": 0.8154, + "num_input_tokens_seen": 766432, + "step": 1315 + }, + { + "epoch": 0.1966041108132261, + "grad_norm": 0.1181640625, + "learning_rate": 0.0029468275245755136, + "loss": 0.7835, + "num_input_tokens_seen": 769280, + "step": 1320 + }, + { + "epoch": 0.19734882335418527, + "grad_norm": 0.10546875, + "learning_rate": 0.0029579982126899016, + "loss": 0.7873, + "num_input_tokens_seen": 772224, + "step": 1325 + }, + { + "epoch": 0.19809353589514447, + "grad_norm": 0.14453125, + "learning_rate": 0.0029691689008042896, + "loss": 0.8426, + "num_input_tokens_seen": 774944, + "step": 1330 + }, + { + "epoch": 0.19883824843610368, + "grad_norm": 0.1728515625, + "learning_rate": 0.002980339588918677, + "loss": 0.8255, + "num_input_tokens_seen": 777664, + "step": 1335 + }, + { + "epoch": 0.19958296097706285, + "grad_norm": 0.1494140625, + "learning_rate": 0.002991510277033065, + "loss": 0.8002, + "num_input_tokens_seen": 780480, + "step": 1340 + }, + { + "epoch": 0.20032767351802205, + "grad_norm": 0.1982421875, + "learning_rate": 0.003002680965147453, + "loss": 0.8231, + "num_input_tokens_seen": 783296, + "step": 1345 + }, + { + "epoch": 0.20107238605898123, + "grad_norm": 0.2119140625, + "learning_rate": 0.003013851653261841, + "loss": 0.8056, + "num_input_tokens_seen": 786016, + "step": 1350 + }, + { + "epoch": 0.20181709859994043, + "grad_norm": 0.40234375, + "learning_rate": 0.003025022341376229, + "loss": 0.8247, + "num_input_tokens_seen": 788960, + "step": 1355 + }, + { + "epoch": 0.2025618111408996, + "grad_norm": 0.3515625, + "learning_rate": 0.0030361930294906165, + "loss": 0.8236, + "num_input_tokens_seen": 791936, + "step": 1360 + }, + { + "epoch": 0.2033065236818588, + "grad_norm": 0.1455078125, + "learning_rate": 0.0030473637176050045, + "loss": 0.8269, + "num_input_tokens_seen": 794720, + "step": 1365 + }, + { + "epoch": 0.20405123622281798, + "grad_norm": 0.158203125, + "learning_rate": 0.003058534405719392, + "loss": 0.7759, + "num_input_tokens_seen": 797376, + "step": 1370 + }, + { + "epoch": 0.20479594876377719, + "grad_norm": 0.15625, + "learning_rate": 0.00306970509383378, + "loss": 0.8103, + "num_input_tokens_seen": 800192, + "step": 1375 + }, + { + "epoch": 0.20554066130473636, + "grad_norm": 0.412109375, + "learning_rate": 0.003080875781948168, + "loss": 0.7956, + "num_input_tokens_seen": 803104, + "step": 1380 + }, + { + "epoch": 0.20628537384569556, + "grad_norm": 0.21484375, + "learning_rate": 0.003092046470062556, + "loss": 0.8015, + "num_input_tokens_seen": 806496, + "step": 1385 + }, + { + "epoch": 0.20703008638665474, + "grad_norm": 0.1923828125, + "learning_rate": 0.0031032171581769435, + "loss": 0.8026, + "num_input_tokens_seen": 809344, + "step": 1390 + }, + { + "epoch": 0.20777479892761394, + "grad_norm": 0.2490234375, + "learning_rate": 0.0031143878462913314, + "loss": 0.8065, + "num_input_tokens_seen": 812256, + "step": 1395 + }, + { + "epoch": 0.20851951146857314, + "grad_norm": 0.1533203125, + "learning_rate": 0.003125558534405719, + "loss": 0.845, + "num_input_tokens_seen": 814816, + "step": 1400 + }, + { + "epoch": 0.20926422400953232, + "grad_norm": 0.19921875, + "learning_rate": 0.003136729222520107, + "loss": 0.8042, + "num_input_tokens_seen": 817440, + "step": 1405 + }, + { + "epoch": 0.21000893655049152, + "grad_norm": 0.388671875, + "learning_rate": 0.0031478999106344953, + "loss": 0.7613, + "num_input_tokens_seen": 820640, + "step": 1410 + }, + { + "epoch": 0.2107536490914507, + "grad_norm": 0.33203125, + "learning_rate": 0.003159070598748883, + "loss": 0.8333, + "num_input_tokens_seen": 823488, + "step": 1415 + }, + { + "epoch": 0.2114983616324099, + "grad_norm": 0.1455078125, + "learning_rate": 0.003170241286863271, + "loss": 0.8019, + "num_input_tokens_seen": 826240, + "step": 1420 + }, + { + "epoch": 0.21224307417336907, + "grad_norm": 0.310546875, + "learning_rate": 0.0031814119749776584, + "loss": 0.802, + "num_input_tokens_seen": 828992, + "step": 1425 + }, + { + "epoch": 0.21298778671432828, + "grad_norm": 0.5546875, + "learning_rate": 0.0031925826630920463, + "loss": 0.8035, + "num_input_tokens_seen": 831648, + "step": 1430 + }, + { + "epoch": 0.21373249925528745, + "grad_norm": 0.087890625, + "learning_rate": 0.0032037533512064343, + "loss": 0.786, + "num_input_tokens_seen": 834336, + "step": 1435 + }, + { + "epoch": 0.21447721179624665, + "grad_norm": 0.1865234375, + "learning_rate": 0.0032149240393208223, + "loss": 0.8404, + "num_input_tokens_seen": 837536, + "step": 1440 + }, + { + "epoch": 0.21522192433720583, + "grad_norm": 0.126953125, + "learning_rate": 0.00322609472743521, + "loss": 0.8017, + "num_input_tokens_seen": 841088, + "step": 1445 + }, + { + "epoch": 0.21596663687816503, + "grad_norm": 0.16796875, + "learning_rate": 0.003237265415549598, + "loss": 0.8064, + "num_input_tokens_seen": 844000, + "step": 1450 + }, + { + "epoch": 0.2167113494191242, + "grad_norm": 0.2490234375, + "learning_rate": 0.0032484361036639853, + "loss": 0.7731, + "num_input_tokens_seen": 846912, + "step": 1455 + }, + { + "epoch": 0.2174560619600834, + "grad_norm": 0.216796875, + "learning_rate": 0.0032596067917783733, + "loss": 0.827, + "num_input_tokens_seen": 849952, + "step": 1460 + }, + { + "epoch": 0.2182007745010426, + "grad_norm": 0.1669921875, + "learning_rate": 0.0032707774798927613, + "loss": 0.8175, + "num_input_tokens_seen": 852768, + "step": 1465 + }, + { + "epoch": 0.21894548704200179, + "grad_norm": 0.1962890625, + "learning_rate": 0.0032819481680071492, + "loss": 0.8203, + "num_input_tokens_seen": 855808, + "step": 1470 + }, + { + "epoch": 0.219690199582961, + "grad_norm": 0.25390625, + "learning_rate": 0.003293118856121537, + "loss": 0.806, + "num_input_tokens_seen": 858624, + "step": 1475 + }, + { + "epoch": 0.22043491212392016, + "grad_norm": 0.1787109375, + "learning_rate": 0.0033042895442359247, + "loss": 0.8093, + "num_input_tokens_seen": 861600, + "step": 1480 + }, + { + "epoch": 0.22117962466487937, + "grad_norm": 0.2041015625, + "learning_rate": 0.0033154602323503127, + "loss": 0.7755, + "num_input_tokens_seen": 864768, + "step": 1485 + }, + { + "epoch": 0.22192433720583854, + "grad_norm": 0.189453125, + "learning_rate": 0.0033266309204647002, + "loss": 0.8289, + "num_input_tokens_seen": 867776, + "step": 1490 + }, + { + "epoch": 0.22266904974679774, + "grad_norm": 0.2119140625, + "learning_rate": 0.0033378016085790886, + "loss": 0.7943, + "num_input_tokens_seen": 871104, + "step": 1495 + }, + { + "epoch": 0.22341376228775692, + "grad_norm": 0.1787109375, + "learning_rate": 0.003348972296693476, + "loss": 0.8039, + "num_input_tokens_seen": 874304, + "step": 1500 + }, + { + "epoch": 0.22415847482871612, + "grad_norm": 0.12890625, + "learning_rate": 0.003360142984807864, + "loss": 0.8083, + "num_input_tokens_seen": 877184, + "step": 1505 + }, + { + "epoch": 0.2249031873696753, + "grad_norm": 0.1259765625, + "learning_rate": 0.0033713136729222517, + "loss": 0.8262, + "num_input_tokens_seen": 880128, + "step": 1510 + }, + { + "epoch": 0.2256478999106345, + "grad_norm": 0.134765625, + "learning_rate": 0.0033824843610366397, + "loss": 0.7944, + "num_input_tokens_seen": 883328, + "step": 1515 + }, + { + "epoch": 0.22639261245159367, + "grad_norm": 0.11767578125, + "learning_rate": 0.0033936550491510276, + "loss": 0.8216, + "num_input_tokens_seen": 886304, + "step": 1520 + }, + { + "epoch": 0.22713732499255287, + "grad_norm": 0.189453125, + "learning_rate": 0.0034048257372654156, + "loss": 0.8168, + "num_input_tokens_seen": 889280, + "step": 1525 + }, + { + "epoch": 0.22788203753351208, + "grad_norm": 0.06884765625, + "learning_rate": 0.003415996425379803, + "loss": 0.8008, + "num_input_tokens_seen": 892000, + "step": 1530 + }, + { + "epoch": 0.22862675007447125, + "grad_norm": 0.23046875, + "learning_rate": 0.003427167113494191, + "loss": 0.8287, + "num_input_tokens_seen": 894912, + "step": 1535 + }, + { + "epoch": 0.22937146261543045, + "grad_norm": 0.1376953125, + "learning_rate": 0.0034383378016085786, + "loss": 0.7983, + "num_input_tokens_seen": 897600, + "step": 1540 + }, + { + "epoch": 0.23011617515638963, + "grad_norm": 0.1435546875, + "learning_rate": 0.0034495084897229666, + "loss": 0.8113, + "num_input_tokens_seen": 900800, + "step": 1545 + }, + { + "epoch": 0.23086088769734883, + "grad_norm": 0.1845703125, + "learning_rate": 0.003460679177837355, + "loss": 0.792, + "num_input_tokens_seen": 903712, + "step": 1550 + }, + { + "epoch": 0.231605600238308, + "grad_norm": 0.134765625, + "learning_rate": 0.0034718498659517425, + "loss": 0.7976, + "num_input_tokens_seen": 906560, + "step": 1555 + }, + { + "epoch": 0.2323503127792672, + "grad_norm": 0.12890625, + "learning_rate": 0.0034830205540661305, + "loss": 0.7974, + "num_input_tokens_seen": 909344, + "step": 1560 + }, + { + "epoch": 0.23309502532022638, + "grad_norm": 0.12451171875, + "learning_rate": 0.003494191242180518, + "loss": 0.7987, + "num_input_tokens_seen": 911968, + "step": 1565 + }, + { + "epoch": 0.2338397378611856, + "grad_norm": 0.2255859375, + "learning_rate": 0.003505361930294906, + "loss": 0.8109, + "num_input_tokens_seen": 914816, + "step": 1570 + }, + { + "epoch": 0.23458445040214476, + "grad_norm": 0.1435546875, + "learning_rate": 0.0035165326184092936, + "loss": 0.7841, + "num_input_tokens_seen": 917696, + "step": 1575 + }, + { + "epoch": 0.23532916294310396, + "grad_norm": 0.2197265625, + "learning_rate": 0.003527703306523682, + "loss": 0.8, + "num_input_tokens_seen": 920992, + "step": 1580 + }, + { + "epoch": 0.23607387548406314, + "grad_norm": 0.2109375, + "learning_rate": 0.0035388739946380695, + "loss": 0.7975, + "num_input_tokens_seen": 923968, + "step": 1585 + }, + { + "epoch": 0.23681858802502234, + "grad_norm": 0.171875, + "learning_rate": 0.0035500446827524575, + "loss": 0.8281, + "num_input_tokens_seen": 926656, + "step": 1590 + }, + { + "epoch": 0.23756330056598154, + "grad_norm": 0.189453125, + "learning_rate": 0.003561215370866845, + "loss": 0.8041, + "num_input_tokens_seen": 929696, + "step": 1595 + }, + { + "epoch": 0.23830801310694072, + "grad_norm": 0.2041015625, + "learning_rate": 0.003572386058981233, + "loss": 0.8228, + "num_input_tokens_seen": 932608, + "step": 1600 + }, + { + "epoch": 0.23905272564789992, + "grad_norm": 0.130859375, + "learning_rate": 0.003583556747095621, + "loss": 0.8174, + "num_input_tokens_seen": 935424, + "step": 1605 + }, + { + "epoch": 0.2397974381888591, + "grad_norm": 0.1025390625, + "learning_rate": 0.003594727435210009, + "loss": 0.7908, + "num_input_tokens_seen": 938240, + "step": 1610 + }, + { + "epoch": 0.2405421507298183, + "grad_norm": 0.0634765625, + "learning_rate": 0.003605898123324397, + "loss": 0.806, + "num_input_tokens_seen": 941152, + "step": 1615 + }, + { + "epoch": 0.24128686327077747, + "grad_norm": 0.09228515625, + "learning_rate": 0.0036170688114387844, + "loss": 0.79, + "num_input_tokens_seen": 943936, + "step": 1620 + }, + { + "epoch": 0.24203157581173668, + "grad_norm": 0.185546875, + "learning_rate": 0.0036282394995531724, + "loss": 0.7929, + "num_input_tokens_seen": 946944, + "step": 1625 + }, + { + "epoch": 0.24277628835269585, + "grad_norm": 0.11279296875, + "learning_rate": 0.00363941018766756, + "loss": 0.8097, + "num_input_tokens_seen": 949696, + "step": 1630 + }, + { + "epoch": 0.24352100089365505, + "grad_norm": 0.11328125, + "learning_rate": 0.0036505808757819483, + "loss": 0.7953, + "num_input_tokens_seen": 952608, + "step": 1635 + }, + { + "epoch": 0.24426571343461423, + "grad_norm": 0.142578125, + "learning_rate": 0.003661751563896336, + "loss": 0.798, + "num_input_tokens_seen": 955712, + "step": 1640 + }, + { + "epoch": 0.24501042597557343, + "grad_norm": 0.16015625, + "learning_rate": 0.003672922252010724, + "loss": 0.8099, + "num_input_tokens_seen": 959392, + "step": 1645 + }, + { + "epoch": 0.2457551385165326, + "grad_norm": 0.1552734375, + "learning_rate": 0.0036840929401251114, + "loss": 0.8173, + "num_input_tokens_seen": 962464, + "step": 1650 + }, + { + "epoch": 0.2464998510574918, + "grad_norm": 0.1142578125, + "learning_rate": 0.0036952636282394993, + "loss": 0.8057, + "num_input_tokens_seen": 965408, + "step": 1655 + }, + { + "epoch": 0.247244563598451, + "grad_norm": 0.0732421875, + "learning_rate": 0.0037064343163538873, + "loss": 0.7866, + "num_input_tokens_seen": 968288, + "step": 1660 + }, + { + "epoch": 0.2479892761394102, + "grad_norm": 0.1875, + "learning_rate": 0.0037176050044682753, + "loss": 0.8133, + "num_input_tokens_seen": 971168, + "step": 1665 + }, + { + "epoch": 0.2487339886803694, + "grad_norm": 0.16015625, + "learning_rate": 0.003728775692582663, + "loss": 0.8129, + "num_input_tokens_seen": 974016, + "step": 1670 + }, + { + "epoch": 0.24947870122132856, + "grad_norm": 0.134765625, + "learning_rate": 0.0037399463806970508, + "loss": 0.8012, + "num_input_tokens_seen": 977056, + "step": 1675 + }, + { + "epoch": 0.25022341376228774, + "grad_norm": 0.1689453125, + "learning_rate": 0.0037511170688114387, + "loss": 0.8194, + "num_input_tokens_seen": 980384, + "step": 1680 + }, + { + "epoch": 0.25096812630324694, + "grad_norm": 0.1435546875, + "learning_rate": 0.0037622877569258267, + "loss": 0.8221, + "num_input_tokens_seen": 983264, + "step": 1685 + }, + { + "epoch": 0.25171283884420614, + "grad_norm": 0.1845703125, + "learning_rate": 0.0037734584450402142, + "loss": 0.803, + "num_input_tokens_seen": 986496, + "step": 1690 + }, + { + "epoch": 0.25245755138516535, + "grad_norm": 0.0791015625, + "learning_rate": 0.0037846291331546022, + "loss": 0.7886, + "num_input_tokens_seen": 989248, + "step": 1695 + }, + { + "epoch": 0.2532022639261245, + "grad_norm": 0.185546875, + "learning_rate": 0.00379579982126899, + "loss": 0.8152, + "num_input_tokens_seen": 992000, + "step": 1700 + }, + { + "epoch": 0.2539469764670837, + "grad_norm": 0.18359375, + "learning_rate": 0.003806970509383378, + "loss": 0.8078, + "num_input_tokens_seen": 994784, + "step": 1705 + }, + { + "epoch": 0.2546916890080429, + "grad_norm": 0.06396484375, + "learning_rate": 0.0038181411974977653, + "loss": 0.7974, + "num_input_tokens_seen": 997696, + "step": 1710 + }, + { + "epoch": 0.2554364015490021, + "grad_norm": 0.1484375, + "learning_rate": 0.0038293118856121532, + "loss": 0.8187, + "num_input_tokens_seen": 1000416, + "step": 1715 + }, + { + "epoch": 0.25618111408996125, + "grad_norm": 0.1259765625, + "learning_rate": 0.0038404825737265416, + "loss": 0.7855, + "num_input_tokens_seen": 1002976, + "step": 1720 + }, + { + "epoch": 0.25692582663092045, + "grad_norm": 0.059326171875, + "learning_rate": 0.0038516532618409296, + "loss": 0.819, + "num_input_tokens_seen": 1006048, + "step": 1725 + }, + { + "epoch": 0.25767053917187965, + "grad_norm": 0.1630859375, + "learning_rate": 0.0038628239499553167, + "loss": 0.7978, + "num_input_tokens_seen": 1009184, + "step": 1730 + }, + { + "epoch": 0.25841525171283886, + "grad_norm": 0.1640625, + "learning_rate": 0.0038739946380697047, + "loss": 0.7983, + "num_input_tokens_seen": 1011904, + "step": 1735 + }, + { + "epoch": 0.25915996425379806, + "grad_norm": 0.11279296875, + "learning_rate": 0.0038851653261840926, + "loss": 0.7865, + "num_input_tokens_seen": 1014432, + "step": 1740 + }, + { + "epoch": 0.2599046767947572, + "grad_norm": 0.06982421875, + "learning_rate": 0.003896336014298481, + "loss": 0.8264, + "num_input_tokens_seen": 1017376, + "step": 1745 + }, + { + "epoch": 0.2606493893357164, + "grad_norm": 0.10986328125, + "learning_rate": 0.003907506702412869, + "loss": 0.818, + "num_input_tokens_seen": 1020192, + "step": 1750 + }, + { + "epoch": 0.2613941018766756, + "grad_norm": 0.07080078125, + "learning_rate": 0.003918677390527256, + "loss": 0.811, + "num_input_tokens_seen": 1023360, + "step": 1755 + }, + { + "epoch": 0.2621388144176348, + "grad_norm": 0.057373046875, + "learning_rate": 0.003929848078641644, + "loss": 0.8079, + "num_input_tokens_seen": 1026208, + "step": 1760 + }, + { + "epoch": 0.26288352695859396, + "grad_norm": 0.1376953125, + "learning_rate": 0.003941018766756032, + "loss": 0.7813, + "num_input_tokens_seen": 1029024, + "step": 1765 + }, + { + "epoch": 0.26362823949955316, + "grad_norm": 0.1298828125, + "learning_rate": 0.00395218945487042, + "loss": 0.8197, + "num_input_tokens_seen": 1032064, + "step": 1770 + }, + { + "epoch": 0.26437295204051237, + "grad_norm": 0.07666015625, + "learning_rate": 0.003963360142984807, + "loss": 0.8026, + "num_input_tokens_seen": 1034880, + "step": 1775 + }, + { + "epoch": 0.26511766458147157, + "grad_norm": 0.07421875, + "learning_rate": 0.003974530831099195, + "loss": 0.7915, + "num_input_tokens_seen": 1037696, + "step": 1780 + }, + { + "epoch": 0.2658623771224307, + "grad_norm": 0.1328125, + "learning_rate": 0.003985701519213583, + "loss": 0.8252, + "num_input_tokens_seen": 1040448, + "step": 1785 + }, + { + "epoch": 0.2666070896633899, + "grad_norm": 0.0966796875, + "learning_rate": 0.003996872207327972, + "loss": 0.8003, + "num_input_tokens_seen": 1043104, + "step": 1790 + }, + { + "epoch": 0.2673518022043491, + "grad_norm": 0.1259765625, + "learning_rate": 0.004008042895442359, + "loss": 0.7904, + "num_input_tokens_seen": 1045824, + "step": 1795 + }, + { + "epoch": 0.2680965147453083, + "grad_norm": 0.0634765625, + "learning_rate": 0.004019213583556747, + "loss": 0.7982, + "num_input_tokens_seen": 1048672, + "step": 1800 + }, + { + "epoch": 0.2688412272862675, + "grad_norm": 0.10595703125, + "learning_rate": 0.004030384271671135, + "loss": 0.802, + "num_input_tokens_seen": 1051360, + "step": 1805 + }, + { + "epoch": 0.2695859398272267, + "grad_norm": 0.185546875, + "learning_rate": 0.004041554959785523, + "loss": 0.8205, + "num_input_tokens_seen": 1054304, + "step": 1810 + }, + { + "epoch": 0.2703306523681859, + "grad_norm": 0.267578125, + "learning_rate": 0.004052725647899911, + "loss": 0.8175, + "num_input_tokens_seen": 1057952, + "step": 1815 + }, + { + "epoch": 0.2710753649091451, + "grad_norm": 0.150390625, + "learning_rate": 0.004063896336014298, + "loss": 0.8011, + "num_input_tokens_seen": 1060704, + "step": 1820 + }, + { + "epoch": 0.2718200774501043, + "grad_norm": 0.2060546875, + "learning_rate": 0.004075067024128686, + "loss": 0.7827, + "num_input_tokens_seen": 1063360, + "step": 1825 + }, + { + "epoch": 0.2725647899910634, + "grad_norm": 0.1826171875, + "learning_rate": 0.004086237712243074, + "loss": 0.792, + "num_input_tokens_seen": 1066112, + "step": 1830 + }, + { + "epoch": 0.27330950253202263, + "grad_norm": 45.75, + "learning_rate": 0.004097408400357462, + "loss": 1.167, + "num_input_tokens_seen": 1068992, + "step": 1835 + }, + { + "epoch": 0.27405421507298183, + "grad_norm": 0.216796875, + "learning_rate": 0.004108579088471849, + "loss": 0.8448, + "num_input_tokens_seen": 1071872, + "step": 1840 + }, + { + "epoch": 0.27479892761394104, + "grad_norm": 0.921875, + "learning_rate": 0.004119749776586238, + "loss": 0.8134, + "num_input_tokens_seen": 1074752, + "step": 1845 + }, + { + "epoch": 0.2755436401549002, + "grad_norm": 0.0751953125, + "learning_rate": 0.004130920464700626, + "loss": 0.8569, + "num_input_tokens_seen": 1077728, + "step": 1850 + }, + { + "epoch": 0.2762883526958594, + "grad_norm": 0.1064453125, + "learning_rate": 0.004142091152815014, + "loss": 0.7924, + "num_input_tokens_seen": 1080512, + "step": 1855 + }, + { + "epoch": 0.2770330652368186, + "grad_norm": 0.1396484375, + "learning_rate": 0.004153261840929401, + "loss": 0.7996, + "num_input_tokens_seen": 1083424, + "step": 1860 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 0.1318359375, + "learning_rate": 0.004164432529043789, + "loss": 0.7974, + "num_input_tokens_seen": 1086208, + "step": 1865 + }, + { + "epoch": 0.278522490318737, + "grad_norm": 0.1318359375, + "learning_rate": 0.004175603217158177, + "loss": 0.8234, + "num_input_tokens_seen": 1089152, + "step": 1870 + }, + { + "epoch": 0.27926720285969614, + "grad_norm": 0.1513671875, + "learning_rate": 0.004186773905272565, + "loss": 0.7978, + "num_input_tokens_seen": 1092000, + "step": 1875 + }, + { + "epoch": 0.28001191540065534, + "grad_norm": 0.1171875, + "learning_rate": 0.004197944593386953, + "loss": 0.7903, + "num_input_tokens_seen": 1095008, + "step": 1880 + }, + { + "epoch": 0.28075662794161454, + "grad_norm": 0.09423828125, + "learning_rate": 0.00420911528150134, + "loss": 0.8021, + "num_input_tokens_seen": 1098080, + "step": 1885 + }, + { + "epoch": 0.28150134048257375, + "grad_norm": 0.10595703125, + "learning_rate": 0.004220285969615728, + "loss": 0.8237, + "num_input_tokens_seen": 1100736, + "step": 1890 + }, + { + "epoch": 0.2822460530235329, + "grad_norm": 0.0751953125, + "learning_rate": 0.004231456657730116, + "loss": 0.8198, + "num_input_tokens_seen": 1103744, + "step": 1895 + }, + { + "epoch": 0.2829907655644921, + "grad_norm": 0.0859375, + "learning_rate": 0.004242627345844504, + "loss": 0.7883, + "num_input_tokens_seen": 1106304, + "step": 1900 + }, + { + "epoch": 0.2837354781054513, + "grad_norm": 0.1552734375, + "learning_rate": 0.004253798033958892, + "loss": 0.8205, + "num_input_tokens_seen": 1109472, + "step": 1905 + }, + { + "epoch": 0.2844801906464105, + "grad_norm": 0.0927734375, + "learning_rate": 0.00426496872207328, + "loss": 0.8084, + "num_input_tokens_seen": 1112192, + "step": 1910 + }, + { + "epoch": 0.28522490318736965, + "grad_norm": 0.25, + "learning_rate": 0.004276139410187668, + "loss": 0.8026, + "num_input_tokens_seen": 1115168, + "step": 1915 + }, + { + "epoch": 0.28596961572832885, + "grad_norm": 0.173828125, + "learning_rate": 0.004287310098302056, + "loss": 0.8083, + "num_input_tokens_seen": 1118016, + "step": 1920 + }, + { + "epoch": 0.28671432826928805, + "grad_norm": 0.28125, + "learning_rate": 0.004298480786416443, + "loss": 0.8222, + "num_input_tokens_seen": 1120704, + "step": 1925 + }, + { + "epoch": 0.28745904081024726, + "grad_norm": 0.1865234375, + "learning_rate": 0.004309651474530831, + "loss": 0.8029, + "num_input_tokens_seen": 1123552, + "step": 1930 + }, + { + "epoch": 0.28820375335120646, + "grad_norm": 0.06591796875, + "learning_rate": 0.004320822162645219, + "loss": 0.8159, + "num_input_tokens_seen": 1126368, + "step": 1935 + }, + { + "epoch": 0.2889484658921656, + "grad_norm": 0.1083984375, + "learning_rate": 0.004331992850759607, + "loss": 0.7952, + "num_input_tokens_seen": 1129312, + "step": 1940 + }, + { + "epoch": 0.2896931784331248, + "grad_norm": 0.2099609375, + "learning_rate": 0.004343163538873995, + "loss": 0.7949, + "num_input_tokens_seen": 1132256, + "step": 1945 + }, + { + "epoch": 0.290437890974084, + "grad_norm": 0.0986328125, + "learning_rate": 0.004354334226988382, + "loss": 0.8148, + "num_input_tokens_seen": 1135008, + "step": 1950 + }, + { + "epoch": 0.2911826035150432, + "grad_norm": 0.10595703125, + "learning_rate": 0.00436550491510277, + "loss": 0.7817, + "num_input_tokens_seen": 1137728, + "step": 1955 + }, + { + "epoch": 0.29192731605600236, + "grad_norm": 0.1650390625, + "learning_rate": 0.0043766756032171585, + "loss": 0.7769, + "num_input_tokens_seen": 1140512, + "step": 1960 + }, + { + "epoch": 0.29267202859696156, + "grad_norm": 0.2119140625, + "learning_rate": 0.0043878462913315465, + "loss": 0.8068, + "num_input_tokens_seen": 1143616, + "step": 1965 + }, + { + "epoch": 0.29341674113792077, + "grad_norm": 0.1923828125, + "learning_rate": 0.004399016979445934, + "loss": 0.7804, + "num_input_tokens_seen": 1146496, + "step": 1970 + }, + { + "epoch": 0.29416145367887997, + "grad_norm": 0.1494140625, + "learning_rate": 0.0044101876675603216, + "loss": 0.774, + "num_input_tokens_seen": 1149600, + "step": 1975 + }, + { + "epoch": 0.2949061662198391, + "grad_norm": 0.376953125, + "learning_rate": 0.0044213583556747095, + "loss": 0.8425, + "num_input_tokens_seen": 1152320, + "step": 1980 + }, + { + "epoch": 0.2956508787607983, + "grad_norm": 0.2275390625, + "learning_rate": 0.0044325290437890975, + "loss": 0.7277, + "num_input_tokens_seen": 1155648, + "step": 1985 + }, + { + "epoch": 0.2963955913017575, + "grad_norm": 0.310546875, + "learning_rate": 0.004443699731903485, + "loss": 1.0019, + "num_input_tokens_seen": 1158336, + "step": 1990 + }, + { + "epoch": 0.2971403038427167, + "grad_norm": 0.1396484375, + "learning_rate": 0.004454870420017873, + "loss": 0.8634, + "num_input_tokens_seen": 1161280, + "step": 1995 + }, + { + "epoch": 0.2978850163836759, + "grad_norm": 0.1220703125, + "learning_rate": 0.0044660411081322605, + "loss": 0.8086, + "num_input_tokens_seen": 1164064, + "step": 2000 + }, + { + "epoch": 0.2986297289246351, + "grad_norm": 0.1162109375, + "learning_rate": 0.0044772117962466485, + "loss": 0.7784, + "num_input_tokens_seen": 1166880, + "step": 2005 + }, + { + "epoch": 0.2993744414655943, + "grad_norm": 0.1318359375, + "learning_rate": 0.0044883824843610365, + "loss": 0.8299, + "num_input_tokens_seen": 1169792, + "step": 2010 + }, + { + "epoch": 0.3001191540065535, + "grad_norm": 0.10888671875, + "learning_rate": 0.0044995531724754245, + "loss": 0.7999, + "num_input_tokens_seen": 1172608, + "step": 2015 + }, + { + "epoch": 0.3008638665475127, + "grad_norm": 0.10302734375, + "learning_rate": 0.004510723860589812, + "loss": 0.8082, + "num_input_tokens_seen": 1175168, + "step": 2020 + }, + { + "epoch": 0.30160857908847183, + "grad_norm": 0.0625, + "learning_rate": 0.0045218945487042, + "loss": 0.8174, + "num_input_tokens_seen": 1178016, + "step": 2025 + }, + { + "epoch": 0.30235329162943103, + "grad_norm": 0.10302734375, + "learning_rate": 0.004533065236818588, + "loss": 0.7824, + "num_input_tokens_seen": 1180800, + "step": 2030 + }, + { + "epoch": 0.30309800417039023, + "grad_norm": 0.064453125, + "learning_rate": 0.0045442359249329755, + "loss": 0.7757, + "num_input_tokens_seen": 1183776, + "step": 2035 + }, + { + "epoch": 0.30384271671134944, + "grad_norm": 0.12060546875, + "learning_rate": 0.004555406613047363, + "loss": 0.8222, + "num_input_tokens_seen": 1186880, + "step": 2040 + }, + { + "epoch": 0.3045874292523086, + "grad_norm": 0.1103515625, + "learning_rate": 0.004566577301161751, + "loss": 0.7931, + "num_input_tokens_seen": 1189856, + "step": 2045 + }, + { + "epoch": 0.3053321417932678, + "grad_norm": 0.0712890625, + "learning_rate": 0.004577747989276139, + "loss": 0.8225, + "num_input_tokens_seen": 1192736, + "step": 2050 + }, + { + "epoch": 0.306076854334227, + "grad_norm": 0.0966796875, + "learning_rate": 0.0045889186773905265, + "loss": 0.7805, + "num_input_tokens_seen": 1195808, + "step": 2055 + }, + { + "epoch": 0.3068215668751862, + "grad_norm": 0.1015625, + "learning_rate": 0.0046000893655049144, + "loss": 0.7865, + "num_input_tokens_seen": 1198656, + "step": 2060 + }, + { + "epoch": 0.3075662794161454, + "grad_norm": 0.10791015625, + "learning_rate": 0.004611260053619302, + "loss": 0.8075, + "num_input_tokens_seen": 1201664, + "step": 2065 + }, + { + "epoch": 0.30831099195710454, + "grad_norm": 0.119140625, + "learning_rate": 0.004622430741733691, + "loss": 0.8101, + "num_input_tokens_seen": 1204384, + "step": 2070 + }, + { + "epoch": 0.30905570449806374, + "grad_norm": 0.1748046875, + "learning_rate": 0.004633601429848079, + "loss": 0.791, + "num_input_tokens_seen": 1207360, + "step": 2075 + }, + { + "epoch": 0.30980041703902295, + "grad_norm": 0.076171875, + "learning_rate": 0.004644772117962466, + "loss": 0.838, + "num_input_tokens_seen": 1210144, + "step": 2080 + }, + { + "epoch": 0.31054512957998215, + "grad_norm": 0.10400390625, + "learning_rate": 0.004655942806076854, + "loss": 0.7827, + "num_input_tokens_seen": 1213088, + "step": 2085 + }, + { + "epoch": 0.3112898421209413, + "grad_norm": 0.1015625, + "learning_rate": 0.004667113494191242, + "loss": 0.8126, + "num_input_tokens_seen": 1216416, + "step": 2090 + }, + { + "epoch": 0.3120345546619005, + "grad_norm": 0.1455078125, + "learning_rate": 0.00467828418230563, + "loss": 0.8275, + "num_input_tokens_seen": 1219424, + "step": 2095 + }, + { + "epoch": 0.3127792672028597, + "grad_norm": 0.21484375, + "learning_rate": 0.004689454870420017, + "loss": 0.8199, + "num_input_tokens_seen": 1222528, + "step": 2100 + }, + { + "epoch": 0.3135239797438189, + "grad_norm": 0.1396484375, + "learning_rate": 0.004700625558534405, + "loss": 0.7786, + "num_input_tokens_seen": 1225408, + "step": 2105 + }, + { + "epoch": 0.31426869228477805, + "grad_norm": 0.076171875, + "learning_rate": 0.004711796246648793, + "loss": 0.8157, + "num_input_tokens_seen": 1228384, + "step": 2110 + }, + { + "epoch": 0.31501340482573725, + "grad_norm": 0.1923828125, + "learning_rate": 0.004722966934763181, + "loss": 0.8157, + "num_input_tokens_seen": 1230880, + "step": 2115 + }, + { + "epoch": 0.31575811736669646, + "grad_norm": 0.12451171875, + "learning_rate": 0.004734137622877568, + "loss": 0.799, + "num_input_tokens_seen": 1233920, + "step": 2120 + }, + { + "epoch": 0.31650282990765566, + "grad_norm": 0.1767578125, + "learning_rate": 0.004745308310991957, + "loss": 0.7912, + "num_input_tokens_seen": 1236704, + "step": 2125 + }, + { + "epoch": 0.31724754244861486, + "grad_norm": 0.2236328125, + "learning_rate": 0.004756478999106345, + "loss": 0.8166, + "num_input_tokens_seen": 1239616, + "step": 2130 + }, + { + "epoch": 0.317992254989574, + "grad_norm": 0.1708984375, + "learning_rate": 0.004767649687220733, + "loss": 0.8064, + "num_input_tokens_seen": 1242464, + "step": 2135 + }, + { + "epoch": 0.3187369675305332, + "grad_norm": 0.2314453125, + "learning_rate": 0.004778820375335121, + "loss": 0.7963, + "num_input_tokens_seen": 1245440, + "step": 2140 + }, + { + "epoch": 0.3194816800714924, + "grad_norm": 0.1865234375, + "learning_rate": 0.004789991063449508, + "loss": 0.7968, + "num_input_tokens_seen": 1248416, + "step": 2145 + }, + { + "epoch": 0.3202263926124516, + "grad_norm": 0.1298828125, + "learning_rate": 0.004801161751563896, + "loss": 0.8293, + "num_input_tokens_seen": 1251008, + "step": 2150 + }, + { + "epoch": 0.32097110515341076, + "grad_norm": 0.09814453125, + "learning_rate": 0.004812332439678284, + "loss": 0.7915, + "num_input_tokens_seen": 1253888, + "step": 2155 + }, + { + "epoch": 0.32171581769436997, + "grad_norm": 0.13671875, + "learning_rate": 0.004823503127792672, + "loss": 0.7886, + "num_input_tokens_seen": 1256512, + "step": 2160 + }, + { + "epoch": 0.32246053023532917, + "grad_norm": 0.0908203125, + "learning_rate": 0.004834673815907059, + "loss": 0.808, + "num_input_tokens_seen": 1259232, + "step": 2165 + }, + { + "epoch": 0.32320524277628837, + "grad_norm": 0.0771484375, + "learning_rate": 0.004845844504021447, + "loss": 0.8232, + "num_input_tokens_seen": 1261952, + "step": 2170 + }, + { + "epoch": 0.3239499553172475, + "grad_norm": 0.06396484375, + "learning_rate": 0.004857015192135835, + "loss": 0.8055, + "num_input_tokens_seen": 1264704, + "step": 2175 + }, + { + "epoch": 0.3246946678582067, + "grad_norm": 0.1513671875, + "learning_rate": 0.004868185880250223, + "loss": 0.8108, + "num_input_tokens_seen": 1267424, + "step": 2180 + }, + { + "epoch": 0.3254393803991659, + "grad_norm": 0.0927734375, + "learning_rate": 0.004879356568364611, + "loss": 0.794, + "num_input_tokens_seen": 1270496, + "step": 2185 + }, + { + "epoch": 0.3261840929401251, + "grad_norm": 0.11767578125, + "learning_rate": 0.004890527256478999, + "loss": 0.7929, + "num_input_tokens_seen": 1273248, + "step": 2190 + }, + { + "epoch": 0.32692880548108433, + "grad_norm": 0.09765625, + "learning_rate": 0.004901697944593387, + "loss": 0.7683, + "num_input_tokens_seen": 1276288, + "step": 2195 + }, + { + "epoch": 0.3276735180220435, + "grad_norm": 0.09375, + "learning_rate": 0.004912868632707775, + "loss": 0.7998, + "num_input_tokens_seen": 1279584, + "step": 2200 + }, + { + "epoch": 0.3284182305630027, + "grad_norm": 0.1572265625, + "learning_rate": 0.004924039320822163, + "loss": 0.7985, + "num_input_tokens_seen": 1282656, + "step": 2205 + }, + { + "epoch": 0.3291629431039619, + "grad_norm": 0.078125, + "learning_rate": 0.00493521000893655, + "loss": 0.7859, + "num_input_tokens_seen": 1285568, + "step": 2210 + }, + { + "epoch": 0.3299076556449211, + "grad_norm": 0.1455078125, + "learning_rate": 0.004946380697050938, + "loss": 0.8076, + "num_input_tokens_seen": 1288448, + "step": 2215 + }, + { + "epoch": 0.33065236818588023, + "grad_norm": 0.173828125, + "learning_rate": 0.004957551385165326, + "loss": 0.7768, + "num_input_tokens_seen": 1291264, + "step": 2220 + }, + { + "epoch": 0.33139708072683943, + "grad_norm": 0.1357421875, + "learning_rate": 0.004968722073279714, + "loss": 0.7992, + "num_input_tokens_seen": 1294240, + "step": 2225 + }, + { + "epoch": 0.33214179326779864, + "grad_norm": 0.1552734375, + "learning_rate": 0.004979892761394101, + "loss": 0.7924, + "num_input_tokens_seen": 1296992, + "step": 2230 + }, + { + "epoch": 0.33288650580875784, + "grad_norm": 0.11376953125, + "learning_rate": 0.004991063449508489, + "loss": 0.8205, + "num_input_tokens_seen": 1299808, + "step": 2235 + }, + { + "epoch": 0.333631218349717, + "grad_norm": 0.1904296875, + "learning_rate": 0.005002234137622878, + "loss": 0.8069, + "num_input_tokens_seen": 1302816, + "step": 2240 + }, + { + "epoch": 0.3343759308906762, + "grad_norm": 0.09814453125, + "learning_rate": 0.005013404825737266, + "loss": 0.7866, + "num_input_tokens_seen": 1305600, + "step": 2245 + }, + { + "epoch": 0.3351206434316354, + "grad_norm": 0.07373046875, + "learning_rate": 0.005024575513851653, + "loss": 0.813, + "num_input_tokens_seen": 1309088, + "step": 2250 + }, + { + "epoch": 0.3358653559725946, + "grad_norm": 0.06005859375, + "learning_rate": 0.005035746201966041, + "loss": 0.8387, + "num_input_tokens_seen": 1311968, + "step": 2255 + }, + { + "epoch": 0.3366100685135538, + "grad_norm": 0.1171875, + "learning_rate": 0.005046916890080429, + "loss": 0.7961, + "num_input_tokens_seen": 1315040, + "step": 2260 + }, + { + "epoch": 0.33735478105451294, + "grad_norm": 0.0947265625, + "learning_rate": 0.005058087578194817, + "loss": 0.8141, + "num_input_tokens_seen": 1318304, + "step": 2265 + }, + { + "epoch": 0.33809949359547214, + "grad_norm": 0.11376953125, + "learning_rate": 0.005069258266309205, + "loss": 0.8023, + "num_input_tokens_seen": 1320960, + "step": 2270 + }, + { + "epoch": 0.33884420613643135, + "grad_norm": 0.103515625, + "learning_rate": 0.005080428954423592, + "loss": 0.846, + "num_input_tokens_seen": 1323712, + "step": 2275 + }, + { + "epoch": 0.33958891867739055, + "grad_norm": 0.09326171875, + "learning_rate": 0.00509159964253798, + "loss": 0.8243, + "num_input_tokens_seen": 1326752, + "step": 2280 + }, + { + "epoch": 0.3403336312183497, + "grad_norm": 0.0625, + "learning_rate": 0.005102770330652368, + "loss": 0.7878, + "num_input_tokens_seen": 1329600, + "step": 2285 + }, + { + "epoch": 0.3410783437593089, + "grad_norm": 0.08544921875, + "learning_rate": 0.005113941018766756, + "loss": 0.8195, + "num_input_tokens_seen": 1332448, + "step": 2290 + }, + { + "epoch": 0.3418230563002681, + "grad_norm": 0.11083984375, + "learning_rate": 0.005125111706881144, + "loss": 0.8056, + "num_input_tokens_seen": 1335072, + "step": 2295 + }, + { + "epoch": 0.3425677688412273, + "grad_norm": 0.046142578125, + "learning_rate": 0.005136282394995532, + "loss": 0.8155, + "num_input_tokens_seen": 1337856, + "step": 2300 + }, + { + "epoch": 0.34331248138218645, + "grad_norm": 0.0810546875, + "learning_rate": 0.00514745308310992, + "loss": 0.7939, + "num_input_tokens_seen": 1340896, + "step": 2305 + }, + { + "epoch": 0.34405719392314565, + "grad_norm": 0.045166015625, + "learning_rate": 0.005158623771224308, + "loss": 0.7912, + "num_input_tokens_seen": 1343840, + "step": 2310 + }, + { + "epoch": 0.34480190646410486, + "grad_norm": 0.07861328125, + "learning_rate": 0.005169794459338695, + "loss": 0.824, + "num_input_tokens_seen": 1346688, + "step": 2315 + }, + { + "epoch": 0.34554661900506406, + "grad_norm": 0.078125, + "learning_rate": 0.005180965147453083, + "loss": 0.8175, + "num_input_tokens_seen": 1349312, + "step": 2320 + }, + { + "epoch": 0.34629133154602326, + "grad_norm": 0.1591796875, + "learning_rate": 0.005192135835567471, + "loss": 0.7999, + "num_input_tokens_seen": 1352320, + "step": 2325 + }, + { + "epoch": 0.3470360440869824, + "grad_norm": 0.07275390625, + "learning_rate": 0.005203306523681859, + "loss": 0.8015, + "num_input_tokens_seen": 1355168, + "step": 2330 + }, + { + "epoch": 0.3477807566279416, + "grad_norm": 0.06884765625, + "learning_rate": 0.005214477211796246, + "loss": 0.7909, + "num_input_tokens_seen": 1358112, + "step": 2335 + }, + { + "epoch": 0.3485254691689008, + "grad_norm": 0.07568359375, + "learning_rate": 0.005225647899910634, + "loss": 0.8202, + "num_input_tokens_seen": 1360992, + "step": 2340 + }, + { + "epoch": 0.34927018170986, + "grad_norm": 0.0810546875, + "learning_rate": 0.005236818588025022, + "loss": 0.7869, + "num_input_tokens_seen": 1363744, + "step": 2345 + }, + { + "epoch": 0.35001489425081916, + "grad_norm": 0.11572265625, + "learning_rate": 0.00524798927613941, + "loss": 0.7978, + "num_input_tokens_seen": 1366848, + "step": 2350 + }, + { + "epoch": 0.35075960679177837, + "grad_norm": 0.10009765625, + "learning_rate": 0.0052591599642537986, + "loss": 0.8153, + "num_input_tokens_seen": 1369536, + "step": 2355 + }, + { + "epoch": 0.35150431933273757, + "grad_norm": 0.1513671875, + "learning_rate": 0.005270330652368186, + "loss": 0.805, + "num_input_tokens_seen": 1372064, + "step": 2360 + }, + { + "epoch": 0.35224903187369677, + "grad_norm": 0.04931640625, + "learning_rate": 0.005281501340482574, + "loss": 0.7884, + "num_input_tokens_seen": 1375136, + "step": 2365 + }, + { + "epoch": 0.3529937444146559, + "grad_norm": 0.080078125, + "learning_rate": 0.005292672028596962, + "loss": 0.7968, + "num_input_tokens_seen": 1377952, + "step": 2370 + }, + { + "epoch": 0.3537384569556151, + "grad_norm": 0.1181640625, + "learning_rate": 0.00530384271671135, + "loss": 0.7839, + "num_input_tokens_seen": 1380864, + "step": 2375 + }, + { + "epoch": 0.3544831694965743, + "grad_norm": 0.08642578125, + "learning_rate": 0.005315013404825737, + "loss": 0.7975, + "num_input_tokens_seen": 1384000, + "step": 2380 + }, + { + "epoch": 0.3552278820375335, + "grad_norm": 0.07470703125, + "learning_rate": 0.005326184092940125, + "loss": 0.8095, + "num_input_tokens_seen": 1387232, + "step": 2385 + }, + { + "epoch": 0.35597259457849273, + "grad_norm": 0.08056640625, + "learning_rate": 0.005337354781054513, + "loss": 0.817, + "num_input_tokens_seen": 1390112, + "step": 2390 + }, + { + "epoch": 0.3567173071194519, + "grad_norm": 0.047119140625, + "learning_rate": 0.005348525469168901, + "loss": 0.8174, + "num_input_tokens_seen": 1393472, + "step": 2395 + }, + { + "epoch": 0.3574620196604111, + "grad_norm": 0.08642578125, + "learning_rate": 0.005359696157283288, + "loss": 0.7856, + "num_input_tokens_seen": 1396288, + "step": 2400 + }, + { + "epoch": 0.3582067322013703, + "grad_norm": 0.1025390625, + "learning_rate": 0.005370866845397676, + "loss": 0.8144, + "num_input_tokens_seen": 1399360, + "step": 2405 + }, + { + "epoch": 0.3589514447423295, + "grad_norm": 0.0732421875, + "learning_rate": 0.0053820375335120645, + "loss": 0.8188, + "num_input_tokens_seen": 1402528, + "step": 2410 + }, + { + "epoch": 0.35969615728328863, + "grad_norm": 0.11474609375, + "learning_rate": 0.0053932082216264525, + "loss": 0.8356, + "num_input_tokens_seen": 1405472, + "step": 2415 + }, + { + "epoch": 0.36044086982424783, + "grad_norm": 0.078125, + "learning_rate": 0.00540437890974084, + "loss": 0.8073, + "num_input_tokens_seen": 1408512, + "step": 2420 + }, + { + "epoch": 0.36118558236520704, + "grad_norm": 0.1474609375, + "learning_rate": 0.0054155495978552275, + "loss": 0.7966, + "num_input_tokens_seen": 1411264, + "step": 2425 + }, + { + "epoch": 0.36193029490616624, + "grad_norm": 0.0439453125, + "learning_rate": 0.0054267202859696155, + "loss": 0.805, + "num_input_tokens_seen": 1413792, + "step": 2430 + }, + { + "epoch": 0.3626750074471254, + "grad_norm": 0.0947265625, + "learning_rate": 0.0054378909740840035, + "loss": 0.8097, + "num_input_tokens_seen": 1416800, + "step": 2435 + }, + { + "epoch": 0.3634197199880846, + "grad_norm": 0.09375, + "learning_rate": 0.0054490616621983914, + "loss": 0.773, + "num_input_tokens_seen": 1419552, + "step": 2440 + }, + { + "epoch": 0.3641644325290438, + "grad_norm": 0.09228515625, + "learning_rate": 0.0054602323503127785, + "loss": 0.8012, + "num_input_tokens_seen": 1422272, + "step": 2445 + }, + { + "epoch": 0.364909145070003, + "grad_norm": 0.11181640625, + "learning_rate": 0.0054714030384271665, + "loss": 0.8058, + "num_input_tokens_seen": 1425120, + "step": 2450 + }, + { + "epoch": 0.3656538576109622, + "grad_norm": 0.087890625, + "learning_rate": 0.0054825737265415545, + "loss": 0.7963, + "num_input_tokens_seen": 1427840, + "step": 2455 + }, + { + "epoch": 0.36639857015192134, + "grad_norm": 0.08447265625, + "learning_rate": 0.0054937444146559425, + "loss": 0.8081, + "num_input_tokens_seen": 1430944, + "step": 2460 + }, + { + "epoch": 0.36714328269288055, + "grad_norm": 0.07373046875, + "learning_rate": 0.00550491510277033, + "loss": 0.8009, + "num_input_tokens_seen": 1433792, + "step": 2465 + }, + { + "epoch": 0.36788799523383975, + "grad_norm": 0.04833984375, + "learning_rate": 0.005516085790884718, + "loss": 0.7863, + "num_input_tokens_seen": 1436704, + "step": 2470 + }, + { + "epoch": 0.36863270777479895, + "grad_norm": 0.07373046875, + "learning_rate": 0.005527256478999106, + "loss": 0.7968, + "num_input_tokens_seen": 1439360, + "step": 2475 + }, + { + "epoch": 0.3693774203157581, + "grad_norm": 0.1796875, + "learning_rate": 0.005538427167113494, + "loss": 0.8603, + "num_input_tokens_seen": 1442016, + "step": 2480 + }, + { + "epoch": 0.3701221328567173, + "grad_norm": 0.0771484375, + "learning_rate": 0.005549597855227882, + "loss": 0.8337, + "num_input_tokens_seen": 1444800, + "step": 2485 + }, + { + "epoch": 0.3708668453976765, + "grad_norm": 0.0927734375, + "learning_rate": 0.005560768543342269, + "loss": 0.8224, + "num_input_tokens_seen": 1447488, + "step": 2490 + }, + { + "epoch": 0.3716115579386357, + "grad_norm": 0.0947265625, + "learning_rate": 0.005571939231456657, + "loss": 0.8146, + "num_input_tokens_seen": 1450144, + "step": 2495 + }, + { + "epoch": 0.37235627047959485, + "grad_norm": 0.091796875, + "learning_rate": 0.005583109919571045, + "loss": 0.7983, + "num_input_tokens_seen": 1453184, + "step": 2500 + }, + { + "epoch": 0.37310098302055406, + "grad_norm": 0.10986328125, + "learning_rate": 0.005594280607685433, + "loss": 0.7993, + "num_input_tokens_seen": 1456128, + "step": 2505 + }, + { + "epoch": 0.37384569556151326, + "grad_norm": 0.1279296875, + "learning_rate": 0.00560545129579982, + "loss": 0.7952, + "num_input_tokens_seen": 1459168, + "step": 2510 + }, + { + "epoch": 0.37459040810247246, + "grad_norm": 0.09521484375, + "learning_rate": 0.005616621983914208, + "loss": 0.7952, + "num_input_tokens_seen": 1462080, + "step": 2515 + }, + { + "epoch": 0.3753351206434316, + "grad_norm": 0.109375, + "learning_rate": 0.005627792672028597, + "loss": 0.823, + "num_input_tokens_seen": 1464896, + "step": 2520 + }, + { + "epoch": 0.3760798331843908, + "grad_norm": 0.08984375, + "learning_rate": 0.005638963360142985, + "loss": 0.8251, + "num_input_tokens_seen": 1467648, + "step": 2525 + }, + { + "epoch": 0.37682454572535, + "grad_norm": 0.12353515625, + "learning_rate": 0.005650134048257372, + "loss": 0.8115, + "num_input_tokens_seen": 1470464, + "step": 2530 + }, + { + "epoch": 0.3775692582663092, + "grad_norm": 0.07373046875, + "learning_rate": 0.00566130473637176, + "loss": 0.7964, + "num_input_tokens_seen": 1473248, + "step": 2535 + }, + { + "epoch": 0.3783139708072684, + "grad_norm": 0.07763671875, + "learning_rate": 0.005672475424486148, + "loss": 0.7992, + "num_input_tokens_seen": 1476256, + "step": 2540 + }, + { + "epoch": 0.37905868334822757, + "grad_norm": 0.053466796875, + "learning_rate": 0.005683646112600536, + "loss": 0.7868, + "num_input_tokens_seen": 1479136, + "step": 2545 + }, + { + "epoch": 0.37980339588918677, + "grad_norm": 0.12255859375, + "learning_rate": 0.005694816800714924, + "loss": 0.8043, + "num_input_tokens_seen": 1481696, + "step": 2550 + }, + { + "epoch": 0.38054810843014597, + "grad_norm": 0.07080078125, + "learning_rate": 0.005705987488829311, + "loss": 0.8233, + "num_input_tokens_seen": 1484640, + "step": 2555 + }, + { + "epoch": 0.3812928209711052, + "grad_norm": 0.045166015625, + "learning_rate": 0.005717158176943699, + "loss": 0.8172, + "num_input_tokens_seen": 1487296, + "step": 2560 + }, + { + "epoch": 0.3820375335120643, + "grad_norm": 0.076171875, + "learning_rate": 0.005728328865058087, + "loss": 0.8236, + "num_input_tokens_seen": 1490208, + "step": 2565 + }, + { + "epoch": 0.3827822460530235, + "grad_norm": 0.083984375, + "learning_rate": 0.005739499553172475, + "loss": 0.8056, + "num_input_tokens_seen": 1493152, + "step": 2570 + }, + { + "epoch": 0.3835269585939827, + "grad_norm": 0.07373046875, + "learning_rate": 0.005750670241286863, + "loss": 0.797, + "num_input_tokens_seen": 1496032, + "step": 2575 + }, + { + "epoch": 0.38427167113494193, + "grad_norm": 0.1201171875, + "learning_rate": 0.005761840929401251, + "loss": 0.8061, + "num_input_tokens_seen": 1498880, + "step": 2580 + }, + { + "epoch": 0.3850163836759011, + "grad_norm": 0.12255859375, + "learning_rate": 0.005773011617515639, + "loss": 0.7983, + "num_input_tokens_seen": 1501696, + "step": 2585 + }, + { + "epoch": 0.3857610962168603, + "grad_norm": 0.07275390625, + "learning_rate": 0.005784182305630027, + "loss": 0.7959, + "num_input_tokens_seen": 1504448, + "step": 2590 + }, + { + "epoch": 0.3865058087578195, + "grad_norm": 0.072265625, + "learning_rate": 0.005795352993744414, + "loss": 0.7863, + "num_input_tokens_seen": 1507520, + "step": 2595 + }, + { + "epoch": 0.3872505212987787, + "grad_norm": 0.09033203125, + "learning_rate": 0.005806523681858802, + "loss": 0.7973, + "num_input_tokens_seen": 1510176, + "step": 2600 + }, + { + "epoch": 0.3879952338397379, + "grad_norm": 0.06689453125, + "learning_rate": 0.00581769436997319, + "loss": 0.799, + "num_input_tokens_seen": 1512992, + "step": 2605 + }, + { + "epoch": 0.38873994638069703, + "grad_norm": 0.09130859375, + "learning_rate": 0.005828865058087578, + "loss": 0.808, + "num_input_tokens_seen": 1515680, + "step": 2610 + }, + { + "epoch": 0.38948465892165623, + "grad_norm": 0.08349609375, + "learning_rate": 0.005840035746201966, + "loss": 0.8053, + "num_input_tokens_seen": 1518400, + "step": 2615 + }, + { + "epoch": 0.39022937146261544, + "grad_norm": 0.1015625, + "learning_rate": 0.005851206434316353, + "loss": 0.8225, + "num_input_tokens_seen": 1522560, + "step": 2620 + }, + { + "epoch": 0.39097408400357464, + "grad_norm": 0.11962890625, + "learning_rate": 0.005862377122430741, + "loss": 0.8285, + "num_input_tokens_seen": 1525408, + "step": 2625 + }, + { + "epoch": 0.3917187965445338, + "grad_norm": 0.095703125, + "learning_rate": 0.005873547810545129, + "loss": 0.8163, + "num_input_tokens_seen": 1528256, + "step": 2630 + }, + { + "epoch": 0.392463509085493, + "grad_norm": 0.1396484375, + "learning_rate": 0.005884718498659518, + "loss": 0.7951, + "num_input_tokens_seen": 1531264, + "step": 2635 + }, + { + "epoch": 0.3932082216264522, + "grad_norm": 0.06689453125, + "learning_rate": 0.005895889186773905, + "loss": 0.8046, + "num_input_tokens_seen": 1534272, + "step": 2640 + }, + { + "epoch": 0.3939529341674114, + "grad_norm": 0.1455078125, + "learning_rate": 0.005907059874888293, + "loss": 0.8192, + "num_input_tokens_seen": 1537472, + "step": 2645 + }, + { + "epoch": 0.39469764670837054, + "grad_norm": 0.091796875, + "learning_rate": 0.005918230563002681, + "loss": 0.8019, + "num_input_tokens_seen": 1540352, + "step": 2650 + }, + { + "epoch": 0.39544235924932974, + "grad_norm": 0.0888671875, + "learning_rate": 0.005929401251117069, + "loss": 0.7914, + "num_input_tokens_seen": 1543328, + "step": 2655 + }, + { + "epoch": 0.39618707179028895, + "grad_norm": 0.0751953125, + "learning_rate": 0.005940571939231456, + "loss": 0.8237, + "num_input_tokens_seen": 1546272, + "step": 2660 + }, + { + "epoch": 0.39693178433124815, + "grad_norm": 0.0859375, + "learning_rate": 0.005951742627345844, + "loss": 0.8177, + "num_input_tokens_seen": 1549280, + "step": 2665 + }, + { + "epoch": 0.39767649687220735, + "grad_norm": 0.0888671875, + "learning_rate": 0.005962913315460232, + "loss": 0.8201, + "num_input_tokens_seen": 1552256, + "step": 2670 + }, + { + "epoch": 0.3984212094131665, + "grad_norm": 0.08447265625, + "learning_rate": 0.00597408400357462, + "loss": 0.8072, + "num_input_tokens_seen": 1555168, + "step": 2675 + }, + { + "epoch": 0.3991659219541257, + "grad_norm": 0.08154296875, + "learning_rate": 0.005985254691689008, + "loss": 0.8045, + "num_input_tokens_seen": 1558112, + "step": 2680 + }, + { + "epoch": 0.3999106344950849, + "grad_norm": 0.0751953125, + "learning_rate": 0.005996425379803395, + "loss": 0.7813, + "num_input_tokens_seen": 1561184, + "step": 2685 + }, + { + "epoch": 0.4006553470360441, + "grad_norm": 0.10791015625, + "learning_rate": 0.006007596067917784, + "loss": 0.8273, + "num_input_tokens_seen": 1563936, + "step": 2690 + }, + { + "epoch": 0.40140005957700325, + "grad_norm": 0.0947265625, + "learning_rate": 0.006018766756032172, + "loss": 0.8008, + "num_input_tokens_seen": 1566592, + "step": 2695 + }, + { + "epoch": 0.40214477211796246, + "grad_norm": 0.053466796875, + "learning_rate": 0.00602993744414656, + "loss": 0.8193, + "num_input_tokens_seen": 1569568, + "step": 2700 + }, + { + "epoch": 0.40288948465892166, + "grad_norm": 0.0869140625, + "learning_rate": 0.006041108132260947, + "loss": 0.7988, + "num_input_tokens_seen": 1572384, + "step": 2705 + }, + { + "epoch": 0.40363419719988086, + "grad_norm": 0.08544921875, + "learning_rate": 0.006052278820375335, + "loss": 0.7884, + "num_input_tokens_seen": 1575296, + "step": 2710 + }, + { + "epoch": 0.40437890974084, + "grad_norm": 0.10302734375, + "learning_rate": 0.006063449508489723, + "loss": 0.802, + "num_input_tokens_seen": 1578208, + "step": 2715 + }, + { + "epoch": 0.4051236222817992, + "grad_norm": 0.1328125, + "learning_rate": 0.006074620196604111, + "loss": 0.8236, + "num_input_tokens_seen": 1581408, + "step": 2720 + }, + { + "epoch": 0.4058683348227584, + "grad_norm": 0.052490234375, + "learning_rate": 0.006085790884718498, + "loss": 0.8018, + "num_input_tokens_seen": 1584096, + "step": 2725 + }, + { + "epoch": 0.4066130473637176, + "grad_norm": 0.047607421875, + "learning_rate": 0.006096961572832886, + "loss": 0.8073, + "num_input_tokens_seen": 1587008, + "step": 2730 + }, + { + "epoch": 0.4073577599046768, + "grad_norm": 0.0869140625, + "learning_rate": 0.006108132260947274, + "loss": 0.7903, + "num_input_tokens_seen": 1589824, + "step": 2735 + }, + { + "epoch": 0.40810247244563597, + "grad_norm": 0.15625, + "learning_rate": 0.006119302949061662, + "loss": 0.8195, + "num_input_tokens_seen": 1592800, + "step": 2740 + }, + { + "epoch": 0.40884718498659517, + "grad_norm": 0.107421875, + "learning_rate": 0.00613047363717605, + "loss": 0.7999, + "num_input_tokens_seen": 1595808, + "step": 2745 + }, + { + "epoch": 0.40959189752755437, + "grad_norm": 0.1552734375, + "learning_rate": 0.006141644325290438, + "loss": 0.8034, + "num_input_tokens_seen": 1598592, + "step": 2750 + }, + { + "epoch": 0.4103366100685136, + "grad_norm": 0.142578125, + "learning_rate": 0.006152815013404826, + "loss": 0.8029, + "num_input_tokens_seen": 1601824, + "step": 2755 + }, + { + "epoch": 0.4110813226094727, + "grad_norm": 0.142578125, + "learning_rate": 0.006163985701519214, + "loss": 0.8185, + "num_input_tokens_seen": 1604672, + "step": 2760 + }, + { + "epoch": 0.4118260351504319, + "grad_norm": 0.080078125, + "learning_rate": 0.006175156389633602, + "loss": 0.8116, + "num_input_tokens_seen": 1607584, + "step": 2765 + }, + { + "epoch": 0.4125707476913911, + "grad_norm": 0.1201171875, + "learning_rate": 0.006186327077747989, + "loss": 0.8112, + "num_input_tokens_seen": 1610496, + "step": 2770 + }, + { + "epoch": 0.41331546023235033, + "grad_norm": 0.0849609375, + "learning_rate": 0.006197497765862377, + "loss": 0.8162, + "num_input_tokens_seen": 1613472, + "step": 2775 + }, + { + "epoch": 0.4140601727733095, + "grad_norm": 0.08642578125, + "learning_rate": 0.006208668453976765, + "loss": 0.7906, + "num_input_tokens_seen": 1616416, + "step": 2780 + }, + { + "epoch": 0.4148048853142687, + "grad_norm": 0.0966796875, + "learning_rate": 0.006219839142091153, + "loss": 0.8043, + "num_input_tokens_seen": 1619456, + "step": 2785 + }, + { + "epoch": 0.4155495978552279, + "grad_norm": 0.10498046875, + "learning_rate": 0.00623100983020554, + "loss": 0.7984, + "num_input_tokens_seen": 1622528, + "step": 2790 + }, + { + "epoch": 0.4162943103961871, + "grad_norm": 0.09716796875, + "learning_rate": 0.006242180518319928, + "loss": 0.8035, + "num_input_tokens_seen": 1625280, + "step": 2795 + }, + { + "epoch": 0.4170390229371463, + "grad_norm": 0.058349609375, + "learning_rate": 0.006253351206434316, + "loss": 0.7976, + "num_input_tokens_seen": 1628192, + "step": 2800 + }, + { + "epoch": 0.41778373547810543, + "grad_norm": 0.059326171875, + "learning_rate": 0.0062645218945487045, + "loss": 0.7845, + "num_input_tokens_seen": 1631264, + "step": 2805 + }, + { + "epoch": 0.41852844801906464, + "grad_norm": 0.10693359375, + "learning_rate": 0.0062756925826630925, + "loss": 0.8, + "num_input_tokens_seen": 1634080, + "step": 2810 + }, + { + "epoch": 0.41927316056002384, + "grad_norm": 0.255859375, + "learning_rate": 0.00628686327077748, + "loss": 0.818, + "num_input_tokens_seen": 1637056, + "step": 2815 + }, + { + "epoch": 0.42001787310098304, + "grad_norm": 0.1943359375, + "learning_rate": 0.006298033958891868, + "loss": 0.8472, + "num_input_tokens_seen": 1640064, + "step": 2820 + }, + { + "epoch": 0.4207625856419422, + "grad_norm": 0.047607421875, + "learning_rate": 0.0063092046470062555, + "loss": 0.8414, + "num_input_tokens_seen": 1642784, + "step": 2825 + }, + { + "epoch": 0.4215072981829014, + "grad_norm": 0.09033203125, + "learning_rate": 0.0063203753351206435, + "loss": 0.7909, + "num_input_tokens_seen": 1645536, + "step": 2830 + }, + { + "epoch": 0.4222520107238606, + "grad_norm": 0.06640625, + "learning_rate": 0.006331546023235031, + "loss": 0.7871, + "num_input_tokens_seen": 1648128, + "step": 2835 + }, + { + "epoch": 0.4229967232648198, + "grad_norm": 0.06884765625, + "learning_rate": 0.006342716711349419, + "loss": 0.7893, + "num_input_tokens_seen": 1650880, + "step": 2840 + }, + { + "epoch": 0.42374143580577894, + "grad_norm": 0.09912109375, + "learning_rate": 0.0063538873994638066, + "loss": 0.806, + "num_input_tokens_seen": 1653664, + "step": 2845 + }, + { + "epoch": 0.42448614834673815, + "grad_norm": 0.06689453125, + "learning_rate": 0.0063650580875781945, + "loss": 0.7705, + "num_input_tokens_seen": 1656640, + "step": 2850 + }, + { + "epoch": 0.42523086088769735, + "grad_norm": 0.043701171875, + "learning_rate": 0.006376228775692582, + "loss": 0.8076, + "num_input_tokens_seen": 1659296, + "step": 2855 + }, + { + "epoch": 0.42597557342865655, + "grad_norm": 0.048828125, + "learning_rate": 0.0063873994638069705, + "loss": 0.819, + "num_input_tokens_seen": 1662112, + "step": 2860 + }, + { + "epoch": 0.42672028596961575, + "grad_norm": 0.0966796875, + "learning_rate": 0.006398570151921358, + "loss": 0.8228, + "num_input_tokens_seen": 1664832, + "step": 2865 + }, + { + "epoch": 0.4274649985105749, + "grad_norm": 0.0673828125, + "learning_rate": 0.006409740840035746, + "loss": 0.7994, + "num_input_tokens_seen": 1667872, + "step": 2870 + }, + { + "epoch": 0.4282097110515341, + "grad_norm": 0.039306640625, + "learning_rate": 0.006420911528150134, + "loss": 0.8089, + "num_input_tokens_seen": 1670592, + "step": 2875 + }, + { + "epoch": 0.4289544235924933, + "grad_norm": 0.10791015625, + "learning_rate": 0.0064320822162645215, + "loss": 0.7864, + "num_input_tokens_seen": 1673472, + "step": 2880 + }, + { + "epoch": 0.4296991361334525, + "grad_norm": 0.0986328125, + "learning_rate": 0.0064432529043789094, + "loss": 0.7938, + "num_input_tokens_seen": 1676352, + "step": 2885 + }, + { + "epoch": 0.43044384867441166, + "grad_norm": 0.091796875, + "learning_rate": 0.006454423592493297, + "loss": 0.8229, + "num_input_tokens_seen": 1679328, + "step": 2890 + }, + { + "epoch": 0.43118856121537086, + "grad_norm": 0.08447265625, + "learning_rate": 0.006465594280607685, + "loss": 0.7961, + "num_input_tokens_seen": 1682240, + "step": 2895 + }, + { + "epoch": 0.43193327375633006, + "grad_norm": 0.10498046875, + "learning_rate": 0.0064767649687220725, + "loss": 0.8511, + "num_input_tokens_seen": 1685120, + "step": 2900 + }, + { + "epoch": 0.43267798629728926, + "grad_norm": 0.052978515625, + "learning_rate": 0.0064879356568364605, + "loss": 0.8197, + "num_input_tokens_seen": 1688000, + "step": 2905 + }, + { + "epoch": 0.4334226988382484, + "grad_norm": 0.1259765625, + "learning_rate": 0.006499106344950848, + "loss": 0.7966, + "num_input_tokens_seen": 1690816, + "step": 2910 + }, + { + "epoch": 0.4341674113792076, + "grad_norm": 0.07177734375, + "learning_rate": 0.006510277033065237, + "loss": 0.8212, + "num_input_tokens_seen": 1693600, + "step": 2915 + }, + { + "epoch": 0.4349121239201668, + "grad_norm": 0.212890625, + "learning_rate": 0.006521447721179624, + "loss": 0.8079, + "num_input_tokens_seen": 1696544, + "step": 2920 + }, + { + "epoch": 0.435656836461126, + "grad_norm": 0.11279296875, + "learning_rate": 0.006532618409294012, + "loss": 0.7943, + "num_input_tokens_seen": 1699200, + "step": 2925 + }, + { + "epoch": 0.4364015490020852, + "grad_norm": 0.06591796875, + "learning_rate": 0.0065437890974084, + "loss": 0.8185, + "num_input_tokens_seen": 1702080, + "step": 2930 + }, + { + "epoch": 0.43714626154304437, + "grad_norm": 0.068359375, + "learning_rate": 0.006554959785522788, + "loss": 0.7901, + "num_input_tokens_seen": 1705088, + "step": 2935 + }, + { + "epoch": 0.43789097408400357, + "grad_norm": 0.06640625, + "learning_rate": 0.006566130473637176, + "loss": 0.8147, + "num_input_tokens_seen": 1707808, + "step": 2940 + }, + { + "epoch": 0.4386356866249628, + "grad_norm": 0.0478515625, + "learning_rate": 0.006577301161751563, + "loss": 0.8102, + "num_input_tokens_seen": 1710592, + "step": 2945 + }, + { + "epoch": 0.439380399165922, + "grad_norm": 0.068359375, + "learning_rate": 0.006588471849865951, + "loss": 0.7644, + "num_input_tokens_seen": 1713664, + "step": 2950 + }, + { + "epoch": 0.4401251117068811, + "grad_norm": 0.07861328125, + "learning_rate": 0.006599642537980339, + "loss": 0.798, + "num_input_tokens_seen": 1716512, + "step": 2955 + }, + { + "epoch": 0.4408698242478403, + "grad_norm": 0.09375, + "learning_rate": 0.006610813226094727, + "loss": 0.7991, + "num_input_tokens_seen": 1719424, + "step": 2960 + }, + { + "epoch": 0.4416145367887995, + "grad_norm": 0.109375, + "learning_rate": 0.006621983914209114, + "loss": 0.7762, + "num_input_tokens_seen": 1722208, + "step": 2965 + }, + { + "epoch": 0.44235924932975873, + "grad_norm": 0.08935546875, + "learning_rate": 0.006633154602323503, + "loss": 0.8036, + "num_input_tokens_seen": 1725248, + "step": 2970 + }, + { + "epoch": 0.4431039618707179, + "grad_norm": 0.0791015625, + "learning_rate": 0.006644325290437891, + "loss": 0.8234, + "num_input_tokens_seen": 1728192, + "step": 2975 + }, + { + "epoch": 0.4438486744116771, + "grad_norm": 0.103515625, + "learning_rate": 0.006655495978552279, + "loss": 0.8216, + "num_input_tokens_seen": 1731328, + "step": 2980 + }, + { + "epoch": 0.4445933869526363, + "grad_norm": 0.1416015625, + "learning_rate": 0.006666666666666666, + "loss": 0.7941, + "num_input_tokens_seen": 1734304, + "step": 2985 + }, + { + "epoch": 0.4453380994935955, + "grad_norm": 0.08837890625, + "learning_rate": 0.006677837354781054, + "loss": 0.7999, + "num_input_tokens_seen": 1736896, + "step": 2990 + }, + { + "epoch": 0.4460828120345547, + "grad_norm": 0.07470703125, + "learning_rate": 0.006689008042895442, + "loss": 0.7549, + "num_input_tokens_seen": 1739712, + "step": 2995 + }, + { + "epoch": 0.44682752457551383, + "grad_norm": 0.09619140625, + "learning_rate": 0.00670017873100983, + "loss": 0.8194, + "num_input_tokens_seen": 1742528, + "step": 3000 + }, + { + "epoch": 0.44757223711647304, + "grad_norm": 0.07666015625, + "learning_rate": 0.006711349419124218, + "loss": 0.8239, + "num_input_tokens_seen": 1745536, + "step": 3005 + }, + { + "epoch": 0.44831694965743224, + "grad_norm": 0.08544921875, + "learning_rate": 0.006722520107238605, + "loss": 0.8207, + "num_input_tokens_seen": 1748448, + "step": 3010 + }, + { + "epoch": 0.44906166219839144, + "grad_norm": 0.181640625, + "learning_rate": 0.006733690795352993, + "loss": 0.7745, + "num_input_tokens_seen": 1751168, + "step": 3015 + }, + { + "epoch": 0.4498063747393506, + "grad_norm": 0.10693359375, + "learning_rate": 0.006744861483467381, + "loss": 0.7886, + "num_input_tokens_seen": 1753760, + "step": 3020 + }, + { + "epoch": 0.4505510872803098, + "grad_norm": 0.11279296875, + "learning_rate": 0.006756032171581769, + "loss": 0.8137, + "num_input_tokens_seen": 1756832, + "step": 3025 + }, + { + "epoch": 0.451295799821269, + "grad_norm": 0.1123046875, + "learning_rate": 0.006767202859696157, + "loss": 0.8085, + "num_input_tokens_seen": 1759840, + "step": 3030 + }, + { + "epoch": 0.4520405123622282, + "grad_norm": 0.12158203125, + "learning_rate": 0.006778373547810545, + "loss": 0.8062, + "num_input_tokens_seen": 1762752, + "step": 3035 + }, + { + "epoch": 0.45278522490318734, + "grad_norm": 0.09814453125, + "learning_rate": 0.006789544235924933, + "loss": 0.8184, + "num_input_tokens_seen": 1765568, + "step": 3040 + }, + { + "epoch": 0.45352993744414655, + "grad_norm": 0.11474609375, + "learning_rate": 0.006800714924039321, + "loss": 0.8258, + "num_input_tokens_seen": 1768160, + "step": 3045 + }, + { + "epoch": 0.45427464998510575, + "grad_norm": 0.095703125, + "learning_rate": 0.006811885612153708, + "loss": 0.8131, + "num_input_tokens_seen": 1771072, + "step": 3050 + }, + { + "epoch": 0.45501936252606495, + "grad_norm": 0.07666015625, + "learning_rate": 0.006823056300268096, + "loss": 0.7809, + "num_input_tokens_seen": 1774016, + "step": 3055 + }, + { + "epoch": 0.45576407506702415, + "grad_norm": 0.0703125, + "learning_rate": 0.006834226988382484, + "loss": 0.7916, + "num_input_tokens_seen": 1777088, + "step": 3060 + }, + { + "epoch": 0.4565087876079833, + "grad_norm": 0.08642578125, + "learning_rate": 0.006845397676496872, + "loss": 0.7893, + "num_input_tokens_seen": 1779840, + "step": 3065 + }, + { + "epoch": 0.4572535001489425, + "grad_norm": 0.11669921875, + "learning_rate": 0.00685656836461126, + "loss": 0.8032, + "num_input_tokens_seen": 1782656, + "step": 3070 + }, + { + "epoch": 0.4579982126899017, + "grad_norm": 0.0703125, + "learning_rate": 0.006867739052725647, + "loss": 0.7977, + "num_input_tokens_seen": 1785792, + "step": 3075 + }, + { + "epoch": 0.4587429252308609, + "grad_norm": 0.08447265625, + "learning_rate": 0.006878909740840035, + "loss": 0.814, + "num_input_tokens_seen": 1788896, + "step": 3080 + }, + { + "epoch": 0.45948763777182006, + "grad_norm": 0.03955078125, + "learning_rate": 0.006890080428954424, + "loss": 0.7915, + "num_input_tokens_seen": 1791904, + "step": 3085 + }, + { + "epoch": 0.46023235031277926, + "grad_norm": 0.038330078125, + "learning_rate": 0.006901251117068812, + "loss": 0.8377, + "num_input_tokens_seen": 1794816, + "step": 3090 + }, + { + "epoch": 0.46097706285373846, + "grad_norm": 0.0673828125, + "learning_rate": 0.006912421805183199, + "loss": 0.786, + "num_input_tokens_seen": 1797536, + "step": 3095 + }, + { + "epoch": 0.46172177539469766, + "grad_norm": 0.08349609375, + "learning_rate": 0.006923592493297587, + "loss": 0.8338, + "num_input_tokens_seen": 1800512, + "step": 3100 + }, + { + "epoch": 0.4624664879356568, + "grad_norm": 0.06884765625, + "learning_rate": 0.006934763181411975, + "loss": 0.8189, + "num_input_tokens_seen": 1803328, + "step": 3105 + }, + { + "epoch": 0.463211200476616, + "grad_norm": 0.08544921875, + "learning_rate": 0.006945933869526363, + "loss": 0.824, + "num_input_tokens_seen": 1806176, + "step": 3110 + }, + { + "epoch": 0.4639559130175752, + "grad_norm": 0.03857421875, + "learning_rate": 0.00695710455764075, + "loss": 0.8025, + "num_input_tokens_seen": 1809056, + "step": 3115 + }, + { + "epoch": 0.4647006255585344, + "grad_norm": 0.032470703125, + "learning_rate": 0.006968275245755138, + "loss": 0.8, + "num_input_tokens_seen": 1812032, + "step": 3120 + }, + { + "epoch": 0.4654453380994936, + "grad_norm": 0.06982421875, + "learning_rate": 0.006979445933869526, + "loss": 0.8116, + "num_input_tokens_seen": 1814976, + "step": 3125 + }, + { + "epoch": 0.46619005064045277, + "grad_norm": 0.0291748046875, + "learning_rate": 0.006990616621983914, + "loss": 0.7993, + "num_input_tokens_seen": 1818080, + "step": 3130 + }, + { + "epoch": 0.46693476318141197, + "grad_norm": 0.06201171875, + "learning_rate": 0.007001787310098302, + "loss": 0.7906, + "num_input_tokens_seen": 1821216, + "step": 3135 + }, + { + "epoch": 0.4676794757223712, + "grad_norm": 0.07373046875, + "learning_rate": 0.00701295799821269, + "loss": 0.8016, + "num_input_tokens_seen": 1824000, + "step": 3140 + }, + { + "epoch": 0.4684241882633304, + "grad_norm": 0.02978515625, + "learning_rate": 0.007024128686327078, + "loss": 0.8138, + "num_input_tokens_seen": 1826560, + "step": 3145 + }, + { + "epoch": 0.4691689008042895, + "grad_norm": 0.07177734375, + "learning_rate": 0.007035299374441466, + "loss": 0.8105, + "num_input_tokens_seen": 1829248, + "step": 3150 + }, + { + "epoch": 0.4699136133452487, + "grad_norm": 0.0498046875, + "learning_rate": 0.007046470062555854, + "loss": 0.8002, + "num_input_tokens_seen": 1832032, + "step": 3155 + }, + { + "epoch": 0.47065832588620793, + "grad_norm": 0.0576171875, + "learning_rate": 0.007057640750670241, + "loss": 0.7858, + "num_input_tokens_seen": 1835328, + "step": 3160 + }, + { + "epoch": 0.47140303842716713, + "grad_norm": 0.0556640625, + "learning_rate": 0.007068811438784629, + "loss": 0.8007, + "num_input_tokens_seen": 1838176, + "step": 3165 + }, + { + "epoch": 0.4721477509681263, + "grad_norm": 0.06982421875, + "learning_rate": 0.007079982126899017, + "loss": 0.8187, + "num_input_tokens_seen": 1841824, + "step": 3170 + }, + { + "epoch": 0.4728924635090855, + "grad_norm": 0.055419921875, + "learning_rate": 0.007091152815013405, + "loss": 0.8047, + "num_input_tokens_seen": 1845024, + "step": 3175 + }, + { + "epoch": 0.4736371760500447, + "grad_norm": 0.05322265625, + "learning_rate": 0.007102323503127792, + "loss": 0.8032, + "num_input_tokens_seen": 1847744, + "step": 3180 + }, + { + "epoch": 0.4743818885910039, + "grad_norm": 0.05908203125, + "learning_rate": 0.00711349419124218, + "loss": 0.7917, + "num_input_tokens_seen": 1850560, + "step": 3185 + }, + { + "epoch": 0.4751266011319631, + "grad_norm": 0.08056640625, + "learning_rate": 0.007124664879356568, + "loss": 0.7921, + "num_input_tokens_seen": 1853504, + "step": 3190 + }, + { + "epoch": 0.47587131367292224, + "grad_norm": 0.038818359375, + "learning_rate": 0.007135835567470956, + "loss": 0.7934, + "num_input_tokens_seen": 1856480, + "step": 3195 + }, + { + "epoch": 0.47661602621388144, + "grad_norm": 0.035888671875, + "learning_rate": 0.007147006255585345, + "loss": 0.8105, + "num_input_tokens_seen": 1859392, + "step": 3200 + }, + { + "epoch": 0.47736073875484064, + "grad_norm": 0.072265625, + "learning_rate": 0.007158176943699732, + "loss": 0.7797, + "num_input_tokens_seen": 1862240, + "step": 3205 + }, + { + "epoch": 0.47810545129579984, + "grad_norm": 0.08349609375, + "learning_rate": 0.00716934763181412, + "loss": 0.7975, + "num_input_tokens_seen": 1865024, + "step": 3210 + }, + { + "epoch": 0.478850163836759, + "grad_norm": 0.09716796875, + "learning_rate": 0.007180518319928508, + "loss": 0.7932, + "num_input_tokens_seen": 1867776, + "step": 3215 + }, + { + "epoch": 0.4795948763777182, + "grad_norm": 0.03564453125, + "learning_rate": 0.007191689008042896, + "loss": 0.8089, + "num_input_tokens_seen": 1870752, + "step": 3220 + }, + { + "epoch": 0.4803395889186774, + "grad_norm": 0.060791015625, + "learning_rate": 0.007202859696157283, + "loss": 0.7776, + "num_input_tokens_seen": 1873888, + "step": 3225 + }, + { + "epoch": 0.4810843014596366, + "grad_norm": 0.08740234375, + "learning_rate": 0.007214030384271671, + "loss": 0.7947, + "num_input_tokens_seen": 1876896, + "step": 3230 + }, + { + "epoch": 0.48182901400059575, + "grad_norm": 0.054931640625, + "learning_rate": 0.007225201072386059, + "loss": 0.7909, + "num_input_tokens_seen": 1879360, + "step": 3235 + }, + { + "epoch": 0.48257372654155495, + "grad_norm": 0.09619140625, + "learning_rate": 0.007236371760500447, + "loss": 0.8397, + "num_input_tokens_seen": 1882592, + "step": 3240 + }, + { + "epoch": 0.48331843908251415, + "grad_norm": 0.091796875, + "learning_rate": 0.007247542448614834, + "loss": 0.8302, + "num_input_tokens_seen": 1885728, + "step": 3245 + }, + { + "epoch": 0.48406315162347335, + "grad_norm": 0.053466796875, + "learning_rate": 0.007258713136729222, + "loss": 0.7873, + "num_input_tokens_seen": 1888896, + "step": 3250 + }, + { + "epoch": 0.48480786416443256, + "grad_norm": 0.05078125, + "learning_rate": 0.0072698838248436105, + "loss": 0.814, + "num_input_tokens_seen": 1892000, + "step": 3255 + }, + { + "epoch": 0.4855525767053917, + "grad_norm": 0.080078125, + "learning_rate": 0.0072810545129579985, + "loss": 0.8294, + "num_input_tokens_seen": 1894784, + "step": 3260 + }, + { + "epoch": 0.4862972892463509, + "grad_norm": 0.056640625, + "learning_rate": 0.007292225201072386, + "loss": 0.8205, + "num_input_tokens_seen": 1897440, + "step": 3265 + }, + { + "epoch": 0.4870420017873101, + "grad_norm": 0.1357421875, + "learning_rate": 0.0073033958891867735, + "loss": 0.8331, + "num_input_tokens_seen": 1900352, + "step": 3270 + }, + { + "epoch": 0.4877867143282693, + "grad_norm": 0.07080078125, + "learning_rate": 0.0073145665773011615, + "loss": 0.8037, + "num_input_tokens_seen": 1903392, + "step": 3275 + }, + { + "epoch": 0.48853142686922846, + "grad_norm": 0.05419921875, + "learning_rate": 0.0073257372654155495, + "loss": 0.8034, + "num_input_tokens_seen": 1906176, + "step": 3280 + }, + { + "epoch": 0.48927613941018766, + "grad_norm": 0.043212890625, + "learning_rate": 0.0073369079535299375, + "loss": 0.8142, + "num_input_tokens_seen": 1909024, + "step": 3285 + }, + { + "epoch": 0.49002085195114686, + "grad_norm": 0.054931640625, + "learning_rate": 0.0073480786416443246, + "loss": 0.8086, + "num_input_tokens_seen": 1912160, + "step": 3290 + }, + { + "epoch": 0.49076556449210607, + "grad_norm": 0.08349609375, + "learning_rate": 0.0073592493297587125, + "loss": 0.8198, + "num_input_tokens_seen": 1915104, + "step": 3295 + }, + { + "epoch": 0.4915102770330652, + "grad_norm": 0.09814453125, + "learning_rate": 0.0073704200178731005, + "loss": 0.7962, + "num_input_tokens_seen": 1917984, + "step": 3300 + }, + { + "epoch": 0.4922549895740244, + "grad_norm": 0.1640625, + "learning_rate": 0.0073815907059874885, + "loss": 0.8198, + "num_input_tokens_seen": 1921152, + "step": 3305 + }, + { + "epoch": 0.4929997021149836, + "grad_norm": 0.08447265625, + "learning_rate": 0.007392761394101876, + "loss": 0.8, + "num_input_tokens_seen": 1923872, + "step": 3310 + }, + { + "epoch": 0.4937444146559428, + "grad_norm": 0.10595703125, + "learning_rate": 0.007403932082216264, + "loss": 0.765, + "num_input_tokens_seen": 1926848, + "step": 3315 + }, + { + "epoch": 0.494489127196902, + "grad_norm": 0.23828125, + "learning_rate": 0.007415102770330652, + "loss": 0.7882, + "num_input_tokens_seen": 1929920, + "step": 3320 + }, + { + "epoch": 0.49523383973786117, + "grad_norm": 0.083984375, + "learning_rate": 0.00742627345844504, + "loss": 0.8464, + "num_input_tokens_seen": 1932736, + "step": 3325 + }, + { + "epoch": 0.4959785522788204, + "grad_norm": 0.1357421875, + "learning_rate": 0.0074374441465594274, + "loss": 0.8641, + "num_input_tokens_seen": 1935392, + "step": 3330 + }, + { + "epoch": 0.4967232648197796, + "grad_norm": 0.11865234375, + "learning_rate": 0.007448614834673815, + "loss": 0.8144, + "num_input_tokens_seen": 1938176, + "step": 3335 + }, + { + "epoch": 0.4974679773607388, + "grad_norm": 0.091796875, + "learning_rate": 0.007459785522788203, + "loss": 0.8023, + "num_input_tokens_seen": 1941056, + "step": 3340 + }, + { + "epoch": 0.4982126899016979, + "grad_norm": 0.12353515625, + "learning_rate": 0.007470956210902591, + "loss": 0.8095, + "num_input_tokens_seen": 1943936, + "step": 3345 + }, + { + "epoch": 0.4989574024426571, + "grad_norm": 0.080078125, + "learning_rate": 0.007482126899016979, + "loss": 0.7966, + "num_input_tokens_seen": 1946784, + "step": 3350 + }, + { + "epoch": 0.49970211498361633, + "grad_norm": 0.05126953125, + "learning_rate": 0.007493297587131366, + "loss": 0.7973, + "num_input_tokens_seen": 1949760, + "step": 3355 + }, + { + "epoch": 0.5004468275245755, + "grad_norm": 0.091796875, + "learning_rate": 0.007504468275245755, + "loss": 0.7992, + "num_input_tokens_seen": 1952352, + "step": 3360 + }, + { + "epoch": 0.5011915400655347, + "grad_norm": 0.0546875, + "learning_rate": 0.007515638963360142, + "loss": 0.7846, + "num_input_tokens_seen": 1955072, + "step": 3365 + }, + { + "epoch": 0.5019362526064939, + "grad_norm": 0.0810546875, + "learning_rate": 0.00752680965147453, + "loss": 0.8269, + "num_input_tokens_seen": 1957696, + "step": 3370 + }, + { + "epoch": 0.5026809651474531, + "grad_norm": 0.1064453125, + "learning_rate": 0.007537980339588918, + "loss": 0.8188, + "num_input_tokens_seen": 1960736, + "step": 3375 + }, + { + "epoch": 0.5034256776884123, + "grad_norm": 0.05224609375, + "learning_rate": 0.007549151027703306, + "loss": 0.8032, + "num_input_tokens_seen": 1963616, + "step": 3380 + }, + { + "epoch": 0.5041703902293715, + "grad_norm": 0.0927734375, + "learning_rate": 0.007560321715817694, + "loss": 0.7796, + "num_input_tokens_seen": 1966528, + "step": 3385 + }, + { + "epoch": 0.5049151027703307, + "grad_norm": 0.07568359375, + "learning_rate": 0.007571492403932082, + "loss": 0.8188, + "num_input_tokens_seen": 1969312, + "step": 3390 + }, + { + "epoch": 0.5056598153112899, + "grad_norm": 0.134765625, + "learning_rate": 0.00758266309204647, + "loss": 0.8104, + "num_input_tokens_seen": 1972096, + "step": 3395 + }, + { + "epoch": 0.506404527852249, + "grad_norm": 0.0927734375, + "learning_rate": 0.007593833780160858, + "loss": 0.8005, + "num_input_tokens_seen": 1975040, + "step": 3400 + }, + { + "epoch": 0.5071492403932082, + "grad_norm": 0.083984375, + "learning_rate": 0.007605004468275246, + "loss": 0.7993, + "num_input_tokens_seen": 1977856, + "step": 3405 + }, + { + "epoch": 0.5078939529341674, + "grad_norm": 0.05810546875, + "learning_rate": 0.007616175156389632, + "loss": 0.7996, + "num_input_tokens_seen": 1980608, + "step": 3410 + }, + { + "epoch": 0.5086386654751266, + "grad_norm": 0.126953125, + "learning_rate": 0.00762734584450402, + "loss": 0.8088, + "num_input_tokens_seen": 1983328, + "step": 3415 + }, + { + "epoch": 0.5093833780160858, + "grad_norm": 0.06884765625, + "learning_rate": 0.007638516532618409, + "loss": 0.8219, + "num_input_tokens_seen": 1986400, + "step": 3420 + }, + { + "epoch": 0.510128090557045, + "grad_norm": 0.0859375, + "learning_rate": 0.007649687220732797, + "loss": 0.8118, + "num_input_tokens_seen": 1989248, + "step": 3425 + }, + { + "epoch": 0.5108728030980042, + "grad_norm": 0.061767578125, + "learning_rate": 0.007660857908847185, + "loss": 0.8146, + "num_input_tokens_seen": 1992224, + "step": 3430 + }, + { + "epoch": 0.5116175156389634, + "grad_norm": 0.1357421875, + "learning_rate": 0.007672028596961573, + "loss": 0.8196, + "num_input_tokens_seen": 1995136, + "step": 3435 + }, + { + "epoch": 0.5123622281799225, + "grad_norm": 0.103515625, + "learning_rate": 0.007683199285075961, + "loss": 0.8153, + "num_input_tokens_seen": 1997792, + "step": 3440 + }, + { + "epoch": 0.5131069407208817, + "grad_norm": 0.10693359375, + "learning_rate": 0.007694369973190349, + "loss": 0.79, + "num_input_tokens_seen": 2000896, + "step": 3445 + }, + { + "epoch": 0.5138516532618409, + "grad_norm": 0.11474609375, + "learning_rate": 0.007705540661304735, + "loss": 0.8089, + "num_input_tokens_seen": 2003840, + "step": 3450 + }, + { + "epoch": 0.5145963658028001, + "grad_norm": 0.035400390625, + "learning_rate": 0.007716711349419123, + "loss": 0.813, + "num_input_tokens_seen": 2006496, + "step": 3455 + }, + { + "epoch": 0.5153410783437593, + "grad_norm": 0.04345703125, + "learning_rate": 0.007727882037533511, + "loss": 0.8309, + "num_input_tokens_seen": 2009568, + "step": 3460 + }, + { + "epoch": 0.5160857908847185, + "grad_norm": 0.034912109375, + "learning_rate": 0.007739052725647899, + "loss": 0.8067, + "num_input_tokens_seen": 2012384, + "step": 3465 + }, + { + "epoch": 0.5168305034256777, + "grad_norm": 0.072265625, + "learning_rate": 0.007750223413762287, + "loss": 0.796, + "num_input_tokens_seen": 2015360, + "step": 3470 + }, + { + "epoch": 0.5175752159666369, + "grad_norm": 0.0654296875, + "learning_rate": 0.007761394101876675, + "loss": 0.8089, + "num_input_tokens_seen": 2018048, + "step": 3475 + }, + { + "epoch": 0.5183199285075961, + "grad_norm": 0.040283203125, + "learning_rate": 0.007772564789991064, + "loss": 0.7924, + "num_input_tokens_seen": 2020960, + "step": 3480 + }, + { + "epoch": 0.5190646410485552, + "grad_norm": 0.035888671875, + "learning_rate": 0.007783735478105452, + "loss": 0.8008, + "num_input_tokens_seen": 2023744, + "step": 3485 + }, + { + "epoch": 0.5198093535895144, + "grad_norm": 0.0732421875, + "learning_rate": 0.00779490616621984, + "loss": 0.7945, + "num_input_tokens_seen": 2026592, + "step": 3490 + }, + { + "epoch": 0.5205540661304736, + "grad_norm": 0.09716796875, + "learning_rate": 0.007806076854334226, + "loss": 0.8172, + "num_input_tokens_seen": 2029728, + "step": 3495 + }, + { + "epoch": 0.5212987786714328, + "grad_norm": 0.06884765625, + "learning_rate": 0.007817247542448613, + "loss": 0.8178, + "num_input_tokens_seen": 2032672, + "step": 3500 + }, + { + "epoch": 0.522043491212392, + "grad_norm": 0.041259765625, + "learning_rate": 0.007828418230563003, + "loss": 0.796, + "num_input_tokens_seen": 2035808, + "step": 3505 + }, + { + "epoch": 0.5227882037533512, + "grad_norm": 0.035400390625, + "learning_rate": 0.007839588918677391, + "loss": 0.8049, + "num_input_tokens_seen": 2038720, + "step": 3510 + }, + { + "epoch": 0.5235329162943104, + "grad_norm": 0.07275390625, + "learning_rate": 0.007850759606791779, + "loss": 0.7857, + "num_input_tokens_seen": 2041664, + "step": 3515 + }, + { + "epoch": 0.5242776288352696, + "grad_norm": 0.08349609375, + "learning_rate": 0.007861930294906167, + "loss": 0.8076, + "num_input_tokens_seen": 2044800, + "step": 3520 + }, + { + "epoch": 0.5250223413762288, + "grad_norm": 0.08447265625, + "learning_rate": 0.007873100983020555, + "loss": 0.8136, + "num_input_tokens_seen": 2047392, + "step": 3525 + }, + { + "epoch": 0.5257670539171879, + "grad_norm": 0.078125, + "learning_rate": 0.007884271671134943, + "loss": 0.816, + "num_input_tokens_seen": 2050496, + "step": 3530 + }, + { + "epoch": 0.5265117664581471, + "grad_norm": 0.0693359375, + "learning_rate": 0.00789544235924933, + "loss": 0.8097, + "num_input_tokens_seen": 2053984, + "step": 3535 + }, + { + "epoch": 0.5272564789991063, + "grad_norm": 0.06689453125, + "learning_rate": 0.007906613047363717, + "loss": 0.7966, + "num_input_tokens_seen": 2057120, + "step": 3540 + }, + { + "epoch": 0.5280011915400655, + "grad_norm": 0.059814453125, + "learning_rate": 0.007917783735478105, + "loss": 0.7939, + "num_input_tokens_seen": 2059680, + "step": 3545 + }, + { + "epoch": 0.5287459040810247, + "grad_norm": 0.078125, + "learning_rate": 0.007928954423592493, + "loss": 0.8063, + "num_input_tokens_seen": 2062560, + "step": 3550 + }, + { + "epoch": 0.5294906166219839, + "grad_norm": 0.07373046875, + "learning_rate": 0.00794012511170688, + "loss": 0.8119, + "num_input_tokens_seen": 2065536, + "step": 3555 + }, + { + "epoch": 0.5302353291629431, + "grad_norm": 0.115234375, + "learning_rate": 0.007951295799821269, + "loss": 0.7921, + "num_input_tokens_seen": 2068416, + "step": 3560 + }, + { + "epoch": 0.5309800417039023, + "grad_norm": 0.060791015625, + "learning_rate": 0.007962466487935657, + "loss": 0.7863, + "num_input_tokens_seen": 2071328, + "step": 3565 + }, + { + "epoch": 0.5317247542448614, + "grad_norm": 0.07421875, + "learning_rate": 0.007973637176050045, + "loss": 0.8061, + "num_input_tokens_seen": 2074016, + "step": 3570 + }, + { + "epoch": 0.5324694667858206, + "grad_norm": 0.0888671875, + "learning_rate": 0.007984807864164433, + "loss": 0.7856, + "num_input_tokens_seen": 2076832, + "step": 3575 + }, + { + "epoch": 0.5332141793267798, + "grad_norm": 0.08740234375, + "learning_rate": 0.007995978552278819, + "loss": 0.8238, + "num_input_tokens_seen": 2079648, + "step": 3580 + }, + { + "epoch": 0.533958891867739, + "grad_norm": 0.042236328125, + "learning_rate": 0.008007149240393207, + "loss": 0.8098, + "num_input_tokens_seen": 2082656, + "step": 3585 + }, + { + "epoch": 0.5347036044086982, + "grad_norm": 0.056884765625, + "learning_rate": 0.008018319928507595, + "loss": 0.7879, + "num_input_tokens_seen": 2085664, + "step": 3590 + }, + { + "epoch": 0.5354483169496574, + "grad_norm": 0.0478515625, + "learning_rate": 0.008029490616621983, + "loss": 0.8279, + "num_input_tokens_seen": 2088768, + "step": 3595 + }, + { + "epoch": 0.5361930294906166, + "grad_norm": 0.0537109375, + "learning_rate": 0.00804066130473637, + "loss": 0.7891, + "num_input_tokens_seen": 2091648, + "step": 3600 + }, + { + "epoch": 0.5369377420315758, + "grad_norm": 0.1298828125, + "learning_rate": 0.008051831992850759, + "loss": 0.8175, + "num_input_tokens_seen": 2094816, + "step": 3605 + }, + { + "epoch": 0.537682454572535, + "grad_norm": 0.058837890625, + "learning_rate": 0.008063002680965147, + "loss": 0.7905, + "num_input_tokens_seen": 2097696, + "step": 3610 + }, + { + "epoch": 0.5384271671134941, + "grad_norm": 0.05615234375, + "learning_rate": 0.008074173369079535, + "loss": 0.8021, + "num_input_tokens_seen": 2100320, + "step": 3615 + }, + { + "epoch": 0.5391718796544533, + "grad_norm": 0.06396484375, + "learning_rate": 0.008085344057193924, + "loss": 0.8194, + "num_input_tokens_seen": 2103328, + "step": 3620 + }, + { + "epoch": 0.5399165921954125, + "grad_norm": 0.0830078125, + "learning_rate": 0.00809651474530831, + "loss": 0.7852, + "num_input_tokens_seen": 2106240, + "step": 3625 + }, + { + "epoch": 0.5406613047363718, + "grad_norm": 0.08349609375, + "learning_rate": 0.008107685433422699, + "loss": 0.7832, + "num_input_tokens_seen": 2109280, + "step": 3630 + }, + { + "epoch": 0.541406017277331, + "grad_norm": 0.07861328125, + "learning_rate": 0.008118856121537087, + "loss": 0.8327, + "num_input_tokens_seen": 2111936, + "step": 3635 + }, + { + "epoch": 0.5421507298182902, + "grad_norm": 0.08984375, + "learning_rate": 0.008130026809651475, + "loss": 0.7872, + "num_input_tokens_seen": 2114624, + "step": 3640 + }, + { + "epoch": 0.5428954423592494, + "grad_norm": 0.03466796875, + "learning_rate": 0.008141197497765863, + "loss": 0.7982, + "num_input_tokens_seen": 2117664, + "step": 3645 + }, + { + "epoch": 0.5436401549002086, + "grad_norm": 0.06982421875, + "learning_rate": 0.00815236818588025, + "loss": 0.8244, + "num_input_tokens_seen": 2120448, + "step": 3650 + }, + { + "epoch": 0.5443848674411678, + "grad_norm": 0.04931640625, + "learning_rate": 0.008163538873994639, + "loss": 0.8037, + "num_input_tokens_seen": 2123552, + "step": 3655 + }, + { + "epoch": 0.5451295799821269, + "grad_norm": 0.035888671875, + "learning_rate": 0.008174709562109026, + "loss": 0.8228, + "num_input_tokens_seen": 2126688, + "step": 3660 + }, + { + "epoch": 0.5458742925230861, + "grad_norm": 0.0712890625, + "learning_rate": 0.008185880250223414, + "loss": 0.7878, + "num_input_tokens_seen": 2130432, + "step": 3665 + }, + { + "epoch": 0.5466190050640453, + "grad_norm": 0.103515625, + "learning_rate": 0.0081970509383378, + "loss": 0.8284, + "num_input_tokens_seen": 2133376, + "step": 3670 + }, + { + "epoch": 0.5473637176050045, + "grad_norm": 0.0673828125, + "learning_rate": 0.008208221626452189, + "loss": 0.8404, + "num_input_tokens_seen": 2136576, + "step": 3675 + }, + { + "epoch": 0.5481084301459637, + "grad_norm": 0.05712890625, + "learning_rate": 0.008219392314566577, + "loss": 0.7991, + "num_input_tokens_seen": 2139776, + "step": 3680 + }, + { + "epoch": 0.5488531426869229, + "grad_norm": 0.1328125, + "learning_rate": 0.008230563002680965, + "loss": 0.7935, + "num_input_tokens_seen": 2142528, + "step": 3685 + }, + { + "epoch": 0.5495978552278821, + "grad_norm": 0.056396484375, + "learning_rate": 0.008241733690795353, + "loss": 0.8062, + "num_input_tokens_seen": 2145472, + "step": 3690 + }, + { + "epoch": 0.5503425677688413, + "grad_norm": 0.057373046875, + "learning_rate": 0.00825290437890974, + "loss": 0.7994, + "num_input_tokens_seen": 2148384, + "step": 3695 + }, + { + "epoch": 0.5510872803098004, + "grad_norm": 0.064453125, + "learning_rate": 0.008264075067024129, + "loss": 0.8154, + "num_input_tokens_seen": 2151136, + "step": 3700 + }, + { + "epoch": 0.5518319928507596, + "grad_norm": 0.1005859375, + "learning_rate": 0.008275245755138516, + "loss": 0.8141, + "num_input_tokens_seen": 2153952, + "step": 3705 + }, + { + "epoch": 0.5525767053917188, + "grad_norm": 0.05859375, + "learning_rate": 0.008286416443252903, + "loss": 0.7987, + "num_input_tokens_seen": 2156736, + "step": 3710 + }, + { + "epoch": 0.553321417932678, + "grad_norm": 0.05322265625, + "learning_rate": 0.00829758713136729, + "loss": 0.7933, + "num_input_tokens_seen": 2159776, + "step": 3715 + }, + { + "epoch": 0.5540661304736372, + "grad_norm": 0.0869140625, + "learning_rate": 0.008308757819481679, + "loss": 0.8128, + "num_input_tokens_seen": 2162656, + "step": 3720 + }, + { + "epoch": 0.5548108430145964, + "grad_norm": 0.0654296875, + "learning_rate": 0.008319928507596067, + "loss": 0.8011, + "num_input_tokens_seen": 2165792, + "step": 3725 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.0654296875, + "learning_rate": 0.008331099195710456, + "loss": 0.7935, + "num_input_tokens_seen": 2168704, + "step": 3730 + }, + { + "epoch": 0.5563002680965148, + "grad_norm": 0.056640625, + "learning_rate": 0.008342269883824844, + "loss": 0.8027, + "num_input_tokens_seen": 2171808, + "step": 3735 + }, + { + "epoch": 0.557044980637474, + "grad_norm": 0.034423828125, + "learning_rate": 0.008353440571939232, + "loss": 0.7986, + "num_input_tokens_seen": 2174816, + "step": 3740 + }, + { + "epoch": 0.5577896931784331, + "grad_norm": 0.06640625, + "learning_rate": 0.00836461126005362, + "loss": 0.775, + "num_input_tokens_seen": 2177536, + "step": 3745 + }, + { + "epoch": 0.5585344057193923, + "grad_norm": 0.06005859375, + "learning_rate": 0.008375781948168008, + "loss": 0.8222, + "num_input_tokens_seen": 2180416, + "step": 3750 + }, + { + "epoch": 0.5592791182603515, + "grad_norm": 0.037109375, + "learning_rate": 0.008386952636282394, + "loss": 0.8019, + "num_input_tokens_seen": 2183520, + "step": 3755 + }, + { + "epoch": 0.5600238308013107, + "grad_norm": 0.1103515625, + "learning_rate": 0.008398123324396782, + "loss": 0.7982, + "num_input_tokens_seen": 2186016, + "step": 3760 + }, + { + "epoch": 0.5607685433422699, + "grad_norm": 0.06982421875, + "learning_rate": 0.00840929401251117, + "loss": 0.8087, + "num_input_tokens_seen": 2189376, + "step": 3765 + }, + { + "epoch": 0.5615132558832291, + "grad_norm": 0.083984375, + "learning_rate": 0.008420464700625558, + "loss": 0.7916, + "num_input_tokens_seen": 2192160, + "step": 3770 + }, + { + "epoch": 0.5622579684241883, + "grad_norm": 0.05078125, + "learning_rate": 0.008431635388739946, + "loss": 0.846, + "num_input_tokens_seen": 2195200, + "step": 3775 + }, + { + "epoch": 0.5630026809651475, + "grad_norm": 0.06884765625, + "learning_rate": 0.008442806076854334, + "loss": 0.7751, + "num_input_tokens_seen": 2198048, + "step": 3780 + }, + { + "epoch": 0.5637473935061067, + "grad_norm": 0.08056640625, + "learning_rate": 0.008453976764968722, + "loss": 0.7933, + "num_input_tokens_seen": 2201024, + "step": 3785 + }, + { + "epoch": 0.5644921060470658, + "grad_norm": 0.03564453125, + "learning_rate": 0.00846514745308311, + "loss": 0.8075, + "num_input_tokens_seen": 2203712, + "step": 3790 + }, + { + "epoch": 0.565236818588025, + "grad_norm": 0.09326171875, + "learning_rate": 0.008476318141197496, + "loss": 0.7996, + "num_input_tokens_seen": 2206464, + "step": 3795 + }, + { + "epoch": 0.5659815311289842, + "grad_norm": 0.08447265625, + "learning_rate": 0.008487488829311884, + "loss": 0.794, + "num_input_tokens_seen": 2209504, + "step": 3800 + }, + { + "epoch": 0.5667262436699434, + "grad_norm": 0.05322265625, + "learning_rate": 0.008498659517426272, + "loss": 0.8085, + "num_input_tokens_seen": 2213088, + "step": 3805 + }, + { + "epoch": 0.5674709562109026, + "grad_norm": 0.054931640625, + "learning_rate": 0.00850983020554066, + "loss": 0.8204, + "num_input_tokens_seen": 2215904, + "step": 3810 + }, + { + "epoch": 0.5682156687518618, + "grad_norm": 0.0517578125, + "learning_rate": 0.008521000893655048, + "loss": 0.7623, + "num_input_tokens_seen": 2218848, + "step": 3815 + }, + { + "epoch": 0.568960381292821, + "grad_norm": 0.059326171875, + "learning_rate": 0.008532171581769436, + "loss": 0.8212, + "num_input_tokens_seen": 2221920, + "step": 3820 + }, + { + "epoch": 0.5697050938337802, + "grad_norm": 0.059326171875, + "learning_rate": 0.008543342269883824, + "loss": 0.8109, + "num_input_tokens_seen": 2224992, + "step": 3825 + }, + { + "epoch": 0.5704498063747393, + "grad_norm": 0.0810546875, + "learning_rate": 0.008554512957998212, + "loss": 0.8075, + "num_input_tokens_seen": 2227872, + "step": 3830 + }, + { + "epoch": 0.5711945189156985, + "grad_norm": 0.035888671875, + "learning_rate": 0.0085656836461126, + "loss": 0.7684, + "num_input_tokens_seen": 2230272, + "step": 3835 + }, + { + "epoch": 0.5719392314566577, + "grad_norm": 0.0625, + "learning_rate": 0.008576854334226986, + "loss": 0.7674, + "num_input_tokens_seen": 2233216, + "step": 3840 + }, + { + "epoch": 0.5726839439976169, + "grad_norm": 0.060791015625, + "learning_rate": 0.008588025022341376, + "loss": 0.8482, + "num_input_tokens_seen": 2236192, + "step": 3845 + }, + { + "epoch": 0.5734286565385761, + "grad_norm": 0.06494140625, + "learning_rate": 0.008599195710455764, + "loss": 0.8017, + "num_input_tokens_seen": 2239296, + "step": 3850 + }, + { + "epoch": 0.5741733690795353, + "grad_norm": 0.03271484375, + "learning_rate": 0.008610366398570152, + "loss": 0.8253, + "num_input_tokens_seen": 2242272, + "step": 3855 + }, + { + "epoch": 0.5749180816204945, + "grad_norm": 0.09228515625, + "learning_rate": 0.00862153708668454, + "loss": 0.7887, + "num_input_tokens_seen": 2245024, + "step": 3860 + }, + { + "epoch": 0.5756627941614537, + "grad_norm": 0.07470703125, + "learning_rate": 0.008632707774798928, + "loss": 0.801, + "num_input_tokens_seen": 2247744, + "step": 3865 + }, + { + "epoch": 0.5764075067024129, + "grad_norm": 0.07568359375, + "learning_rate": 0.008643878462913316, + "loss": 0.7703, + "num_input_tokens_seen": 2250624, + "step": 3870 + }, + { + "epoch": 0.577152219243372, + "grad_norm": 0.03955078125, + "learning_rate": 0.008655049151027704, + "loss": 0.8087, + "num_input_tokens_seen": 2253440, + "step": 3875 + }, + { + "epoch": 0.5778969317843312, + "grad_norm": 0.05517578125, + "learning_rate": 0.008666219839142092, + "loss": 0.8111, + "num_input_tokens_seen": 2256288, + "step": 3880 + }, + { + "epoch": 0.5786416443252904, + "grad_norm": 0.0556640625, + "learning_rate": 0.008677390527256478, + "loss": 0.8094, + "num_input_tokens_seen": 2259232, + "step": 3885 + }, + { + "epoch": 0.5793863568662496, + "grad_norm": 0.03515625, + "learning_rate": 0.008688561215370866, + "loss": 0.8219, + "num_input_tokens_seen": 2262400, + "step": 3890 + }, + { + "epoch": 0.5801310694072088, + "grad_norm": 0.031494140625, + "learning_rate": 0.008699731903485254, + "loss": 0.7938, + "num_input_tokens_seen": 2265056, + "step": 3895 + }, + { + "epoch": 0.580875781948168, + "grad_norm": 0.031005859375, + "learning_rate": 0.008710902591599642, + "loss": 0.7859, + "num_input_tokens_seen": 2268096, + "step": 3900 + }, + { + "epoch": 0.5816204944891272, + "grad_norm": 0.06494140625, + "learning_rate": 0.00872207327971403, + "loss": 0.782, + "num_input_tokens_seen": 2271136, + "step": 3905 + }, + { + "epoch": 0.5823652070300864, + "grad_norm": 0.0673828125, + "learning_rate": 0.008733243967828418, + "loss": 0.8139, + "num_input_tokens_seen": 2274048, + "step": 3910 + }, + { + "epoch": 0.5831099195710456, + "grad_norm": 0.050537109375, + "learning_rate": 0.008744414655942806, + "loss": 0.7835, + "num_input_tokens_seen": 2276736, + "step": 3915 + }, + { + "epoch": 0.5838546321120047, + "grad_norm": 0.03466796875, + "learning_rate": 0.008755585344057194, + "loss": 0.8034, + "num_input_tokens_seen": 2279584, + "step": 3920 + }, + { + "epoch": 0.5845993446529639, + "grad_norm": 0.0361328125, + "learning_rate": 0.00876675603217158, + "loss": 0.8201, + "num_input_tokens_seen": 2282592, + "step": 3925 + }, + { + "epoch": 0.5853440571939231, + "grad_norm": 0.0498046875, + "learning_rate": 0.008777926720285968, + "loss": 0.7893, + "num_input_tokens_seen": 2285824, + "step": 3930 + }, + { + "epoch": 0.5860887697348823, + "grad_norm": 0.06884765625, + "learning_rate": 0.008789097408400356, + "loss": 0.8064, + "num_input_tokens_seen": 2288672, + "step": 3935 + }, + { + "epoch": 0.5868334822758415, + "grad_norm": 0.05029296875, + "learning_rate": 0.008800268096514744, + "loss": 0.7908, + "num_input_tokens_seen": 2291520, + "step": 3940 + }, + { + "epoch": 0.5875781948168007, + "grad_norm": 0.059326171875, + "learning_rate": 0.008811438784629132, + "loss": 0.7738, + "num_input_tokens_seen": 2294048, + "step": 3945 + }, + { + "epoch": 0.5883229073577599, + "grad_norm": 0.04833984375, + "learning_rate": 0.00882260947274352, + "loss": 0.7884, + "num_input_tokens_seen": 2296832, + "step": 3950 + }, + { + "epoch": 0.5890676198987191, + "grad_norm": 0.052978515625, + "learning_rate": 0.008833780160857908, + "loss": 0.8101, + "num_input_tokens_seen": 2299808, + "step": 3955 + }, + { + "epoch": 0.5898123324396782, + "grad_norm": 0.06103515625, + "learning_rate": 0.008844950848972298, + "loss": 0.8054, + "num_input_tokens_seen": 2302592, + "step": 3960 + }, + { + "epoch": 0.5905570449806374, + "grad_norm": 0.0498046875, + "learning_rate": 0.008856121537086686, + "loss": 0.7896, + "num_input_tokens_seen": 2305344, + "step": 3965 + }, + { + "epoch": 0.5913017575215966, + "grad_norm": 0.0267333984375, + "learning_rate": 0.008867292225201072, + "loss": 0.7936, + "num_input_tokens_seen": 2308096, + "step": 3970 + }, + { + "epoch": 0.5920464700625558, + "grad_norm": 0.052734375, + "learning_rate": 0.00887846291331546, + "loss": 0.7878, + "num_input_tokens_seen": 2311104, + "step": 3975 + }, + { + "epoch": 0.592791182603515, + "grad_norm": 0.0927734375, + "learning_rate": 0.008889633601429848, + "loss": 0.7957, + "num_input_tokens_seen": 2314080, + "step": 3980 + }, + { + "epoch": 0.5935358951444742, + "grad_norm": 0.059814453125, + "learning_rate": 0.008900804289544236, + "loss": 0.7883, + "num_input_tokens_seen": 2317120, + "step": 3985 + }, + { + "epoch": 0.5942806076854334, + "grad_norm": 0.0703125, + "learning_rate": 0.008911974977658624, + "loss": 0.8281, + "num_input_tokens_seen": 2319968, + "step": 3990 + }, + { + "epoch": 0.5950253202263927, + "grad_norm": 0.049072265625, + "learning_rate": 0.008923145665773012, + "loss": 0.795, + "num_input_tokens_seen": 2322944, + "step": 3995 + }, + { + "epoch": 0.5957700327673519, + "grad_norm": 0.0576171875, + "learning_rate": 0.0089343163538874, + "loss": 0.7703, + "num_input_tokens_seen": 2325920, + "step": 4000 + }, + { + "epoch": 0.596514745308311, + "grad_norm": 0.03662109375, + "learning_rate": 0.008945487042001788, + "loss": 0.7966, + "num_input_tokens_seen": 2328768, + "step": 4005 + }, + { + "epoch": 0.5972594578492701, + "grad_norm": 0.052490234375, + "learning_rate": 0.008956657730116176, + "loss": 0.8396, + "num_input_tokens_seen": 2331616, + "step": 4010 + }, + { + "epoch": 0.5980041703902294, + "grad_norm": 0.07958984375, + "learning_rate": 0.008967828418230562, + "loss": 0.8076, + "num_input_tokens_seen": 2334592, + "step": 4015 + }, + { + "epoch": 0.5987488829311886, + "grad_norm": 0.043212890625, + "learning_rate": 0.00897899910634495, + "loss": 0.802, + "num_input_tokens_seen": 2337408, + "step": 4020 + }, + { + "epoch": 0.5994935954721478, + "grad_norm": 0.048583984375, + "learning_rate": 0.008990169794459338, + "loss": 0.798, + "num_input_tokens_seen": 2340320, + "step": 4025 + }, + { + "epoch": 0.600238308013107, + "grad_norm": 0.06298828125, + "learning_rate": 0.009001340482573726, + "loss": 0.7859, + "num_input_tokens_seen": 2343136, + "step": 4030 + }, + { + "epoch": 0.6009830205540662, + "grad_norm": 0.033935546875, + "learning_rate": 0.009012511170688114, + "loss": 0.7966, + "num_input_tokens_seen": 2346208, + "step": 4035 + }, + { + "epoch": 0.6017277330950254, + "grad_norm": 0.03515625, + "learning_rate": 0.009023681858802502, + "loss": 0.7902, + "num_input_tokens_seen": 2348864, + "step": 4040 + }, + { + "epoch": 0.6024724456359845, + "grad_norm": 0.061279296875, + "learning_rate": 0.00903485254691689, + "loss": 0.7895, + "num_input_tokens_seen": 2351520, + "step": 4045 + }, + { + "epoch": 0.6032171581769437, + "grad_norm": 0.05908203125, + "learning_rate": 0.009046023235031278, + "loss": 0.7987, + "num_input_tokens_seen": 2354592, + "step": 4050 + }, + { + "epoch": 0.6039618707179029, + "grad_norm": 0.032470703125, + "learning_rate": 0.009057193923145664, + "loss": 0.7909, + "num_input_tokens_seen": 2357376, + "step": 4055 + }, + { + "epoch": 0.6047065832588621, + "grad_norm": 0.046142578125, + "learning_rate": 0.009068364611260052, + "loss": 0.7862, + "num_input_tokens_seen": 2360128, + "step": 4060 + }, + { + "epoch": 0.6054512957998213, + "grad_norm": 0.044677734375, + "learning_rate": 0.00907953529937444, + "loss": 0.78, + "num_input_tokens_seen": 2363136, + "step": 4065 + }, + { + "epoch": 0.6061960083407805, + "grad_norm": 0.04833984375, + "learning_rate": 0.00909070598748883, + "loss": 0.7879, + "num_input_tokens_seen": 2365984, + "step": 4070 + }, + { + "epoch": 0.6069407208817397, + "grad_norm": 0.0546875, + "learning_rate": 0.009101876675603218, + "loss": 0.7751, + "num_input_tokens_seen": 2368640, + "step": 4075 + }, + { + "epoch": 0.6076854334226989, + "grad_norm": 0.07763671875, + "learning_rate": 0.009113047363717606, + "loss": 0.7679, + "num_input_tokens_seen": 2371424, + "step": 4080 + }, + { + "epoch": 0.6084301459636581, + "grad_norm": 0.0458984375, + "learning_rate": 0.009124218051831993, + "loss": 0.8093, + "num_input_tokens_seen": 2374240, + "step": 4085 + }, + { + "epoch": 0.6091748585046172, + "grad_norm": 0.051025390625, + "learning_rate": 0.009135388739946381, + "loss": 0.7717, + "num_input_tokens_seen": 2377376, + "step": 4090 + }, + { + "epoch": 0.6099195710455764, + "grad_norm": 0.055419921875, + "learning_rate": 0.00914655942806077, + "loss": 0.8076, + "num_input_tokens_seen": 2380128, + "step": 4095 + }, + { + "epoch": 0.6106642835865356, + "grad_norm": 0.05029296875, + "learning_rate": 0.009157730116175156, + "loss": 0.7631, + "num_input_tokens_seen": 2382944, + "step": 4100 + }, + { + "epoch": 0.6114089961274948, + "grad_norm": 0.07763671875, + "learning_rate": 0.009168900804289544, + "loss": 0.7851, + "num_input_tokens_seen": 2385824, + "step": 4105 + }, + { + "epoch": 0.612153708668454, + "grad_norm": 0.0732421875, + "learning_rate": 0.009180071492403932, + "loss": 0.9143, + "num_input_tokens_seen": 2388736, + "step": 4110 + }, + { + "epoch": 0.6128984212094132, + "grad_norm": 0.031982421875, + "learning_rate": 0.00919124218051832, + "loss": 0.8635, + "num_input_tokens_seen": 2391584, + "step": 4115 + }, + { + "epoch": 0.6136431337503724, + "grad_norm": 0.046875, + "learning_rate": 0.009202412868632708, + "loss": 0.7921, + "num_input_tokens_seen": 2394368, + "step": 4120 + }, + { + "epoch": 0.6143878462913316, + "grad_norm": 0.06396484375, + "learning_rate": 0.009213583556747095, + "loss": 0.8151, + "num_input_tokens_seen": 2397568, + "step": 4125 + }, + { + "epoch": 0.6151325588322908, + "grad_norm": 0.050048828125, + "learning_rate": 0.009224754244861483, + "loss": 0.8137, + "num_input_tokens_seen": 2400416, + "step": 4130 + }, + { + "epoch": 0.6158772713732499, + "grad_norm": 0.044921875, + "learning_rate": 0.009235924932975871, + "loss": 0.8102, + "num_input_tokens_seen": 2403360, + "step": 4135 + }, + { + "epoch": 0.6166219839142091, + "grad_norm": 0.0517578125, + "learning_rate": 0.00924709562109026, + "loss": 0.7946, + "num_input_tokens_seen": 2406112, + "step": 4140 + }, + { + "epoch": 0.6173666964551683, + "grad_norm": 0.047607421875, + "learning_rate": 0.009258266309204646, + "loss": 0.8033, + "num_input_tokens_seen": 2409120, + "step": 4145 + }, + { + "epoch": 0.6181114089961275, + "grad_norm": 0.0771484375, + "learning_rate": 0.009269436997319034, + "loss": 0.7955, + "num_input_tokens_seen": 2412000, + "step": 4150 + }, + { + "epoch": 0.6188561215370867, + "grad_norm": 0.053466796875, + "learning_rate": 0.009280607685433422, + "loss": 0.7905, + "num_input_tokens_seen": 2414624, + "step": 4155 + }, + { + "epoch": 0.6196008340780459, + "grad_norm": 0.0247802734375, + "learning_rate": 0.00929177837354781, + "loss": 0.7798, + "num_input_tokens_seen": 2417280, + "step": 4160 + }, + { + "epoch": 0.6203455466190051, + "grad_norm": 0.07080078125, + "learning_rate": 0.009302949061662198, + "loss": 0.7811, + "num_input_tokens_seen": 2420128, + "step": 4165 + }, + { + "epoch": 0.6210902591599643, + "grad_norm": 0.0625, + "learning_rate": 0.009314119749776585, + "loss": 0.7986, + "num_input_tokens_seen": 2423136, + "step": 4170 + }, + { + "epoch": 0.6218349717009234, + "grad_norm": 0.05615234375, + "learning_rate": 0.009325290437890973, + "loss": 0.8025, + "num_input_tokens_seen": 2426176, + "step": 4175 + }, + { + "epoch": 0.6225796842418826, + "grad_norm": 0.033935546875, + "learning_rate": 0.009336461126005361, + "loss": 0.8118, + "num_input_tokens_seen": 2429216, + "step": 4180 + }, + { + "epoch": 0.6233243967828418, + "grad_norm": 0.0634765625, + "learning_rate": 0.00934763181411975, + "loss": 0.8074, + "num_input_tokens_seen": 2432416, + "step": 4185 + }, + { + "epoch": 0.624069109323801, + "grad_norm": 0.047119140625, + "learning_rate": 0.009358802502234137, + "loss": 0.7722, + "num_input_tokens_seen": 2435264, + "step": 4190 + }, + { + "epoch": 0.6248138218647602, + "grad_norm": 0.062255859375, + "learning_rate": 0.009369973190348525, + "loss": 0.8144, + "num_input_tokens_seen": 2438336, + "step": 4195 + }, + { + "epoch": 0.6255585344057194, + "grad_norm": 0.07568359375, + "learning_rate": 0.009381143878462913, + "loss": 0.8083, + "num_input_tokens_seen": 2441376, + "step": 4200 + }, + { + "epoch": 0.6263032469466786, + "grad_norm": 0.0517578125, + "learning_rate": 0.009392314566577301, + "loss": 0.7892, + "num_input_tokens_seen": 2444544, + "step": 4205 + }, + { + "epoch": 0.6270479594876378, + "grad_norm": 0.09033203125, + "learning_rate": 0.00940348525469169, + "loss": 0.804, + "num_input_tokens_seen": 2447328, + "step": 4210 + }, + { + "epoch": 0.627792672028597, + "grad_norm": 0.103515625, + "learning_rate": 0.009414655942806077, + "loss": 0.8164, + "num_input_tokens_seen": 2450464, + "step": 4215 + }, + { + "epoch": 0.6285373845695561, + "grad_norm": 0.0537109375, + "learning_rate": 0.009425826630920465, + "loss": 0.8972, + "num_input_tokens_seen": 2453280, + "step": 4220 + }, + { + "epoch": 0.6292820971105153, + "grad_norm": 0.11328125, + "learning_rate": 0.009436997319034853, + "loss": 0.8273, + "num_input_tokens_seen": 2456256, + "step": 4225 + }, + { + "epoch": 0.6300268096514745, + "grad_norm": 0.0771484375, + "learning_rate": 0.00944816800714924, + "loss": 0.793, + "num_input_tokens_seen": 2459136, + "step": 4230 + }, + { + "epoch": 0.6307715221924337, + "grad_norm": 0.0751953125, + "learning_rate": 0.009459338695263627, + "loss": 0.7977, + "num_input_tokens_seen": 2461792, + "step": 4235 + }, + { + "epoch": 0.6315162347333929, + "grad_norm": 0.04150390625, + "learning_rate": 0.009470509383378015, + "loss": 0.8183, + "num_input_tokens_seen": 2464768, + "step": 4240 + }, + { + "epoch": 0.6322609472743521, + "grad_norm": 0.07470703125, + "learning_rate": 0.009481680071492403, + "loss": 0.7966, + "num_input_tokens_seen": 2467680, + "step": 4245 + }, + { + "epoch": 0.6330056598153113, + "grad_norm": 0.042724609375, + "learning_rate": 0.009492850759606791, + "loss": 0.7944, + "num_input_tokens_seen": 2470464, + "step": 4250 + }, + { + "epoch": 0.6337503723562705, + "grad_norm": 0.053466796875, + "learning_rate": 0.00950402144772118, + "loss": 0.7828, + "num_input_tokens_seen": 2473376, + "step": 4255 + }, + { + "epoch": 0.6344950848972297, + "grad_norm": 0.0673828125, + "learning_rate": 0.009515192135835567, + "loss": 0.7957, + "num_input_tokens_seen": 2476640, + "step": 4260 + }, + { + "epoch": 0.6352397974381888, + "grad_norm": 0.07470703125, + "learning_rate": 0.009526362823949955, + "loss": 0.7918, + "num_input_tokens_seen": 2479968, + "step": 4265 + }, + { + "epoch": 0.635984509979148, + "grad_norm": 0.06884765625, + "learning_rate": 0.009537533512064343, + "loss": 0.7856, + "num_input_tokens_seen": 2482816, + "step": 4270 + }, + { + "epoch": 0.6367292225201072, + "grad_norm": 0.1923828125, + "learning_rate": 0.00954870420017873, + "loss": 0.8232, + "num_input_tokens_seen": 2485568, + "step": 4275 + }, + { + "epoch": 0.6374739350610664, + "grad_norm": 0.142578125, + "learning_rate": 0.009559874888293117, + "loss": 0.7984, + "num_input_tokens_seen": 2488480, + "step": 4280 + }, + { + "epoch": 0.6382186476020256, + "grad_norm": 0.13671875, + "learning_rate": 0.009571045576407505, + "loss": 0.8232, + "num_input_tokens_seen": 2491360, + "step": 4285 + }, + { + "epoch": 0.6389633601429848, + "grad_norm": 0.06884765625, + "learning_rate": 0.009582216264521893, + "loss": 0.8204, + "num_input_tokens_seen": 2494272, + "step": 4290 + }, + { + "epoch": 0.639708072683944, + "grad_norm": 0.09619140625, + "learning_rate": 0.009593386952636283, + "loss": 0.798, + "num_input_tokens_seen": 2496928, + "step": 4295 + }, + { + "epoch": 0.6404527852249032, + "grad_norm": 0.11181640625, + "learning_rate": 0.009604557640750671, + "loss": 0.8097, + "num_input_tokens_seen": 2499680, + "step": 4300 + }, + { + "epoch": 0.6411974977658623, + "grad_norm": 0.10546875, + "learning_rate": 0.009615728328865059, + "loss": 0.805, + "num_input_tokens_seen": 2502400, + "step": 4305 + }, + { + "epoch": 0.6419422103068215, + "grad_norm": 0.251953125, + "learning_rate": 0.009626899016979447, + "loss": 0.8043, + "num_input_tokens_seen": 2505184, + "step": 4310 + }, + { + "epoch": 0.6426869228477807, + "grad_norm": 0.10205078125, + "learning_rate": 0.009638069705093833, + "loss": 0.8273, + "num_input_tokens_seen": 2508160, + "step": 4315 + }, + { + "epoch": 0.6434316353887399, + "grad_norm": 0.09375, + "learning_rate": 0.009649240393208221, + "loss": 0.7963, + "num_input_tokens_seen": 2510944, + "step": 4320 + }, + { + "epoch": 0.6441763479296991, + "grad_norm": 0.0732421875, + "learning_rate": 0.009660411081322609, + "loss": 0.9333, + "num_input_tokens_seen": 2513856, + "step": 4325 + }, + { + "epoch": 0.6449210604706583, + "grad_norm": 0.08740234375, + "learning_rate": 0.009671581769436997, + "loss": 0.7956, + "num_input_tokens_seen": 2516512, + "step": 4330 + }, + { + "epoch": 0.6456657730116175, + "grad_norm": 0.07861328125, + "learning_rate": 0.009682752457551385, + "loss": 0.8271, + "num_input_tokens_seen": 2519200, + "step": 4335 + }, + { + "epoch": 0.6464104855525767, + "grad_norm": 0.06640625, + "learning_rate": 0.009693923145665773, + "loss": 0.8016, + "num_input_tokens_seen": 2522016, + "step": 4340 + }, + { + "epoch": 0.6471551980935359, + "grad_norm": 0.0703125, + "learning_rate": 0.009705093833780161, + "loss": 0.8055, + "num_input_tokens_seen": 2524768, + "step": 4345 + }, + { + "epoch": 0.647899910634495, + "grad_norm": 0.142578125, + "learning_rate": 0.009716264521894549, + "loss": 0.7904, + "num_input_tokens_seen": 2527328, + "step": 4350 + }, + { + "epoch": 0.6486446231754542, + "grad_norm": 0.140625, + "learning_rate": 0.009727435210008937, + "loss": 0.7911, + "num_input_tokens_seen": 2530304, + "step": 4355 + }, + { + "epoch": 0.6493893357164134, + "grad_norm": 0.16796875, + "learning_rate": 0.009738605898123323, + "loss": 0.8286, + "num_input_tokens_seen": 2533536, + "step": 4360 + }, + { + "epoch": 0.6501340482573726, + "grad_norm": 0.158203125, + "learning_rate": 0.009749776586237711, + "loss": 0.7785, + "num_input_tokens_seen": 2536544, + "step": 4365 + }, + { + "epoch": 0.6508787607983318, + "grad_norm": 0.1640625, + "learning_rate": 0.009760947274352099, + "loss": 0.8489, + "num_input_tokens_seen": 2539456, + "step": 4370 + }, + { + "epoch": 0.651623473339291, + "grad_norm": 0.1064453125, + "learning_rate": 0.009772117962466487, + "loss": 0.815, + "num_input_tokens_seen": 2542528, + "step": 4375 + }, + { + "epoch": 0.6523681858802503, + "grad_norm": 0.1494140625, + "learning_rate": 0.009783288650580875, + "loss": 0.8119, + "num_input_tokens_seen": 2545664, + "step": 4380 + }, + { + "epoch": 0.6531128984212095, + "grad_norm": 0.162109375, + "learning_rate": 0.009794459338695263, + "loss": 0.7876, + "num_input_tokens_seen": 2548544, + "step": 4385 + }, + { + "epoch": 0.6538576109621687, + "grad_norm": 0.064453125, + "learning_rate": 0.009805630026809651, + "loss": 0.8323, + "num_input_tokens_seen": 2551392, + "step": 4390 + }, + { + "epoch": 0.6546023235031277, + "grad_norm": 0.08544921875, + "learning_rate": 0.009816800714924039, + "loss": 0.7707, + "num_input_tokens_seen": 2554336, + "step": 4395 + }, + { + "epoch": 0.655347036044087, + "grad_norm": 0.1982421875, + "learning_rate": 0.009827971403038427, + "loss": 0.7909, + "num_input_tokens_seen": 2557056, + "step": 4400 + }, + { + "epoch": 0.6560917485850462, + "grad_norm": 0.1650390625, + "learning_rate": 0.009839142091152815, + "loss": 0.8402, + "num_input_tokens_seen": 2559904, + "step": 4405 + }, + { + "epoch": 0.6568364611260054, + "grad_norm": 0.07421875, + "learning_rate": 0.009850312779267203, + "loss": 0.8202, + "num_input_tokens_seen": 2562752, + "step": 4410 + }, + { + "epoch": 0.6575811736669646, + "grad_norm": 0.04638671875, + "learning_rate": 0.00986148346738159, + "loss": 0.7745, + "num_input_tokens_seen": 2565536, + "step": 4415 + }, + { + "epoch": 0.6583258862079238, + "grad_norm": 0.171875, + "learning_rate": 0.009872654155495979, + "loss": 0.7915, + "num_input_tokens_seen": 2568512, + "step": 4420 + }, + { + "epoch": 0.659070598748883, + "grad_norm": 0.052490234375, + "learning_rate": 0.009883824843610367, + "loss": 0.7708, + "num_input_tokens_seen": 2571392, + "step": 4425 + }, + { + "epoch": 0.6598153112898422, + "grad_norm": 0.06396484375, + "learning_rate": 0.009894995531724755, + "loss": 0.7957, + "num_input_tokens_seen": 2574368, + "step": 4430 + }, + { + "epoch": 0.6605600238308013, + "grad_norm": 0.115234375, + "learning_rate": 0.009906166219839143, + "loss": 0.8209, + "num_input_tokens_seen": 2577440, + "step": 4435 + }, + { + "epoch": 0.6613047363717605, + "grad_norm": 0.07373046875, + "learning_rate": 0.00991733690795353, + "loss": 0.8521, + "num_input_tokens_seen": 2580608, + "step": 4440 + }, + { + "epoch": 0.6620494489127197, + "grad_norm": 0.1787109375, + "learning_rate": 0.009928507596067917, + "loss": 0.8234, + "num_input_tokens_seen": 2583232, + "step": 4445 + }, + { + "epoch": 0.6627941614536789, + "grad_norm": 0.037841796875, + "learning_rate": 0.009939678284182305, + "loss": 0.7927, + "num_input_tokens_seen": 2586208, + "step": 4450 + }, + { + "epoch": 0.6635388739946381, + "grad_norm": 0.04248046875, + "learning_rate": 0.009950848972296693, + "loss": 0.8258, + "num_input_tokens_seen": 2589120, + "step": 4455 + }, + { + "epoch": 0.6642835865355973, + "grad_norm": 0.09326171875, + "learning_rate": 0.00996201966041108, + "loss": 0.828, + "num_input_tokens_seen": 2591872, + "step": 4460 + }, + { + "epoch": 0.6650282990765565, + "grad_norm": 0.052001953125, + "learning_rate": 0.009973190348525469, + "loss": 0.8193, + "num_input_tokens_seen": 2594944, + "step": 4465 + }, + { + "epoch": 0.6657730116175157, + "grad_norm": 0.095703125, + "learning_rate": 0.009984361036639857, + "loss": 0.8049, + "num_input_tokens_seen": 2597760, + "step": 4470 + }, + { + "epoch": 0.6665177241584749, + "grad_norm": 0.07275390625, + "learning_rate": 0.009995531724754245, + "loss": 0.8179, + "num_input_tokens_seen": 2600672, + "step": 4475 + }, + { + "epoch": 0.667262436699434, + "grad_norm": 0.0869140625, + "learning_rate": 0.010006702412868633, + "loss": 0.7972, + "num_input_tokens_seen": 2603456, + "step": 4480 + }, + { + "epoch": 0.6680071492403932, + "grad_norm": 0.039794921875, + "learning_rate": 0.01001787310098302, + "loss": 0.8243, + "num_input_tokens_seen": 2606496, + "step": 4485 + }, + { + "epoch": 0.6687518617813524, + "grad_norm": 0.08837890625, + "learning_rate": 0.010029043789097407, + "loss": 0.8134, + "num_input_tokens_seen": 2609216, + "step": 4490 + }, + { + "epoch": 0.6694965743223116, + "grad_norm": 0.0751953125, + "learning_rate": 0.010040214477211795, + "loss": 0.8137, + "num_input_tokens_seen": 2611936, + "step": 4495 + }, + { + "epoch": 0.6702412868632708, + "grad_norm": 0.083984375, + "learning_rate": 0.010051385165326183, + "loss": 0.7941, + "num_input_tokens_seen": 2614848, + "step": 4500 + }, + { + "epoch": 0.67098599940423, + "grad_norm": 0.06396484375, + "learning_rate": 0.01006255585344057, + "loss": 0.8299, + "num_input_tokens_seen": 2617856, + "step": 4505 + }, + { + "epoch": 0.6717307119451892, + "grad_norm": 0.0791015625, + "learning_rate": 0.010073726541554959, + "loss": 0.8129, + "num_input_tokens_seen": 2620960, + "step": 4510 + }, + { + "epoch": 0.6724754244861484, + "grad_norm": 0.044921875, + "learning_rate": 0.010084897229669347, + "loss": 0.81, + "num_input_tokens_seen": 2623936, + "step": 4515 + }, + { + "epoch": 0.6732201370271076, + "grad_norm": 0.0546875, + "learning_rate": 0.010096067917783736, + "loss": 0.8098, + "num_input_tokens_seen": 2626816, + "step": 4520 + }, + { + "epoch": 0.6739648495680667, + "grad_norm": 0.072265625, + "learning_rate": 0.010107238605898124, + "loss": 0.8034, + "num_input_tokens_seen": 2629504, + "step": 4525 + }, + { + "epoch": 0.6747095621090259, + "grad_norm": 0.042236328125, + "learning_rate": 0.010118409294012512, + "loss": 0.7997, + "num_input_tokens_seen": 2632032, + "step": 4530 + }, + { + "epoch": 0.6754542746499851, + "grad_norm": 0.037841796875, + "learning_rate": 0.010129579982126899, + "loss": 0.8466, + "num_input_tokens_seen": 2635104, + "step": 4535 + }, + { + "epoch": 0.6761989871909443, + "grad_norm": 0.055419921875, + "learning_rate": 0.010140750670241287, + "loss": 0.8073, + "num_input_tokens_seen": 2637888, + "step": 4540 + }, + { + "epoch": 0.6769436997319035, + "grad_norm": 0.05615234375, + "learning_rate": 0.010151921358355675, + "loss": 0.8084, + "num_input_tokens_seen": 2640608, + "step": 4545 + }, + { + "epoch": 0.6776884122728627, + "grad_norm": 0.036376953125, + "learning_rate": 0.010163092046470062, + "loss": 0.811, + "num_input_tokens_seen": 2643616, + "step": 4550 + }, + { + "epoch": 0.6784331248138219, + "grad_norm": 0.0712890625, + "learning_rate": 0.01017426273458445, + "loss": 0.8017, + "num_input_tokens_seen": 2646464, + "step": 4555 + }, + { + "epoch": 0.6791778373547811, + "grad_norm": 0.0390625, + "learning_rate": 0.010185433422698838, + "loss": 0.8059, + "num_input_tokens_seen": 2649280, + "step": 4560 + }, + { + "epoch": 0.6799225498957402, + "grad_norm": 0.0654296875, + "learning_rate": 0.010196604110813226, + "loss": 0.7891, + "num_input_tokens_seen": 2652288, + "step": 4565 + }, + { + "epoch": 0.6806672624366994, + "grad_norm": 0.033203125, + "learning_rate": 0.010207774798927614, + "loss": 0.7952, + "num_input_tokens_seen": 2655520, + "step": 4570 + }, + { + "epoch": 0.6814119749776586, + "grad_norm": 0.044677734375, + "learning_rate": 0.010218945487042, + "loss": 0.8198, + "num_input_tokens_seen": 2658336, + "step": 4575 + }, + { + "epoch": 0.6821566875186178, + "grad_norm": 0.0654296875, + "learning_rate": 0.010230116175156389, + "loss": 0.7843, + "num_input_tokens_seen": 2661568, + "step": 4580 + }, + { + "epoch": 0.682901400059577, + "grad_norm": 0.043212890625, + "learning_rate": 0.010241286863270777, + "loss": 0.8044, + "num_input_tokens_seen": 2664576, + "step": 4585 + }, + { + "epoch": 0.6836461126005362, + "grad_norm": 0.0322265625, + "learning_rate": 0.010252457551385164, + "loss": 0.8178, + "num_input_tokens_seen": 2667584, + "step": 4590 + }, + { + "epoch": 0.6843908251414954, + "grad_norm": 0.0908203125, + "learning_rate": 0.010263628239499552, + "loss": 0.7923, + "num_input_tokens_seen": 2670272, + "step": 4595 + }, + { + "epoch": 0.6851355376824546, + "grad_norm": 0.0546875, + "learning_rate": 0.01027479892761394, + "loss": 0.7942, + "num_input_tokens_seen": 2673312, + "step": 4600 + }, + { + "epoch": 0.6858802502234138, + "grad_norm": 0.0654296875, + "learning_rate": 0.010285969615728328, + "loss": 0.8352, + "num_input_tokens_seen": 2676512, + "step": 4605 + }, + { + "epoch": 0.6866249627643729, + "grad_norm": 0.06982421875, + "learning_rate": 0.010297140303842716, + "loss": 0.8083, + "num_input_tokens_seen": 2679488, + "step": 4610 + }, + { + "epoch": 0.6873696753053321, + "grad_norm": 0.0673828125, + "learning_rate": 0.010308310991957104, + "loss": 0.812, + "num_input_tokens_seen": 2682208, + "step": 4615 + }, + { + "epoch": 0.6881143878462913, + "grad_norm": 0.05322265625, + "learning_rate": 0.01031948168007149, + "loss": 0.7994, + "num_input_tokens_seen": 2684992, + "step": 4620 + }, + { + "epoch": 0.6888591003872505, + "grad_norm": 0.059326171875, + "learning_rate": 0.010330652368185879, + "loss": 0.803, + "num_input_tokens_seen": 2687840, + "step": 4625 + }, + { + "epoch": 0.6896038129282097, + "grad_norm": 0.11962890625, + "learning_rate": 0.010341823056300267, + "loss": 0.8278, + "num_input_tokens_seen": 2690592, + "step": 4630 + }, + { + "epoch": 0.6903485254691689, + "grad_norm": 0.0595703125, + "learning_rate": 0.010352993744414656, + "loss": 0.8109, + "num_input_tokens_seen": 2693472, + "step": 4635 + }, + { + "epoch": 0.6910932380101281, + "grad_norm": 0.0556640625, + "learning_rate": 0.010364164432529044, + "loss": 0.7867, + "num_input_tokens_seen": 2696256, + "step": 4640 + }, + { + "epoch": 0.6918379505510873, + "grad_norm": 0.0849609375, + "learning_rate": 0.010375335120643432, + "loss": 0.8002, + "num_input_tokens_seen": 2699104, + "step": 4645 + }, + { + "epoch": 0.6925826630920465, + "grad_norm": 0.053466796875, + "learning_rate": 0.01038650580875782, + "loss": 0.7941, + "num_input_tokens_seen": 2702240, + "step": 4650 + }, + { + "epoch": 0.6933273756330056, + "grad_norm": 0.046875, + "learning_rate": 0.010397676496872208, + "loss": 0.7886, + "num_input_tokens_seen": 2704864, + "step": 4655 + }, + { + "epoch": 0.6940720881739648, + "grad_norm": 0.0361328125, + "learning_rate": 0.010408847184986596, + "loss": 0.7907, + "num_input_tokens_seen": 2707552, + "step": 4660 + }, + { + "epoch": 0.694816800714924, + "grad_norm": 0.042724609375, + "learning_rate": 0.010420017873100982, + "loss": 0.8159, + "num_input_tokens_seen": 2710592, + "step": 4665 + }, + { + "epoch": 0.6955615132558832, + "grad_norm": 0.032470703125, + "learning_rate": 0.01043118856121537, + "loss": 0.8271, + "num_input_tokens_seen": 2713504, + "step": 4670 + }, + { + "epoch": 0.6963062257968424, + "grad_norm": 0.06494140625, + "learning_rate": 0.010442359249329758, + "loss": 0.8205, + "num_input_tokens_seen": 2716448, + "step": 4675 + }, + { + "epoch": 0.6970509383378016, + "grad_norm": 0.052978515625, + "learning_rate": 0.010453529937444146, + "loss": 0.7938, + "num_input_tokens_seen": 2719296, + "step": 4680 + }, + { + "epoch": 0.6977956508787608, + "grad_norm": 0.05126953125, + "learning_rate": 0.010464700625558534, + "loss": 0.814, + "num_input_tokens_seen": 2722112, + "step": 4685 + }, + { + "epoch": 0.69854036341972, + "grad_norm": 0.03173828125, + "learning_rate": 0.010475871313672922, + "loss": 0.8062, + "num_input_tokens_seen": 2725056, + "step": 4690 + }, + { + "epoch": 0.6992850759606791, + "grad_norm": 0.0859375, + "learning_rate": 0.01048704200178731, + "loss": 0.7832, + "num_input_tokens_seen": 2727936, + "step": 4695 + }, + { + "epoch": 0.7000297885016383, + "grad_norm": 0.047607421875, + "learning_rate": 0.010498212689901698, + "loss": 0.8186, + "num_input_tokens_seen": 2730656, + "step": 4700 + }, + { + "epoch": 0.7007745010425975, + "grad_norm": 0.042724609375, + "learning_rate": 0.010509383378016084, + "loss": 0.8121, + "num_input_tokens_seen": 2733888, + "step": 4705 + }, + { + "epoch": 0.7015192135835567, + "grad_norm": 0.054931640625, + "learning_rate": 0.010520554066130472, + "loss": 0.8147, + "num_input_tokens_seen": 2736640, + "step": 4710 + }, + { + "epoch": 0.7022639261245159, + "grad_norm": 0.03955078125, + "learning_rate": 0.01053172475424486, + "loss": 0.7913, + "num_input_tokens_seen": 2739360, + "step": 4715 + }, + { + "epoch": 0.7030086386654751, + "grad_norm": 0.050537109375, + "learning_rate": 0.010542895442359248, + "loss": 0.7983, + "num_input_tokens_seen": 2742080, + "step": 4720 + }, + { + "epoch": 0.7037533512064343, + "grad_norm": 0.061279296875, + "learning_rate": 0.010554066130473636, + "loss": 0.7878, + "num_input_tokens_seen": 2744736, + "step": 4725 + }, + { + "epoch": 0.7044980637473935, + "grad_norm": 0.050537109375, + "learning_rate": 0.010565236818588024, + "loss": 0.8143, + "num_input_tokens_seen": 2747712, + "step": 4730 + }, + { + "epoch": 0.7052427762883527, + "grad_norm": 0.045166015625, + "learning_rate": 0.010576407506702412, + "loss": 0.8126, + "num_input_tokens_seen": 2750688, + "step": 4735 + }, + { + "epoch": 0.7059874888293118, + "grad_norm": 0.078125, + "learning_rate": 0.0105875781948168, + "loss": 0.788, + "num_input_tokens_seen": 2753568, + "step": 4740 + }, + { + "epoch": 0.706732201370271, + "grad_norm": 0.055908203125, + "learning_rate": 0.010598748882931188, + "loss": 0.8227, + "num_input_tokens_seen": 2756416, + "step": 4745 + }, + { + "epoch": 0.7074769139112302, + "grad_norm": 0.054931640625, + "learning_rate": 0.010609919571045576, + "loss": 0.7904, + "num_input_tokens_seen": 2759488, + "step": 4750 + }, + { + "epoch": 0.7082216264521894, + "grad_norm": 0.0302734375, + "learning_rate": 0.010621090259159964, + "loss": 0.7927, + "num_input_tokens_seen": 2762432, + "step": 4755 + }, + { + "epoch": 0.7089663389931486, + "grad_norm": 0.046630859375, + "learning_rate": 0.010632260947274352, + "loss": 0.8189, + "num_input_tokens_seen": 2765504, + "step": 4760 + }, + { + "epoch": 0.7097110515341079, + "grad_norm": 0.0751953125, + "learning_rate": 0.01064343163538874, + "loss": 0.8001, + "num_input_tokens_seen": 2768416, + "step": 4765 + }, + { + "epoch": 0.710455764075067, + "grad_norm": 0.04443359375, + "learning_rate": 0.010654602323503128, + "loss": 0.7854, + "num_input_tokens_seen": 2771456, + "step": 4770 + }, + { + "epoch": 0.7112004766160263, + "grad_norm": 0.055908203125, + "learning_rate": 0.010665773011617516, + "loss": 0.7999, + "num_input_tokens_seen": 2774720, + "step": 4775 + }, + { + "epoch": 0.7119451891569855, + "grad_norm": 0.04345703125, + "learning_rate": 0.010676943699731904, + "loss": 0.813, + "num_input_tokens_seen": 2777600, + "step": 4780 + }, + { + "epoch": 0.7126899016979446, + "grad_norm": 0.05078125, + "learning_rate": 0.010688114387846292, + "loss": 0.8085, + "num_input_tokens_seen": 2780608, + "step": 4785 + }, + { + "epoch": 0.7134346142389038, + "grad_norm": 0.039306640625, + "learning_rate": 0.010699285075960678, + "loss": 0.7913, + "num_input_tokens_seen": 2783328, + "step": 4790 + }, + { + "epoch": 0.714179326779863, + "grad_norm": 0.06494140625, + "learning_rate": 0.010710455764075066, + "loss": 0.7852, + "num_input_tokens_seen": 2786336, + "step": 4795 + }, + { + "epoch": 0.7149240393208222, + "grad_norm": 0.06787109375, + "learning_rate": 0.010721626452189454, + "loss": 0.7872, + "num_input_tokens_seen": 2789024, + "step": 4800 + }, + { + "epoch": 0.7156687518617814, + "grad_norm": 0.040283203125, + "learning_rate": 0.010732797140303842, + "loss": 0.7805, + "num_input_tokens_seen": 2792064, + "step": 4805 + }, + { + "epoch": 0.7164134644027406, + "grad_norm": 0.07373046875, + "learning_rate": 0.01074396782841823, + "loss": 0.7826, + "num_input_tokens_seen": 2794848, + "step": 4810 + }, + { + "epoch": 0.7171581769436998, + "grad_norm": 0.09033203125, + "learning_rate": 0.010755138516532618, + "loss": 0.7861, + "num_input_tokens_seen": 2797728, + "step": 4815 + }, + { + "epoch": 0.717902889484659, + "grad_norm": 0.027587890625, + "learning_rate": 0.010766309204647006, + "loss": 0.8036, + "num_input_tokens_seen": 2800832, + "step": 4820 + }, + { + "epoch": 0.7186476020256181, + "grad_norm": 0.09912109375, + "learning_rate": 0.010777479892761394, + "loss": 0.8138, + "num_input_tokens_seen": 2803712, + "step": 4825 + }, + { + "epoch": 0.7193923145665773, + "grad_norm": 0.047119140625, + "learning_rate": 0.010788650580875782, + "loss": 0.7826, + "num_input_tokens_seen": 2806528, + "step": 4830 + }, + { + "epoch": 0.7201370271075365, + "grad_norm": 0.0791015625, + "learning_rate": 0.010799821268990168, + "loss": 0.8322, + "num_input_tokens_seen": 2809216, + "step": 4835 + }, + { + "epoch": 0.7208817396484957, + "grad_norm": 0.037109375, + "learning_rate": 0.010810991957104556, + "loss": 0.8075, + "num_input_tokens_seen": 2812224, + "step": 4840 + }, + { + "epoch": 0.7216264521894549, + "grad_norm": 0.028564453125, + "learning_rate": 0.010822162645218944, + "loss": 0.8076, + "num_input_tokens_seen": 2815008, + "step": 4845 + }, + { + "epoch": 0.7223711647304141, + "grad_norm": 0.038330078125, + "learning_rate": 0.010833333333333332, + "loss": 0.8223, + "num_input_tokens_seen": 2817632, + "step": 4850 + }, + { + "epoch": 0.7231158772713733, + "grad_norm": 0.0419921875, + "learning_rate": 0.01084450402144772, + "loss": 0.7809, + "num_input_tokens_seen": 2820352, + "step": 4855 + }, + { + "epoch": 0.7238605898123325, + "grad_norm": 0.043212890625, + "learning_rate": 0.01085567470956211, + "loss": 0.8073, + "num_input_tokens_seen": 2823328, + "step": 4860 + }, + { + "epoch": 0.7246053023532917, + "grad_norm": 0.046875, + "learning_rate": 0.010866845397676498, + "loss": 0.8025, + "num_input_tokens_seen": 2826368, + "step": 4865 + }, + { + "epoch": 0.7253500148942508, + "grad_norm": 0.049072265625, + "learning_rate": 0.010878016085790886, + "loss": 0.8311, + "num_input_tokens_seen": 2829504, + "step": 4870 + }, + { + "epoch": 0.72609472743521, + "grad_norm": 0.064453125, + "learning_rate": 0.010889186773905274, + "loss": 0.7888, + "num_input_tokens_seen": 2832384, + "step": 4875 + }, + { + "epoch": 0.7268394399761692, + "grad_norm": 0.042724609375, + "learning_rate": 0.01090035746201966, + "loss": 0.7739, + "num_input_tokens_seen": 2834976, + "step": 4880 + }, + { + "epoch": 0.7275841525171284, + "grad_norm": 0.0380859375, + "learning_rate": 0.010911528150134048, + "loss": 0.8229, + "num_input_tokens_seen": 2837952, + "step": 4885 + }, + { + "epoch": 0.7283288650580876, + "grad_norm": 0.051513671875, + "learning_rate": 0.010922698838248436, + "loss": 0.7819, + "num_input_tokens_seen": 2840736, + "step": 4890 + }, + { + "epoch": 0.7290735775990468, + "grad_norm": 0.0208740234375, + "learning_rate": 0.010933869526362824, + "loss": 0.7933, + "num_input_tokens_seen": 2843552, + "step": 4895 + }, + { + "epoch": 0.729818290140006, + "grad_norm": 0.0439453125, + "learning_rate": 0.010945040214477212, + "loss": 0.8024, + "num_input_tokens_seen": 2846592, + "step": 4900 + }, + { + "epoch": 0.7305630026809652, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0109562109025916, + "loss": 0.8262, + "num_input_tokens_seen": 2849376, + "step": 4905 + }, + { + "epoch": 0.7313077152219244, + "grad_norm": 0.04345703125, + "learning_rate": 0.010967381590705988, + "loss": 0.7848, + "num_input_tokens_seen": 2852032, + "step": 4910 + }, + { + "epoch": 0.7320524277628835, + "grad_norm": 0.06103515625, + "learning_rate": 0.010978552278820376, + "loss": 0.7767, + "num_input_tokens_seen": 2855360, + "step": 4915 + }, + { + "epoch": 0.7327971403038427, + "grad_norm": 0.057861328125, + "learning_rate": 0.010989722966934762, + "loss": 0.7981, + "num_input_tokens_seen": 2858080, + "step": 4920 + }, + { + "epoch": 0.7335418528448019, + "grad_norm": 0.05029296875, + "learning_rate": 0.01100089365504915, + "loss": 0.7916, + "num_input_tokens_seen": 2861024, + "step": 4925 + }, + { + "epoch": 0.7342865653857611, + "grad_norm": 0.0439453125, + "learning_rate": 0.011012064343163538, + "loss": 0.8535, + "num_input_tokens_seen": 2864128, + "step": 4930 + }, + { + "epoch": 0.7350312779267203, + "grad_norm": 0.0230712890625, + "learning_rate": 0.011023235031277926, + "loss": 0.8038, + "num_input_tokens_seen": 2866912, + "step": 4935 + }, + { + "epoch": 0.7357759904676795, + "grad_norm": 0.0693359375, + "learning_rate": 0.011034405719392314, + "loss": 0.8031, + "num_input_tokens_seen": 2869824, + "step": 4940 + }, + { + "epoch": 0.7365207030086387, + "grad_norm": 0.042236328125, + "learning_rate": 0.011045576407506702, + "loss": 0.8035, + "num_input_tokens_seen": 2872576, + "step": 4945 + }, + { + "epoch": 0.7372654155495979, + "grad_norm": 0.02734375, + "learning_rate": 0.01105674709562109, + "loss": 0.8126, + "num_input_tokens_seen": 2875552, + "step": 4950 + }, + { + "epoch": 0.738010128090557, + "grad_norm": 0.0257568359375, + "learning_rate": 0.011067917783735478, + "loss": 0.8082, + "num_input_tokens_seen": 2878560, + "step": 4955 + }, + { + "epoch": 0.7387548406315162, + "grad_norm": 0.04345703125, + "learning_rate": 0.011079088471849866, + "loss": 0.7934, + "num_input_tokens_seen": 2881344, + "step": 4960 + }, + { + "epoch": 0.7394995531724754, + "grad_norm": 0.041748046875, + "learning_rate": 0.011090259159964252, + "loss": 0.8208, + "num_input_tokens_seen": 2884416, + "step": 4965 + }, + { + "epoch": 0.7402442657134346, + "grad_norm": 0.040771484375, + "learning_rate": 0.011101429848078641, + "loss": 0.7721, + "num_input_tokens_seen": 2887264, + "step": 4970 + }, + { + "epoch": 0.7409889782543938, + "grad_norm": 0.07421875, + "learning_rate": 0.01111260053619303, + "loss": 0.8015, + "num_input_tokens_seen": 2890016, + "step": 4975 + }, + { + "epoch": 0.741733690795353, + "grad_norm": 0.03173828125, + "learning_rate": 0.011123771224307417, + "loss": 0.8046, + "num_input_tokens_seen": 2892832, + "step": 4980 + }, + { + "epoch": 0.7424784033363122, + "grad_norm": 0.048583984375, + "learning_rate": 0.011134941912421805, + "loss": 0.8097, + "num_input_tokens_seen": 2895904, + "step": 4985 + }, + { + "epoch": 0.7432231158772714, + "grad_norm": 0.04541015625, + "learning_rate": 0.011146112600536193, + "loss": 0.8006, + "num_input_tokens_seen": 2898848, + "step": 4990 + }, + { + "epoch": 0.7439678284182306, + "grad_norm": 0.0400390625, + "learning_rate": 0.011157283288650581, + "loss": 0.79, + "num_input_tokens_seen": 2901696, + "step": 4995 + }, + { + "epoch": 0.7447125409591897, + "grad_norm": 0.060791015625, + "learning_rate": 0.01116845397676497, + "loss": 0.8128, + "num_input_tokens_seen": 2904224, + "step": 5000 + }, + { + "epoch": 0.7454572535001489, + "grad_norm": 0.02880859375, + "learning_rate": 0.011179624664879357, + "loss": 0.823, + "num_input_tokens_seen": 2906848, + "step": 5005 + }, + { + "epoch": 0.7462019660411081, + "grad_norm": 0.07080078125, + "learning_rate": 0.011190795352993744, + "loss": 0.8122, + "num_input_tokens_seen": 2909568, + "step": 5010 + }, + { + "epoch": 0.7469466785820673, + "grad_norm": 0.04931640625, + "learning_rate": 0.011201966041108131, + "loss": 0.8033, + "num_input_tokens_seen": 2912352, + "step": 5015 + }, + { + "epoch": 0.7476913911230265, + "grad_norm": 0.0390625, + "learning_rate": 0.01121313672922252, + "loss": 0.7918, + "num_input_tokens_seen": 2914720, + "step": 5020 + }, + { + "epoch": 0.7484361036639857, + "grad_norm": 0.0478515625, + "learning_rate": 0.011224307417336907, + "loss": 0.8046, + "num_input_tokens_seen": 2917536, + "step": 5025 + }, + { + "epoch": 0.7491808162049449, + "grad_norm": 0.04541015625, + "learning_rate": 0.011235478105451295, + "loss": 0.8188, + "num_input_tokens_seen": 2920448, + "step": 5030 + }, + { + "epoch": 0.7499255287459041, + "grad_norm": 0.051513671875, + "learning_rate": 0.011246648793565683, + "loss": 0.8006, + "num_input_tokens_seen": 2923712, + "step": 5035 + }, + { + "epoch": 0.7506702412868632, + "grad_norm": 0.0654296875, + "learning_rate": 0.011257819481680071, + "loss": 0.8031, + "num_input_tokens_seen": 2926528, + "step": 5040 + }, + { + "epoch": 0.7514149538278224, + "grad_norm": 0.023681640625, + "learning_rate": 0.01126899016979446, + "loss": 0.7961, + "num_input_tokens_seen": 2929280, + "step": 5045 + }, + { + "epoch": 0.7521596663687816, + "grad_norm": 0.05908203125, + "learning_rate": 0.011280160857908846, + "loss": 0.8013, + "num_input_tokens_seen": 2932288, + "step": 5050 + }, + { + "epoch": 0.7529043789097408, + "grad_norm": 0.044677734375, + "learning_rate": 0.011291331546023234, + "loss": 0.8097, + "num_input_tokens_seen": 2934976, + "step": 5055 + }, + { + "epoch": 0.7536490914507, + "grad_norm": 0.04150390625, + "learning_rate": 0.011302502234137621, + "loss": 0.8029, + "num_input_tokens_seen": 2937856, + "step": 5060 + }, + { + "epoch": 0.7543938039916592, + "grad_norm": 0.052001953125, + "learning_rate": 0.01131367292225201, + "loss": 0.7936, + "num_input_tokens_seen": 2940384, + "step": 5065 + }, + { + "epoch": 0.7551385165326184, + "grad_norm": 0.037109375, + "learning_rate": 0.011324843610366397, + "loss": 0.8093, + "num_input_tokens_seen": 2943232, + "step": 5070 + }, + { + "epoch": 0.7558832290735776, + "grad_norm": 0.04052734375, + "learning_rate": 0.011336014298480785, + "loss": 0.8153, + "num_input_tokens_seen": 2946208, + "step": 5075 + }, + { + "epoch": 0.7566279416145368, + "grad_norm": 0.0419921875, + "learning_rate": 0.011347184986595173, + "loss": 0.7927, + "num_input_tokens_seen": 2949280, + "step": 5080 + }, + { + "epoch": 0.7573726541554959, + "grad_norm": 0.043212890625, + "learning_rate": 0.011358355674709563, + "loss": 0.8057, + "num_input_tokens_seen": 2951808, + "step": 5085 + }, + { + "epoch": 0.7581173666964551, + "grad_norm": 0.043212890625, + "learning_rate": 0.011369526362823951, + "loss": 0.8003, + "num_input_tokens_seen": 2954560, + "step": 5090 + }, + { + "epoch": 0.7588620792374143, + "grad_norm": 0.04150390625, + "learning_rate": 0.011380697050938337, + "loss": 0.8073, + "num_input_tokens_seen": 2957376, + "step": 5095 + }, + { + "epoch": 0.7596067917783735, + "grad_norm": 0.06298828125, + "learning_rate": 0.011391867739052725, + "loss": 0.7684, + "num_input_tokens_seen": 2960352, + "step": 5100 + }, + { + "epoch": 0.7603515043193327, + "grad_norm": 0.058837890625, + "learning_rate": 0.011403038427167113, + "loss": 0.8232, + "num_input_tokens_seen": 2963328, + "step": 5105 + }, + { + "epoch": 0.7610962168602919, + "grad_norm": 0.03857421875, + "learning_rate": 0.011414209115281501, + "loss": 0.7832, + "num_input_tokens_seen": 2966208, + "step": 5110 + }, + { + "epoch": 0.7618409294012511, + "grad_norm": 0.047607421875, + "learning_rate": 0.01142537980339589, + "loss": 0.7985, + "num_input_tokens_seen": 2969120, + "step": 5115 + }, + { + "epoch": 0.7625856419422103, + "grad_norm": 0.06640625, + "learning_rate": 0.011436550491510277, + "loss": 0.8185, + "num_input_tokens_seen": 2971776, + "step": 5120 + }, + { + "epoch": 0.7633303544831695, + "grad_norm": 0.052734375, + "learning_rate": 0.011447721179624665, + "loss": 0.7989, + "num_input_tokens_seen": 2974560, + "step": 5125 + }, + { + "epoch": 0.7640750670241286, + "grad_norm": 0.042236328125, + "learning_rate": 0.011458891867739053, + "loss": 0.8193, + "num_input_tokens_seen": 2977376, + "step": 5130 + }, + { + "epoch": 0.7648197795650878, + "grad_norm": 0.0419921875, + "learning_rate": 0.011470062555853441, + "loss": 0.7911, + "num_input_tokens_seen": 2980192, + "step": 5135 + }, + { + "epoch": 0.765564492106047, + "grad_norm": 0.07763671875, + "learning_rate": 0.011481233243967827, + "loss": 0.8178, + "num_input_tokens_seen": 2982848, + "step": 5140 + }, + { + "epoch": 0.7663092046470062, + "grad_norm": 0.037353515625, + "learning_rate": 0.011492403932082215, + "loss": 0.8119, + "num_input_tokens_seen": 2985344, + "step": 5145 + }, + { + "epoch": 0.7670539171879655, + "grad_norm": 0.046875, + "learning_rate": 0.011503574620196603, + "loss": 0.8021, + "num_input_tokens_seen": 2988128, + "step": 5150 + }, + { + "epoch": 0.7677986297289247, + "grad_norm": 0.033935546875, + "learning_rate": 0.011514745308310991, + "loss": 0.8009, + "num_input_tokens_seen": 2991040, + "step": 5155 + }, + { + "epoch": 0.7685433422698839, + "grad_norm": 0.035400390625, + "learning_rate": 0.011525915996425379, + "loss": 0.7944, + "num_input_tokens_seen": 2994112, + "step": 5160 + }, + { + "epoch": 0.7692880548108431, + "grad_norm": 0.05615234375, + "learning_rate": 0.011537086684539767, + "loss": 0.7979, + "num_input_tokens_seen": 2996960, + "step": 5165 + }, + { + "epoch": 0.7700327673518021, + "grad_norm": 0.02197265625, + "learning_rate": 0.011548257372654155, + "loss": 0.7947, + "num_input_tokens_seen": 2999680, + "step": 5170 + }, + { + "epoch": 0.7707774798927614, + "grad_norm": 0.047119140625, + "learning_rate": 0.011559428060768543, + "loss": 0.7796, + "num_input_tokens_seen": 3002336, + "step": 5175 + }, + { + "epoch": 0.7715221924337206, + "grad_norm": 0.0419921875, + "learning_rate": 0.01157059874888293, + "loss": 0.8133, + "num_input_tokens_seen": 3005216, + "step": 5180 + }, + { + "epoch": 0.7722669049746798, + "grad_norm": 0.060791015625, + "learning_rate": 0.011581769436997317, + "loss": 0.771, + "num_input_tokens_seen": 3008032, + "step": 5185 + }, + { + "epoch": 0.773011617515639, + "grad_norm": 0.0260009765625, + "learning_rate": 0.011592940125111705, + "loss": 0.7818, + "num_input_tokens_seen": 3010848, + "step": 5190 + }, + { + "epoch": 0.7737563300565982, + "grad_norm": 0.039794921875, + "learning_rate": 0.011604110813226095, + "loss": 0.8213, + "num_input_tokens_seen": 3013568, + "step": 5195 + }, + { + "epoch": 0.7745010425975574, + "grad_norm": 0.049560546875, + "learning_rate": 0.011615281501340483, + "loss": 0.8254, + "num_input_tokens_seen": 3016512, + "step": 5200 + }, + { + "epoch": 0.7752457551385166, + "grad_norm": 0.08203125, + "learning_rate": 0.01162645218945487, + "loss": 0.86, + "num_input_tokens_seen": 3019424, + "step": 5205 + }, + { + "epoch": 0.7759904676794758, + "grad_norm": 0.039306640625, + "learning_rate": 0.011637622877569259, + "loss": 0.7721, + "num_input_tokens_seen": 3022368, + "step": 5210 + }, + { + "epoch": 0.7767351802204349, + "grad_norm": 0.044921875, + "learning_rate": 0.011648793565683647, + "loss": 0.7979, + "num_input_tokens_seen": 3025120, + "step": 5215 + }, + { + "epoch": 0.7774798927613941, + "grad_norm": 0.035400390625, + "learning_rate": 0.011659964253798035, + "loss": 0.7989, + "num_input_tokens_seen": 3028032, + "step": 5220 + }, + { + "epoch": 0.7782246053023533, + "grad_norm": 0.041015625, + "learning_rate": 0.011671134941912421, + "loss": 0.7838, + "num_input_tokens_seen": 3030624, + "step": 5225 + }, + { + "epoch": 0.7789693178433125, + "grad_norm": 0.06396484375, + "learning_rate": 0.011682305630026809, + "loss": 0.8213, + "num_input_tokens_seen": 3033472, + "step": 5230 + }, + { + "epoch": 0.7797140303842717, + "grad_norm": 0.05224609375, + "learning_rate": 0.011693476318141197, + "loss": 0.7751, + "num_input_tokens_seen": 3036384, + "step": 5235 + }, + { + "epoch": 0.7804587429252309, + "grad_norm": 0.054931640625, + "learning_rate": 0.011704647006255585, + "loss": 0.7908, + "num_input_tokens_seen": 3039200, + "step": 5240 + }, + { + "epoch": 0.7812034554661901, + "grad_norm": 0.03857421875, + "learning_rate": 0.011715817694369973, + "loss": 0.8105, + "num_input_tokens_seen": 3042208, + "step": 5245 + }, + { + "epoch": 0.7819481680071493, + "grad_norm": 0.06591796875, + "learning_rate": 0.01172698838248436, + "loss": 0.8146, + "num_input_tokens_seen": 3044992, + "step": 5250 + }, + { + "epoch": 0.7826928805481085, + "grad_norm": 0.06201171875, + "learning_rate": 0.011738159070598749, + "loss": 0.7683, + "num_input_tokens_seen": 3048288, + "step": 5255 + }, + { + "epoch": 0.7834375930890676, + "grad_norm": 0.0289306640625, + "learning_rate": 0.011749329758713137, + "loss": 0.8038, + "num_input_tokens_seen": 3051072, + "step": 5260 + }, + { + "epoch": 0.7841823056300268, + "grad_norm": 0.0296630859375, + "learning_rate": 0.011760500446827525, + "loss": 0.804, + "num_input_tokens_seen": 3054016, + "step": 5265 + }, + { + "epoch": 0.784927018170986, + "grad_norm": 0.06640625, + "learning_rate": 0.011771671134941911, + "loss": 0.7972, + "num_input_tokens_seen": 3057088, + "step": 5270 + }, + { + "epoch": 0.7856717307119452, + "grad_norm": 0.046875, + "learning_rate": 0.011782841823056299, + "loss": 0.8222, + "num_input_tokens_seen": 3060096, + "step": 5275 + }, + { + "epoch": 0.7864164432529044, + "grad_norm": 0.0252685546875, + "learning_rate": 0.011794012511170687, + "loss": 0.8099, + "num_input_tokens_seen": 3063040, + "step": 5280 + }, + { + "epoch": 0.7871611557938636, + "grad_norm": 0.034423828125, + "learning_rate": 0.011805183199285075, + "loss": 0.7993, + "num_input_tokens_seen": 3065824, + "step": 5285 + }, + { + "epoch": 0.7879058683348228, + "grad_norm": 0.061767578125, + "learning_rate": 0.011816353887399463, + "loss": 0.8205, + "num_input_tokens_seen": 3068672, + "step": 5290 + }, + { + "epoch": 0.788650580875782, + "grad_norm": 0.0546875, + "learning_rate": 0.01182752457551385, + "loss": 0.8064, + "num_input_tokens_seen": 3071616, + "step": 5295 + }, + { + "epoch": 0.7893952934167411, + "grad_norm": 0.049560546875, + "learning_rate": 0.011838695263628239, + "loss": 0.8012, + "num_input_tokens_seen": 3074432, + "step": 5300 + }, + { + "epoch": 0.7901400059577003, + "grad_norm": 0.359375, + "learning_rate": 0.011849865951742627, + "loss": 0.8249, + "num_input_tokens_seen": 3077472, + "step": 5305 + }, + { + "epoch": 0.7908847184986595, + "grad_norm": 0.1318359375, + "learning_rate": 0.011861036639857015, + "loss": 0.7426, + "num_input_tokens_seen": 3080544, + "step": 5310 + }, + { + "epoch": 0.7916294310396187, + "grad_norm": 0.1396484375, + "learning_rate": 0.011872207327971403, + "loss": 0.8176, + "num_input_tokens_seen": 3083584, + "step": 5315 + }, + { + "epoch": 0.7923741435805779, + "grad_norm": 0.1865234375, + "learning_rate": 0.01188337801608579, + "loss": 0.8499, + "num_input_tokens_seen": 3086208, + "step": 5320 + }, + { + "epoch": 0.7931188561215371, + "grad_norm": 0.08349609375, + "learning_rate": 0.011894548704200179, + "loss": 0.7876, + "num_input_tokens_seen": 3089152, + "step": 5325 + }, + { + "epoch": 0.7938635686624963, + "grad_norm": 0.046142578125, + "learning_rate": 0.011905719392314567, + "loss": 0.804, + "num_input_tokens_seen": 3091968, + "step": 5330 + }, + { + "epoch": 0.7946082812034555, + "grad_norm": 0.0279541015625, + "learning_rate": 0.011916890080428955, + "loss": 0.8121, + "num_input_tokens_seen": 3095072, + "step": 5335 + }, + { + "epoch": 0.7953529937444147, + "grad_norm": 0.048828125, + "learning_rate": 0.011928060768543343, + "loss": 0.7974, + "num_input_tokens_seen": 3097920, + "step": 5340 + }, + { + "epoch": 0.7960977062853738, + "grad_norm": 0.0257568359375, + "learning_rate": 0.01193923145665773, + "loss": 0.81, + "num_input_tokens_seen": 3101056, + "step": 5345 + }, + { + "epoch": 0.796842418826333, + "grad_norm": 0.09716796875, + "learning_rate": 0.011950402144772118, + "loss": 0.8107, + "num_input_tokens_seen": 3103904, + "step": 5350 + }, + { + "epoch": 0.7975871313672922, + "grad_norm": 0.053955078125, + "learning_rate": 0.011961572832886505, + "loss": 0.81, + "num_input_tokens_seen": 3106912, + "step": 5355 + }, + { + "epoch": 0.7983318439082514, + "grad_norm": 0.0400390625, + "learning_rate": 0.011972743521000893, + "loss": 0.8053, + "num_input_tokens_seen": 3109408, + "step": 5360 + }, + { + "epoch": 0.7990765564492106, + "grad_norm": 0.041259765625, + "learning_rate": 0.01198391420911528, + "loss": 0.8013, + "num_input_tokens_seen": 3112352, + "step": 5365 + }, + { + "epoch": 0.7998212689901698, + "grad_norm": 0.030029296875, + "learning_rate": 0.011995084897229669, + "loss": 0.7944, + "num_input_tokens_seen": 3115328, + "step": 5370 + }, + { + "epoch": 0.800565981531129, + "grad_norm": 0.06005859375, + "learning_rate": 0.012006255585344057, + "loss": 0.81, + "num_input_tokens_seen": 3118112, + "step": 5375 + }, + { + "epoch": 0.8013106940720882, + "grad_norm": 0.0303955078125, + "learning_rate": 0.012017426273458445, + "loss": 0.8168, + "num_input_tokens_seen": 3121280, + "step": 5380 + }, + { + "epoch": 0.8020554066130474, + "grad_norm": 0.044921875, + "learning_rate": 0.012028596961572833, + "loss": 0.7833, + "num_input_tokens_seen": 3124352, + "step": 5385 + }, + { + "epoch": 0.8028001191540065, + "grad_norm": 0.0303955078125, + "learning_rate": 0.01203976764968722, + "loss": 0.7945, + "num_input_tokens_seen": 3127296, + "step": 5390 + }, + { + "epoch": 0.8035448316949657, + "grad_norm": 0.02880859375, + "learning_rate": 0.012050938337801608, + "loss": 0.82, + "num_input_tokens_seen": 3130336, + "step": 5395 + }, + { + "epoch": 0.8042895442359249, + "grad_norm": 0.029052734375, + "learning_rate": 0.012062109025915995, + "loss": 0.8261, + "num_input_tokens_seen": 3133408, + "step": 5400 + }, + { + "epoch": 0.8050342567768841, + "grad_norm": 0.04248046875, + "learning_rate": 0.012073279714030383, + "loss": 0.8018, + "num_input_tokens_seen": 3136352, + "step": 5405 + }, + { + "epoch": 0.8057789693178433, + "grad_norm": 0.049560546875, + "learning_rate": 0.01208445040214477, + "loss": 0.7927, + "num_input_tokens_seen": 3139232, + "step": 5410 + }, + { + "epoch": 0.8065236818588025, + "grad_norm": 0.061279296875, + "learning_rate": 0.012095621090259159, + "loss": 0.8014, + "num_input_tokens_seen": 3142080, + "step": 5415 + }, + { + "epoch": 0.8072683943997617, + "grad_norm": 0.045654296875, + "learning_rate": 0.012106791778373548, + "loss": 0.8158, + "num_input_tokens_seen": 3144864, + "step": 5420 + }, + { + "epoch": 0.8080131069407209, + "grad_norm": 0.044921875, + "learning_rate": 0.012117962466487936, + "loss": 0.8133, + "num_input_tokens_seen": 3147712, + "step": 5425 + }, + { + "epoch": 0.80875781948168, + "grad_norm": 0.043701171875, + "learning_rate": 0.012129133154602324, + "loss": 0.8073, + "num_input_tokens_seen": 3150688, + "step": 5430 + }, + { + "epoch": 0.8095025320226392, + "grad_norm": 0.025390625, + "learning_rate": 0.012140303842716712, + "loss": 0.8118, + "num_input_tokens_seen": 3153760, + "step": 5435 + }, + { + "epoch": 0.8102472445635984, + "grad_norm": 0.05712890625, + "learning_rate": 0.012151474530831098, + "loss": 0.7649, + "num_input_tokens_seen": 3156896, + "step": 5440 + }, + { + "epoch": 0.8109919571045576, + "grad_norm": 0.0252685546875, + "learning_rate": 0.012162645218945486, + "loss": 0.8234, + "num_input_tokens_seen": 3159648, + "step": 5445 + }, + { + "epoch": 0.8117366696455168, + "grad_norm": 0.06201171875, + "learning_rate": 0.012173815907059874, + "loss": 0.7999, + "num_input_tokens_seen": 3162432, + "step": 5450 + }, + { + "epoch": 0.812481382186476, + "grad_norm": 0.037353515625, + "learning_rate": 0.012184986595174262, + "loss": 0.8002, + "num_input_tokens_seen": 3165088, + "step": 5455 + }, + { + "epoch": 0.8132260947274352, + "grad_norm": 0.05517578125, + "learning_rate": 0.01219615728328865, + "loss": 0.8089, + "num_input_tokens_seen": 3167936, + "step": 5460 + }, + { + "epoch": 0.8139708072683944, + "grad_norm": 0.04248046875, + "learning_rate": 0.012207327971403038, + "loss": 0.7996, + "num_input_tokens_seen": 3170848, + "step": 5465 + }, + { + "epoch": 0.8147155198093536, + "grad_norm": 0.042236328125, + "learning_rate": 0.012218498659517426, + "loss": 0.8074, + "num_input_tokens_seen": 3173760, + "step": 5470 + }, + { + "epoch": 0.8154602323503127, + "grad_norm": 0.037841796875, + "learning_rate": 0.012229669347631814, + "loss": 0.7984, + "num_input_tokens_seen": 3176288, + "step": 5475 + }, + { + "epoch": 0.8162049448912719, + "grad_norm": 0.0458984375, + "learning_rate": 0.012240840035746202, + "loss": 0.8244, + "num_input_tokens_seen": 3179392, + "step": 5480 + }, + { + "epoch": 0.8169496574322311, + "grad_norm": 0.053466796875, + "learning_rate": 0.012252010723860588, + "loss": 0.8104, + "num_input_tokens_seen": 3182080, + "step": 5485 + }, + { + "epoch": 0.8176943699731903, + "grad_norm": 0.048583984375, + "learning_rate": 0.012263181411974976, + "loss": 0.7917, + "num_input_tokens_seen": 3184480, + "step": 5490 + }, + { + "epoch": 0.8184390825141495, + "grad_norm": 0.02734375, + "learning_rate": 0.012274352100089364, + "loss": 0.8091, + "num_input_tokens_seen": 3187424, + "step": 5495 + }, + { + "epoch": 0.8191837950551087, + "grad_norm": 0.06982421875, + "learning_rate": 0.012285522788203752, + "loss": 0.8054, + "num_input_tokens_seen": 3190304, + "step": 5500 + }, + { + "epoch": 0.819928507596068, + "grad_norm": 0.045166015625, + "learning_rate": 0.01229669347631814, + "loss": 0.8003, + "num_input_tokens_seen": 3193280, + "step": 5505 + }, + { + "epoch": 0.8206732201370271, + "grad_norm": 0.040283203125, + "learning_rate": 0.012307864164432528, + "loss": 0.8141, + "num_input_tokens_seen": 3196160, + "step": 5510 + }, + { + "epoch": 0.8214179326779864, + "grad_norm": 0.04638671875, + "learning_rate": 0.012319034852546916, + "loss": 0.7999, + "num_input_tokens_seen": 3199072, + "step": 5515 + }, + { + "epoch": 0.8221626452189454, + "grad_norm": 0.0439453125, + "learning_rate": 0.012330205540661304, + "loss": 0.811, + "num_input_tokens_seen": 3201856, + "step": 5520 + }, + { + "epoch": 0.8229073577599046, + "grad_norm": 0.04443359375, + "learning_rate": 0.012341376228775692, + "loss": 0.818, + "num_input_tokens_seen": 3204672, + "step": 5525 + }, + { + "epoch": 0.8236520703008638, + "grad_norm": 0.0458984375, + "learning_rate": 0.012352546916890078, + "loss": 0.7992, + "num_input_tokens_seen": 3207392, + "step": 5530 + }, + { + "epoch": 0.824396782841823, + "grad_norm": 0.025634765625, + "learning_rate": 0.012363717605004468, + "loss": 0.8103, + "num_input_tokens_seen": 3210208, + "step": 5535 + }, + { + "epoch": 0.8251414953827823, + "grad_norm": 0.046630859375, + "learning_rate": 0.012374888293118856, + "loss": 0.8085, + "num_input_tokens_seen": 3212832, + "step": 5540 + }, + { + "epoch": 0.8258862079237415, + "grad_norm": 0.04931640625, + "learning_rate": 0.012386058981233244, + "loss": 0.8143, + "num_input_tokens_seen": 3215360, + "step": 5545 + }, + { + "epoch": 0.8266309204647007, + "grad_norm": 0.048095703125, + "learning_rate": 0.012397229669347632, + "loss": 0.7987, + "num_input_tokens_seen": 3218208, + "step": 5550 + }, + { + "epoch": 0.8273756330056599, + "grad_norm": 0.05078125, + "learning_rate": 0.01240840035746202, + "loss": 0.813, + "num_input_tokens_seen": 3221248, + "step": 5555 + }, + { + "epoch": 0.828120345546619, + "grad_norm": 0.048095703125, + "learning_rate": 0.012419571045576408, + "loss": 0.8108, + "num_input_tokens_seen": 3224416, + "step": 5560 + }, + { + "epoch": 0.8288650580875782, + "grad_norm": 0.058837890625, + "learning_rate": 0.012430741733690796, + "loss": 0.8077, + "num_input_tokens_seen": 3227104, + "step": 5565 + }, + { + "epoch": 0.8296097706285374, + "grad_norm": 0.064453125, + "learning_rate": 0.012441912421805182, + "loss": 0.7919, + "num_input_tokens_seen": 3229856, + "step": 5570 + }, + { + "epoch": 0.8303544831694966, + "grad_norm": 0.042236328125, + "learning_rate": 0.01245308310991957, + "loss": 0.818, + "num_input_tokens_seen": 3232960, + "step": 5575 + }, + { + "epoch": 0.8310991957104558, + "grad_norm": 0.0537109375, + "learning_rate": 0.012464253798033958, + "loss": 0.8051, + "num_input_tokens_seen": 3235840, + "step": 5580 + }, + { + "epoch": 0.831843908251415, + "grad_norm": 0.047119140625, + "learning_rate": 0.012475424486148346, + "loss": 0.8106, + "num_input_tokens_seen": 3238496, + "step": 5585 + }, + { + "epoch": 0.8325886207923742, + "grad_norm": 0.05712890625, + "learning_rate": 0.012486595174262734, + "loss": 0.7981, + "num_input_tokens_seen": 3241088, + "step": 5590 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.0257568359375, + "learning_rate": 0.012497765862377122, + "loss": 0.8031, + "num_input_tokens_seen": 3244032, + "step": 5595 + }, + { + "epoch": 0.8340780458742926, + "grad_norm": 0.0771484375, + "learning_rate": 0.01250893655049151, + "loss": 0.8035, + "num_input_tokens_seen": 3247040, + "step": 5600 + }, + { + "epoch": 0.8348227584152517, + "grad_norm": 0.06689453125, + "learning_rate": 0.012520107238605898, + "loss": 0.7781, + "num_input_tokens_seen": 3250080, + "step": 5605 + }, + { + "epoch": 0.8355674709562109, + "grad_norm": 0.08935546875, + "learning_rate": 0.012531277926720286, + "loss": 0.8418, + "num_input_tokens_seen": 3252992, + "step": 5610 + }, + { + "epoch": 0.8363121834971701, + "grad_norm": 0.043701171875, + "learning_rate": 0.012542448614834672, + "loss": 0.7999, + "num_input_tokens_seen": 3256032, + "step": 5615 + }, + { + "epoch": 0.8370568960381293, + "grad_norm": 0.06298828125, + "learning_rate": 0.01255361930294906, + "loss": 0.7991, + "num_input_tokens_seen": 3258656, + "step": 5620 + }, + { + "epoch": 0.8378016085790885, + "grad_norm": 0.07080078125, + "learning_rate": 0.012564789991063448, + "loss": 0.82, + "num_input_tokens_seen": 3261472, + "step": 5625 + }, + { + "epoch": 0.8385463211200477, + "grad_norm": 0.040771484375, + "learning_rate": 0.012575960679177836, + "loss": 0.7853, + "num_input_tokens_seen": 3264576, + "step": 5630 + }, + { + "epoch": 0.8392910336610069, + "grad_norm": 0.04150390625, + "learning_rate": 0.012587131367292224, + "loss": 0.795, + "num_input_tokens_seen": 3267488, + "step": 5635 + }, + { + "epoch": 0.8400357462019661, + "grad_norm": 0.0223388671875, + "learning_rate": 0.012598302055406612, + "loss": 0.8036, + "num_input_tokens_seen": 3270432, + "step": 5640 + }, + { + "epoch": 0.8407804587429253, + "grad_norm": 0.0439453125, + "learning_rate": 0.012609472743521, + "loss": 0.7911, + "num_input_tokens_seen": 3273216, + "step": 5645 + }, + { + "epoch": 0.8415251712838844, + "grad_norm": 0.042724609375, + "learning_rate": 0.01262064343163539, + "loss": 0.7837, + "num_input_tokens_seen": 3276448, + "step": 5650 + }, + { + "epoch": 0.8422698838248436, + "grad_norm": 0.05029296875, + "learning_rate": 0.012631814119749778, + "loss": 0.8102, + "num_input_tokens_seen": 3279296, + "step": 5655 + }, + { + "epoch": 0.8430145963658028, + "grad_norm": 0.05517578125, + "learning_rate": 0.012642984807864164, + "loss": 0.8369, + "num_input_tokens_seen": 3282112, + "step": 5660 + }, + { + "epoch": 0.843759308906762, + "grad_norm": 0.0361328125, + "learning_rate": 0.012654155495978552, + "loss": 0.784, + "num_input_tokens_seen": 3285344, + "step": 5665 + }, + { + "epoch": 0.8445040214477212, + "grad_norm": 0.0262451171875, + "learning_rate": 0.01266532618409294, + "loss": 0.82, + "num_input_tokens_seen": 3288384, + "step": 5670 + }, + { + "epoch": 0.8452487339886804, + "grad_norm": 0.034423828125, + "learning_rate": 0.012676496872207328, + "loss": 0.7832, + "num_input_tokens_seen": 3291360, + "step": 5675 + }, + { + "epoch": 0.8459934465296396, + "grad_norm": 0.040771484375, + "learning_rate": 0.012687667560321716, + "loss": 0.8049, + "num_input_tokens_seen": 3294496, + "step": 5680 + }, + { + "epoch": 0.8467381590705988, + "grad_norm": 0.037109375, + "learning_rate": 0.012698838248436104, + "loss": 0.8111, + "num_input_tokens_seen": 3297216, + "step": 5685 + }, + { + "epoch": 0.8474828716115579, + "grad_norm": 0.046875, + "learning_rate": 0.012710008936550492, + "loss": 0.8078, + "num_input_tokens_seen": 3300000, + "step": 5690 + }, + { + "epoch": 0.8482275841525171, + "grad_norm": 0.06591796875, + "learning_rate": 0.01272117962466488, + "loss": 0.7891, + "num_input_tokens_seen": 3302560, + "step": 5695 + }, + { + "epoch": 0.8489722966934763, + "grad_norm": 0.038330078125, + "learning_rate": 0.012732350312779266, + "loss": 0.7877, + "num_input_tokens_seen": 3305312, + "step": 5700 + }, + { + "epoch": 0.8497170092344355, + "grad_norm": 0.0260009765625, + "learning_rate": 0.012743521000893654, + "loss": 0.8203, + "num_input_tokens_seen": 3308192, + "step": 5705 + }, + { + "epoch": 0.8504617217753947, + "grad_norm": 0.05078125, + "learning_rate": 0.012754691689008042, + "loss": 0.8158, + "num_input_tokens_seen": 3311296, + "step": 5710 + }, + { + "epoch": 0.8512064343163539, + "grad_norm": 0.02294921875, + "learning_rate": 0.01276586237712243, + "loss": 0.8054, + "num_input_tokens_seen": 3313984, + "step": 5715 + }, + { + "epoch": 0.8519511468573131, + "grad_norm": 0.06005859375, + "learning_rate": 0.012777033065236818, + "loss": 0.8266, + "num_input_tokens_seen": 3316864, + "step": 5720 + }, + { + "epoch": 0.8526958593982723, + "grad_norm": 0.040771484375, + "learning_rate": 0.012788203753351206, + "loss": 0.7955, + "num_input_tokens_seen": 3319840, + "step": 5725 + }, + { + "epoch": 0.8534405719392315, + "grad_norm": 0.05224609375, + "learning_rate": 0.012799374441465594, + "loss": 0.8058, + "num_input_tokens_seen": 3322720, + "step": 5730 + }, + { + "epoch": 0.8541852844801906, + "grad_norm": 0.040283203125, + "learning_rate": 0.012810545129579982, + "loss": 0.7936, + "num_input_tokens_seen": 3325536, + "step": 5735 + }, + { + "epoch": 0.8549299970211498, + "grad_norm": 0.0556640625, + "learning_rate": 0.01282171581769437, + "loss": 0.7893, + "num_input_tokens_seen": 3328544, + "step": 5740 + }, + { + "epoch": 0.855674709562109, + "grad_norm": 0.04345703125, + "learning_rate": 0.012832886505808756, + "loss": 0.8038, + "num_input_tokens_seen": 3331520, + "step": 5745 + }, + { + "epoch": 0.8564194221030682, + "grad_norm": 0.0634765625, + "learning_rate": 0.012844057193923144, + "loss": 0.803, + "num_input_tokens_seen": 3334464, + "step": 5750 + }, + { + "epoch": 0.8571641346440274, + "grad_norm": 0.06396484375, + "learning_rate": 0.012855227882037532, + "loss": 0.8052, + "num_input_tokens_seen": 3337248, + "step": 5755 + }, + { + "epoch": 0.8579088471849866, + "grad_norm": 0.0291748046875, + "learning_rate": 0.012866398570151922, + "loss": 0.7954, + "num_input_tokens_seen": 3340128, + "step": 5760 + }, + { + "epoch": 0.8586535597259458, + "grad_norm": 0.0279541015625, + "learning_rate": 0.01287756925826631, + "loss": 0.818, + "num_input_tokens_seen": 3342784, + "step": 5765 + }, + { + "epoch": 0.859398272266905, + "grad_norm": 0.0400390625, + "learning_rate": 0.012888739946380698, + "loss": 0.8068, + "num_input_tokens_seen": 3345504, + "step": 5770 + }, + { + "epoch": 0.8601429848078642, + "grad_norm": 0.04150390625, + "learning_rate": 0.012899910634495085, + "loss": 0.8134, + "num_input_tokens_seen": 3348416, + "step": 5775 + }, + { + "epoch": 0.8608876973488233, + "grad_norm": 0.045654296875, + "learning_rate": 0.012911081322609473, + "loss": 0.7868, + "num_input_tokens_seen": 3351296, + "step": 5780 + }, + { + "epoch": 0.8616324098897825, + "grad_norm": 0.05517578125, + "learning_rate": 0.01292225201072386, + "loss": 0.8039, + "num_input_tokens_seen": 3354368, + "step": 5785 + }, + { + "epoch": 0.8623771224307417, + "grad_norm": 0.0400390625, + "learning_rate": 0.012933422698838248, + "loss": 0.8047, + "num_input_tokens_seen": 3357120, + "step": 5790 + }, + { + "epoch": 0.8631218349717009, + "grad_norm": 0.04541015625, + "learning_rate": 0.012944593386952636, + "loss": 0.8105, + "num_input_tokens_seen": 3360192, + "step": 5795 + }, + { + "epoch": 0.8638665475126601, + "grad_norm": 0.058837890625, + "learning_rate": 0.012955764075067024, + "loss": 0.8054, + "num_input_tokens_seen": 3362944, + "step": 5800 + }, + { + "epoch": 0.8646112600536193, + "grad_norm": 0.046142578125, + "learning_rate": 0.012966934763181412, + "loss": 0.7974, + "num_input_tokens_seen": 3365920, + "step": 5805 + }, + { + "epoch": 0.8653559725945785, + "grad_norm": 0.046142578125, + "learning_rate": 0.0129781054512958, + "loss": 0.8119, + "num_input_tokens_seen": 3368832, + "step": 5810 + }, + { + "epoch": 0.8661006851355377, + "grad_norm": 0.0306396484375, + "learning_rate": 0.012989276139410188, + "loss": 0.8121, + "num_input_tokens_seen": 3371808, + "step": 5815 + }, + { + "epoch": 0.8668453976764968, + "grad_norm": 0.0390625, + "learning_rate": 0.013000446827524575, + "loss": 0.8148, + "num_input_tokens_seen": 3374976, + "step": 5820 + }, + { + "epoch": 0.867590110217456, + "grad_norm": 0.0693359375, + "learning_rate": 0.013011617515638963, + "loss": 0.7944, + "num_input_tokens_seen": 3378112, + "step": 5825 + }, + { + "epoch": 0.8683348227584152, + "grad_norm": 0.03125, + "learning_rate": 0.01302278820375335, + "loss": 0.8143, + "num_input_tokens_seen": 3381024, + "step": 5830 + }, + { + "epoch": 0.8690795352993744, + "grad_norm": 0.053955078125, + "learning_rate": 0.013033958891867738, + "loss": 0.8114, + "num_input_tokens_seen": 3383808, + "step": 5835 + }, + { + "epoch": 0.8698242478403336, + "grad_norm": 0.046875, + "learning_rate": 0.013045129579982126, + "loss": 0.8025, + "num_input_tokens_seen": 3386848, + "step": 5840 + }, + { + "epoch": 0.8705689603812928, + "grad_norm": 0.026611328125, + "learning_rate": 0.013056300268096514, + "loss": 0.8064, + "num_input_tokens_seen": 3389760, + "step": 5845 + }, + { + "epoch": 0.871313672922252, + "grad_norm": 0.0634765625, + "learning_rate": 0.013067470956210902, + "loss": 0.8154, + "num_input_tokens_seen": 3392704, + "step": 5850 + }, + { + "epoch": 0.8720583854632112, + "grad_norm": 0.07421875, + "learning_rate": 0.01307864164432529, + "loss": 0.8065, + "num_input_tokens_seen": 3395744, + "step": 5855 + }, + { + "epoch": 0.8728030980041704, + "grad_norm": 0.050048828125, + "learning_rate": 0.013089812332439677, + "loss": 0.8019, + "num_input_tokens_seen": 3398592, + "step": 5860 + }, + { + "epoch": 0.8735478105451295, + "grad_norm": 0.0634765625, + "learning_rate": 0.013100983020554065, + "loss": 0.805, + "num_input_tokens_seen": 3401216, + "step": 5865 + }, + { + "epoch": 0.8742925230860887, + "grad_norm": 0.055908203125, + "learning_rate": 0.013112153708668453, + "loss": 0.8075, + "num_input_tokens_seen": 3404256, + "step": 5870 + }, + { + "epoch": 0.8750372356270479, + "grad_norm": 0.0303955078125, + "learning_rate": 0.013123324396782841, + "loss": 0.8127, + "num_input_tokens_seen": 3407232, + "step": 5875 + }, + { + "epoch": 0.8757819481680071, + "grad_norm": 0.06298828125, + "learning_rate": 0.01313449508489723, + "loss": 0.8013, + "num_input_tokens_seen": 3409984, + "step": 5880 + }, + { + "epoch": 0.8765266607089663, + "grad_norm": 0.052734375, + "learning_rate": 0.013145665773011617, + "loss": 0.8063, + "num_input_tokens_seen": 3412768, + "step": 5885 + }, + { + "epoch": 0.8772713732499255, + "grad_norm": 0.1552734375, + "learning_rate": 0.013156836461126005, + "loss": 0.8171, + "num_input_tokens_seen": 3415456, + "step": 5890 + }, + { + "epoch": 0.8780160857908847, + "grad_norm": 0.049560546875, + "learning_rate": 0.013168007149240393, + "loss": 0.8067, + "num_input_tokens_seen": 3418368, + "step": 5895 + }, + { + "epoch": 0.878760798331844, + "grad_norm": 0.059814453125, + "learning_rate": 0.013179177837354781, + "loss": 0.816, + "num_input_tokens_seen": 3421248, + "step": 5900 + }, + { + "epoch": 0.8795055108728032, + "grad_norm": 0.051025390625, + "learning_rate": 0.01319034852546917, + "loss": 0.8065, + "num_input_tokens_seen": 3424544, + "step": 5905 + }, + { + "epoch": 0.8802502234137622, + "grad_norm": 0.0771484375, + "learning_rate": 0.013201519213583557, + "loss": 0.8183, + "num_input_tokens_seen": 3427584, + "step": 5910 + }, + { + "epoch": 0.8809949359547214, + "grad_norm": 0.049560546875, + "learning_rate": 0.013212689901697943, + "loss": 0.8124, + "num_input_tokens_seen": 3430400, + "step": 5915 + }, + { + "epoch": 0.8817396484956807, + "grad_norm": 0.04443359375, + "learning_rate": 0.013223860589812331, + "loss": 0.8006, + "num_input_tokens_seen": 3433248, + "step": 5920 + }, + { + "epoch": 0.8824843610366399, + "grad_norm": 0.047119140625, + "learning_rate": 0.01323503127792672, + "loss": 0.7992, + "num_input_tokens_seen": 3436032, + "step": 5925 + }, + { + "epoch": 0.883229073577599, + "grad_norm": 0.052978515625, + "learning_rate": 0.013246201966041107, + "loss": 0.8244, + "num_input_tokens_seen": 3438912, + "step": 5930 + }, + { + "epoch": 0.8839737861185583, + "grad_norm": 0.0576171875, + "learning_rate": 0.013257372654155495, + "loss": 0.7996, + "num_input_tokens_seen": 3441760, + "step": 5935 + }, + { + "epoch": 0.8847184986595175, + "grad_norm": 0.0673828125, + "learning_rate": 0.013268543342269883, + "loss": 0.8019, + "num_input_tokens_seen": 3444448, + "step": 5940 + }, + { + "epoch": 0.8854632112004767, + "grad_norm": 0.02392578125, + "learning_rate": 0.013279714030384271, + "loss": 0.8066, + "num_input_tokens_seen": 3447104, + "step": 5945 + }, + { + "epoch": 0.8862079237414358, + "grad_norm": 0.05126953125, + "learning_rate": 0.01329088471849866, + "loss": 0.8091, + "num_input_tokens_seen": 3449856, + "step": 5950 + }, + { + "epoch": 0.886952636282395, + "grad_norm": 0.02734375, + "learning_rate": 0.013302055406613047, + "loss": 0.7831, + "num_input_tokens_seen": 3452544, + "step": 5955 + }, + { + "epoch": 0.8876973488233542, + "grad_norm": 0.05810546875, + "learning_rate": 0.013313226094727433, + "loss": 0.8009, + "num_input_tokens_seen": 3455136, + "step": 5960 + }, + { + "epoch": 0.8884420613643134, + "grad_norm": 0.0400390625, + "learning_rate": 0.013324396782841821, + "loss": 0.8075, + "num_input_tokens_seen": 3458272, + "step": 5965 + }, + { + "epoch": 0.8891867739052726, + "grad_norm": 0.053466796875, + "learning_rate": 0.01333556747095621, + "loss": 0.8151, + "num_input_tokens_seen": 3461504, + "step": 5970 + }, + { + "epoch": 0.8899314864462318, + "grad_norm": 0.028564453125, + "learning_rate": 0.013346738159070597, + "loss": 0.7904, + "num_input_tokens_seen": 3464448, + "step": 5975 + }, + { + "epoch": 0.890676198987191, + "grad_norm": 0.08251953125, + "learning_rate": 0.013357908847184985, + "loss": 0.8159, + "num_input_tokens_seen": 3467264, + "step": 5980 + }, + { + "epoch": 0.8914209115281502, + "grad_norm": 0.0274658203125, + "learning_rate": 0.013369079535299375, + "loss": 0.7972, + "num_input_tokens_seen": 3470080, + "step": 5985 + }, + { + "epoch": 0.8921656240691094, + "grad_norm": 0.052001953125, + "learning_rate": 0.013380250223413763, + "loss": 0.8169, + "num_input_tokens_seen": 3473088, + "step": 5990 + }, + { + "epoch": 0.8929103366100685, + "grad_norm": 0.05078125, + "learning_rate": 0.013391420911528151, + "loss": 0.8193, + "num_input_tokens_seen": 3476032, + "step": 5995 + }, + { + "epoch": 0.8936550491510277, + "grad_norm": 0.04638671875, + "learning_rate": 0.013402591599642539, + "loss": 0.8121, + "num_input_tokens_seen": 3478560, + "step": 6000 + }, + { + "epoch": 0.8943997616919869, + "grad_norm": 0.038818359375, + "learning_rate": 0.013413762287756925, + "loss": 0.8072, + "num_input_tokens_seen": 3482048, + "step": 6005 + }, + { + "epoch": 0.8951444742329461, + "grad_norm": 0.041748046875, + "learning_rate": 0.013424932975871313, + "loss": 0.8076, + "num_input_tokens_seen": 3485184, + "step": 6010 + }, + { + "epoch": 0.8958891867739053, + "grad_norm": 0.038818359375, + "learning_rate": 0.013436103663985701, + "loss": 0.7952, + "num_input_tokens_seen": 3488000, + "step": 6015 + }, + { + "epoch": 0.8966338993148645, + "grad_norm": 0.0281982421875, + "learning_rate": 0.013447274352100089, + "loss": 0.7965, + "num_input_tokens_seen": 3490816, + "step": 6020 + }, + { + "epoch": 0.8973786118558237, + "grad_norm": 0.036376953125, + "learning_rate": 0.013458445040214477, + "loss": 0.7955, + "num_input_tokens_seen": 3493728, + "step": 6025 + }, + { + "epoch": 0.8981233243967829, + "grad_norm": 0.0400390625, + "learning_rate": 0.013469615728328865, + "loss": 0.8088, + "num_input_tokens_seen": 3496384, + "step": 6030 + }, + { + "epoch": 0.898868036937742, + "grad_norm": 0.0517578125, + "learning_rate": 0.013480786416443253, + "loss": 0.8103, + "num_input_tokens_seen": 3499104, + "step": 6035 + }, + { + "epoch": 0.8996127494787012, + "grad_norm": 0.02490234375, + "learning_rate": 0.013491957104557641, + "loss": 0.8183, + "num_input_tokens_seen": 3501888, + "step": 6040 + }, + { + "epoch": 0.9003574620196604, + "grad_norm": 0.060302734375, + "learning_rate": 0.013503127792672027, + "loss": 0.8303, + "num_input_tokens_seen": 3504608, + "step": 6045 + }, + { + "epoch": 0.9011021745606196, + "grad_norm": 0.036865234375, + "learning_rate": 0.013514298480786415, + "loss": 0.7928, + "num_input_tokens_seen": 3507232, + "step": 6050 + }, + { + "epoch": 0.9018468871015788, + "grad_norm": 0.03564453125, + "learning_rate": 0.013525469168900803, + "loss": 0.8083, + "num_input_tokens_seen": 3510080, + "step": 6055 + }, + { + "epoch": 0.902591599642538, + "grad_norm": 0.022705078125, + "learning_rate": 0.013536639857015191, + "loss": 0.8051, + "num_input_tokens_seen": 3513280, + "step": 6060 + }, + { + "epoch": 0.9033363121834972, + "grad_norm": 0.03759765625, + "learning_rate": 0.013547810545129579, + "loss": 0.7937, + "num_input_tokens_seen": 3516128, + "step": 6065 + }, + { + "epoch": 0.9040810247244564, + "grad_norm": 0.0546875, + "learning_rate": 0.013558981233243967, + "loss": 0.8062, + "num_input_tokens_seen": 3518944, + "step": 6070 + }, + { + "epoch": 0.9048257372654156, + "grad_norm": 0.033447265625, + "learning_rate": 0.013570151921358355, + "loss": 0.7959, + "num_input_tokens_seen": 3521824, + "step": 6075 + }, + { + "epoch": 0.9055704498063747, + "grad_norm": 0.037109375, + "learning_rate": 0.013581322609472743, + "loss": 0.7991, + "num_input_tokens_seen": 3524640, + "step": 6080 + }, + { + "epoch": 0.9063151623473339, + "grad_norm": 0.037841796875, + "learning_rate": 0.013592493297587131, + "loss": 0.813, + "num_input_tokens_seen": 3527648, + "step": 6085 + }, + { + "epoch": 0.9070598748882931, + "grad_norm": 0.03466796875, + "learning_rate": 0.013603663985701517, + "loss": 0.7946, + "num_input_tokens_seen": 3530720, + "step": 6090 + }, + { + "epoch": 0.9078045874292523, + "grad_norm": 0.05615234375, + "learning_rate": 0.013614834673815907, + "loss": 0.7993, + "num_input_tokens_seen": 3533600, + "step": 6095 + }, + { + "epoch": 0.9085492999702115, + "grad_norm": 0.048095703125, + "learning_rate": 0.013626005361930295, + "loss": 0.7929, + "num_input_tokens_seen": 3536320, + "step": 6100 + }, + { + "epoch": 0.9092940125111707, + "grad_norm": 0.035400390625, + "learning_rate": 0.013637176050044683, + "loss": 0.7831, + "num_input_tokens_seen": 3539104, + "step": 6105 + }, + { + "epoch": 0.9100387250521299, + "grad_norm": 0.024169921875, + "learning_rate": 0.01364834673815907, + "loss": 0.8207, + "num_input_tokens_seen": 3542272, + "step": 6110 + }, + { + "epoch": 0.9107834375930891, + "grad_norm": 0.045166015625, + "learning_rate": 0.013659517426273459, + "loss": 0.8237, + "num_input_tokens_seen": 3545120, + "step": 6115 + }, + { + "epoch": 0.9115281501340483, + "grad_norm": 0.042236328125, + "learning_rate": 0.013670688114387847, + "loss": 0.7968, + "num_input_tokens_seen": 3548128, + "step": 6120 + }, + { + "epoch": 0.9122728626750074, + "grad_norm": 0.04736328125, + "learning_rate": 0.013681858802502235, + "loss": 0.8067, + "num_input_tokens_seen": 3551712, + "step": 6125 + }, + { + "epoch": 0.9130175752159666, + "grad_norm": 0.036865234375, + "learning_rate": 0.013693029490616623, + "loss": 0.8164, + "num_input_tokens_seen": 3554784, + "step": 6130 + }, + { + "epoch": 0.9137622877569258, + "grad_norm": 0.035400390625, + "learning_rate": 0.013704200178731009, + "loss": 0.7939, + "num_input_tokens_seen": 3557888, + "step": 6135 + }, + { + "epoch": 0.914507000297885, + "grad_norm": 0.040771484375, + "learning_rate": 0.013715370866845397, + "loss": 0.7987, + "num_input_tokens_seen": 3561248, + "step": 6140 + }, + { + "epoch": 0.9152517128388442, + "grad_norm": 0.02099609375, + "learning_rate": 0.013726541554959785, + "loss": 0.8002, + "num_input_tokens_seen": 3564256, + "step": 6145 + }, + { + "epoch": 0.9159964253798034, + "grad_norm": 0.023193359375, + "learning_rate": 0.013737712243074173, + "loss": 0.8237, + "num_input_tokens_seen": 3567424, + "step": 6150 + }, + { + "epoch": 0.9167411379207626, + "grad_norm": 0.03369140625, + "learning_rate": 0.01374888293118856, + "loss": 0.7894, + "num_input_tokens_seen": 3570432, + "step": 6155 + }, + { + "epoch": 0.9174858504617218, + "grad_norm": 0.05859375, + "learning_rate": 0.013760053619302949, + "loss": 0.8018, + "num_input_tokens_seen": 3573216, + "step": 6160 + }, + { + "epoch": 0.9182305630026809, + "grad_norm": 0.0203857421875, + "learning_rate": 0.013771224307417337, + "loss": 0.8075, + "num_input_tokens_seen": 3575872, + "step": 6165 + }, + { + "epoch": 0.9189752755436401, + "grad_norm": 0.033203125, + "learning_rate": 0.013782394995531725, + "loss": 0.8003, + "num_input_tokens_seen": 3578848, + "step": 6170 + }, + { + "epoch": 0.9197199880845993, + "grad_norm": 0.08251953125, + "learning_rate": 0.013793565683646111, + "loss": 0.8148, + "num_input_tokens_seen": 3581952, + "step": 6175 + }, + { + "epoch": 0.9204647006255585, + "grad_norm": 0.0458984375, + "learning_rate": 0.013804736371760499, + "loss": 0.8258, + "num_input_tokens_seen": 3584960, + "step": 6180 + }, + { + "epoch": 0.9212094131665177, + "grad_norm": 0.0242919921875, + "learning_rate": 0.013815907059874887, + "loss": 0.8054, + "num_input_tokens_seen": 3588192, + "step": 6185 + }, + { + "epoch": 0.9219541257074769, + "grad_norm": 0.036865234375, + "learning_rate": 0.013827077747989275, + "loss": 0.813, + "num_input_tokens_seen": 3591072, + "step": 6190 + }, + { + "epoch": 0.9226988382484361, + "grad_norm": 0.041748046875, + "learning_rate": 0.013838248436103663, + "loss": 0.8193, + "num_input_tokens_seen": 3594048, + "step": 6195 + }, + { + "epoch": 0.9234435507893953, + "grad_norm": 0.036376953125, + "learning_rate": 0.01384941912421805, + "loss": 0.7956, + "num_input_tokens_seen": 3596960, + "step": 6200 + }, + { + "epoch": 0.9241882633303545, + "grad_norm": 0.048828125, + "learning_rate": 0.013860589812332439, + "loss": 0.8127, + "num_input_tokens_seen": 3600224, + "step": 6205 + }, + { + "epoch": 0.9249329758713136, + "grad_norm": 0.03466796875, + "learning_rate": 0.013871760500446828, + "loss": 0.7939, + "num_input_tokens_seen": 3603360, + "step": 6210 + }, + { + "epoch": 0.9256776884122728, + "grad_norm": 0.038818359375, + "learning_rate": 0.013882931188561216, + "loss": 0.8018, + "num_input_tokens_seen": 3606144, + "step": 6215 + }, + { + "epoch": 0.926422400953232, + "grad_norm": 0.064453125, + "learning_rate": 0.013894101876675603, + "loss": 0.8331, + "num_input_tokens_seen": 3609280, + "step": 6220 + }, + { + "epoch": 0.9271671134941912, + "grad_norm": 0.04541015625, + "learning_rate": 0.01390527256478999, + "loss": 0.8229, + "num_input_tokens_seen": 3612320, + "step": 6225 + }, + { + "epoch": 0.9279118260351504, + "grad_norm": 0.05908203125, + "learning_rate": 0.013916443252904379, + "loss": 0.8139, + "num_input_tokens_seen": 3615296, + "step": 6230 + }, + { + "epoch": 0.9286565385761096, + "grad_norm": 0.037841796875, + "learning_rate": 0.013927613941018767, + "loss": 0.8148, + "num_input_tokens_seen": 3618336, + "step": 6235 + }, + { + "epoch": 0.9294012511170688, + "grad_norm": 0.039794921875, + "learning_rate": 0.013938784629133154, + "loss": 0.8099, + "num_input_tokens_seen": 3621280, + "step": 6240 + }, + { + "epoch": 0.930145963658028, + "grad_norm": 0.048583984375, + "learning_rate": 0.013949955317247542, + "loss": 0.7992, + "num_input_tokens_seen": 3624352, + "step": 6245 + }, + { + "epoch": 0.9308906761989872, + "grad_norm": 0.060791015625, + "learning_rate": 0.01396112600536193, + "loss": 0.8125, + "num_input_tokens_seen": 3627488, + "step": 6250 + }, + { + "epoch": 0.9316353887399463, + "grad_norm": 0.0274658203125, + "learning_rate": 0.013972296693476318, + "loss": 0.8096, + "num_input_tokens_seen": 3630400, + "step": 6255 + }, + { + "epoch": 0.9323801012809055, + "grad_norm": 0.0234375, + "learning_rate": 0.013983467381590706, + "loss": 0.8004, + "num_input_tokens_seen": 3632928, + "step": 6260 + }, + { + "epoch": 0.9331248138218647, + "grad_norm": 0.045166015625, + "learning_rate": 0.013994638069705093, + "loss": 0.8005, + "num_input_tokens_seen": 3635840, + "step": 6265 + }, + { + "epoch": 0.9338695263628239, + "grad_norm": 0.0255126953125, + "learning_rate": 0.01400580875781948, + "loss": 0.8054, + "num_input_tokens_seen": 3638656, + "step": 6270 + }, + { + "epoch": 0.9346142389037831, + "grad_norm": 0.0693359375, + "learning_rate": 0.014016979445933869, + "loss": 0.8107, + "num_input_tokens_seen": 3641472, + "step": 6275 + }, + { + "epoch": 0.9353589514447423, + "grad_norm": 0.03955078125, + "learning_rate": 0.014028150134048257, + "loss": 0.8041, + "num_input_tokens_seen": 3644192, + "step": 6280 + }, + { + "epoch": 0.9361036639857016, + "grad_norm": 0.041259765625, + "learning_rate": 0.014039320822162644, + "loss": 0.818, + "num_input_tokens_seen": 3646976, + "step": 6285 + }, + { + "epoch": 0.9368483765266608, + "grad_norm": 0.05712890625, + "learning_rate": 0.014050491510277032, + "loss": 0.8055, + "num_input_tokens_seen": 3649856, + "step": 6290 + }, + { + "epoch": 0.9375930890676198, + "grad_norm": 0.04052734375, + "learning_rate": 0.01406166219839142, + "loss": 0.7981, + "num_input_tokens_seen": 3652736, + "step": 6295 + }, + { + "epoch": 0.938337801608579, + "grad_norm": 0.041259765625, + "learning_rate": 0.014072832886505808, + "loss": 0.7895, + "num_input_tokens_seen": 3655648, + "step": 6300 + }, + { + "epoch": 0.9390825141495382, + "grad_norm": 0.07958984375, + "learning_rate": 0.014084003574620195, + "loss": 0.8139, + "num_input_tokens_seen": 3658432, + "step": 6305 + }, + { + "epoch": 0.9398272266904975, + "grad_norm": 0.060546875, + "learning_rate": 0.014095174262734583, + "loss": 0.8006, + "num_input_tokens_seen": 3661280, + "step": 6310 + }, + { + "epoch": 0.9405719392314567, + "grad_norm": 0.026123046875, + "learning_rate": 0.01410634495084897, + "loss": 0.8163, + "num_input_tokens_seen": 3664576, + "step": 6315 + }, + { + "epoch": 0.9413166517724159, + "grad_norm": 0.05126953125, + "learning_rate": 0.014117515638963359, + "loss": 0.8059, + "num_input_tokens_seen": 3667264, + "step": 6320 + }, + { + "epoch": 0.9420613643133751, + "grad_norm": 0.054931640625, + "learning_rate": 0.014128686327077748, + "loss": 0.7986, + "num_input_tokens_seen": 3670144, + "step": 6325 + }, + { + "epoch": 0.9428060768543343, + "grad_norm": 0.050048828125, + "learning_rate": 0.014139857015192136, + "loss": 0.8215, + "num_input_tokens_seen": 3672992, + "step": 6330 + }, + { + "epoch": 0.9435507893952935, + "grad_norm": 0.02392578125, + "learning_rate": 0.014151027703306524, + "loss": 0.8138, + "num_input_tokens_seen": 3675840, + "step": 6335 + }, + { + "epoch": 0.9442955019362526, + "grad_norm": 0.061279296875, + "learning_rate": 0.014162198391420912, + "loss": 0.7865, + "num_input_tokens_seen": 3678944, + "step": 6340 + }, + { + "epoch": 0.9450402144772118, + "grad_norm": 0.02099609375, + "learning_rate": 0.0141733690795353, + "loss": 0.8031, + "num_input_tokens_seen": 3681728, + "step": 6345 + }, + { + "epoch": 0.945784927018171, + "grad_norm": 0.046142578125, + "learning_rate": 0.014184539767649686, + "loss": 0.8163, + "num_input_tokens_seen": 3684672, + "step": 6350 + }, + { + "epoch": 0.9465296395591302, + "grad_norm": 0.059814453125, + "learning_rate": 0.014195710455764074, + "loss": 0.8092, + "num_input_tokens_seen": 3687296, + "step": 6355 + }, + { + "epoch": 0.9472743521000894, + "grad_norm": 0.046875, + "learning_rate": 0.014206881143878462, + "loss": 0.7923, + "num_input_tokens_seen": 3689856, + "step": 6360 + }, + { + "epoch": 0.9480190646410486, + "grad_norm": 0.04541015625, + "learning_rate": 0.01421805183199285, + "loss": 0.7985, + "num_input_tokens_seen": 3692704, + "step": 6365 + }, + { + "epoch": 0.9487637771820078, + "grad_norm": 0.0234375, + "learning_rate": 0.014229222520107238, + "loss": 0.7914, + "num_input_tokens_seen": 3695424, + "step": 6370 + }, + { + "epoch": 0.949508489722967, + "grad_norm": 0.04736328125, + "learning_rate": 0.014240393208221626, + "loss": 0.8087, + "num_input_tokens_seen": 3698592, + "step": 6375 + }, + { + "epoch": 0.9502532022639262, + "grad_norm": 0.0576171875, + "learning_rate": 0.014251563896336014, + "loss": 0.8106, + "num_input_tokens_seen": 3701568, + "step": 6380 + }, + { + "epoch": 0.9509979148048853, + "grad_norm": 0.038330078125, + "learning_rate": 0.014262734584450402, + "loss": 0.81, + "num_input_tokens_seen": 3704672, + "step": 6385 + }, + { + "epoch": 0.9517426273458445, + "grad_norm": 0.05712890625, + "learning_rate": 0.01427390527256479, + "loss": 0.7914, + "num_input_tokens_seen": 3707488, + "step": 6390 + }, + { + "epoch": 0.9524873398868037, + "grad_norm": 0.0546875, + "learning_rate": 0.014285075960679176, + "loss": 0.7909, + "num_input_tokens_seen": 3710528, + "step": 6395 + }, + { + "epoch": 0.9532320524277629, + "grad_norm": 0.0224609375, + "learning_rate": 0.014296246648793564, + "loss": 0.8176, + "num_input_tokens_seen": 3713664, + "step": 6400 + }, + { + "epoch": 0.9539767649687221, + "grad_norm": 0.044189453125, + "learning_rate": 0.014307417336907952, + "loss": 0.7824, + "num_input_tokens_seen": 3716704, + "step": 6405 + }, + { + "epoch": 0.9547214775096813, + "grad_norm": 0.04248046875, + "learning_rate": 0.01431858802502234, + "loss": 0.8177, + "num_input_tokens_seen": 3719456, + "step": 6410 + }, + { + "epoch": 0.9554661900506405, + "grad_norm": 0.039794921875, + "learning_rate": 0.014329758713136728, + "loss": 0.7949, + "num_input_tokens_seen": 3722400, + "step": 6415 + }, + { + "epoch": 0.9562109025915997, + "grad_norm": 0.0322265625, + "learning_rate": 0.014340929401251116, + "loss": 0.8079, + "num_input_tokens_seen": 3725024, + "step": 6420 + }, + { + "epoch": 0.9569556151325588, + "grad_norm": 0.043701171875, + "learning_rate": 0.014352100089365504, + "loss": 0.823, + "num_input_tokens_seen": 3727808, + "step": 6425 + }, + { + "epoch": 0.957700327673518, + "grad_norm": 0.031982421875, + "learning_rate": 0.014363270777479892, + "loss": 0.8397, + "num_input_tokens_seen": 3730432, + "step": 6430 + }, + { + "epoch": 0.9584450402144772, + "grad_norm": 0.0341796875, + "learning_rate": 0.01437444146559428, + "loss": 0.8023, + "num_input_tokens_seen": 3733088, + "step": 6435 + }, + { + "epoch": 0.9591897527554364, + "grad_norm": 0.036376953125, + "learning_rate": 0.014385612153708668, + "loss": 0.7973, + "num_input_tokens_seen": 3735840, + "step": 6440 + }, + { + "epoch": 0.9599344652963956, + "grad_norm": 0.031494140625, + "learning_rate": 0.014396782841823056, + "loss": 0.7992, + "num_input_tokens_seen": 3738464, + "step": 6445 + }, + { + "epoch": 0.9606791778373548, + "grad_norm": 0.03271484375, + "learning_rate": 0.014407953529937444, + "loss": 0.7993, + "num_input_tokens_seen": 3741280, + "step": 6450 + }, + { + "epoch": 0.961423890378314, + "grad_norm": 0.0537109375, + "learning_rate": 0.014419124218051832, + "loss": 0.7792, + "num_input_tokens_seen": 3744256, + "step": 6455 + }, + { + "epoch": 0.9621686029192732, + "grad_norm": 0.035400390625, + "learning_rate": 0.01443029490616622, + "loss": 0.8167, + "num_input_tokens_seen": 3747456, + "step": 6460 + }, + { + "epoch": 0.9629133154602324, + "grad_norm": 0.05224609375, + "learning_rate": 0.014441465594280608, + "loss": 0.787, + "num_input_tokens_seen": 3750240, + "step": 6465 + }, + { + "epoch": 0.9636580280011915, + "grad_norm": 0.0260009765625, + "learning_rate": 0.014452636282394996, + "loss": 0.8191, + "num_input_tokens_seen": 3752960, + "step": 6470 + }, + { + "epoch": 0.9644027405421507, + "grad_norm": 0.05517578125, + "learning_rate": 0.014463806970509384, + "loss": 0.7898, + "num_input_tokens_seen": 3755840, + "step": 6475 + }, + { + "epoch": 0.9651474530831099, + "grad_norm": 0.0400390625, + "learning_rate": 0.01447497765862377, + "loss": 0.8084, + "num_input_tokens_seen": 3758560, + "step": 6480 + }, + { + "epoch": 0.9658921656240691, + "grad_norm": 0.0238037109375, + "learning_rate": 0.014486148346738158, + "loss": 0.8083, + "num_input_tokens_seen": 3761376, + "step": 6485 + }, + { + "epoch": 0.9666368781650283, + "grad_norm": 0.046630859375, + "learning_rate": 0.014497319034852546, + "loss": 0.8182, + "num_input_tokens_seen": 3764128, + "step": 6490 + }, + { + "epoch": 0.9673815907059875, + "grad_norm": 0.03466796875, + "learning_rate": 0.014508489722966934, + "loss": 0.8054, + "num_input_tokens_seen": 3768128, + "step": 6495 + }, + { + "epoch": 0.9681263032469467, + "grad_norm": 0.03564453125, + "learning_rate": 0.014519660411081322, + "loss": 0.8145, + "num_input_tokens_seen": 3770976, + "step": 6500 + }, + { + "epoch": 0.9688710157879059, + "grad_norm": 0.050048828125, + "learning_rate": 0.01453083109919571, + "loss": 0.7892, + "num_input_tokens_seen": 3773888, + "step": 6505 + }, + { + "epoch": 0.9696157283288651, + "grad_norm": 0.03515625, + "learning_rate": 0.014542001787310098, + "loss": 0.7948, + "num_input_tokens_seen": 3777024, + "step": 6510 + }, + { + "epoch": 0.9703604408698242, + "grad_norm": 0.036865234375, + "learning_rate": 0.014553172475424486, + "loss": 0.8057, + "num_input_tokens_seen": 3779872, + "step": 6515 + }, + { + "epoch": 0.9711051534107834, + "grad_norm": 0.0322265625, + "learning_rate": 0.014564343163538874, + "loss": 0.804, + "num_input_tokens_seen": 3782656, + "step": 6520 + }, + { + "epoch": 0.9718498659517426, + "grad_norm": 0.037109375, + "learning_rate": 0.01457551385165326, + "loss": 0.8154, + "num_input_tokens_seen": 3785504, + "step": 6525 + }, + { + "epoch": 0.9725945784927018, + "grad_norm": 0.04736328125, + "learning_rate": 0.014586684539767648, + "loss": 0.7985, + "num_input_tokens_seen": 3788160, + "step": 6530 + }, + { + "epoch": 0.973339291033661, + "grad_norm": 0.056640625, + "learning_rate": 0.014597855227882036, + "loss": 0.812, + "num_input_tokens_seen": 3791424, + "step": 6535 + }, + { + "epoch": 0.9740840035746202, + "grad_norm": 0.020751953125, + "learning_rate": 0.014609025915996424, + "loss": 0.8049, + "num_input_tokens_seen": 3794400, + "step": 6540 + }, + { + "epoch": 0.9748287161155794, + "grad_norm": 0.0201416015625, + "learning_rate": 0.014620196604110812, + "loss": 0.8068, + "num_input_tokens_seen": 3797408, + "step": 6545 + }, + { + "epoch": 0.9755734286565386, + "grad_norm": 0.03466796875, + "learning_rate": 0.014631367292225202, + "loss": 0.8016, + "num_input_tokens_seen": 3800352, + "step": 6550 + }, + { + "epoch": 0.9763181411974977, + "grad_norm": 0.0184326171875, + "learning_rate": 0.01464253798033959, + "loss": 0.7978, + "num_input_tokens_seen": 3803328, + "step": 6555 + }, + { + "epoch": 0.9770628537384569, + "grad_norm": 0.038818359375, + "learning_rate": 0.014653708668453978, + "loss": 0.8172, + "num_input_tokens_seen": 3806176, + "step": 6560 + }, + { + "epoch": 0.9778075662794161, + "grad_norm": 0.03076171875, + "learning_rate": 0.014664879356568364, + "loss": 0.8204, + "num_input_tokens_seen": 3809088, + "step": 6565 + }, + { + "epoch": 0.9785522788203753, + "grad_norm": 0.0196533203125, + "learning_rate": 0.014676050044682752, + "loss": 0.799, + "num_input_tokens_seen": 3811680, + "step": 6570 + }, + { + "epoch": 0.9792969913613345, + "grad_norm": 0.0341796875, + "learning_rate": 0.01468722073279714, + "loss": 0.8039, + "num_input_tokens_seen": 3814592, + "step": 6575 + }, + { + "epoch": 0.9800417039022937, + "grad_norm": 0.0186767578125, + "learning_rate": 0.014698391420911528, + "loss": 0.7918, + "num_input_tokens_seen": 3817728, + "step": 6580 + }, + { + "epoch": 0.9807864164432529, + "grad_norm": 0.020263671875, + "learning_rate": 0.014709562109025916, + "loss": 0.8036, + "num_input_tokens_seen": 3820608, + "step": 6585 + }, + { + "epoch": 0.9815311289842121, + "grad_norm": 0.03369140625, + "learning_rate": 0.014720732797140304, + "loss": 0.7988, + "num_input_tokens_seen": 3823648, + "step": 6590 + }, + { + "epoch": 0.9822758415251713, + "grad_norm": 0.0223388671875, + "learning_rate": 0.014731903485254692, + "loss": 0.7981, + "num_input_tokens_seen": 3826624, + "step": 6595 + }, + { + "epoch": 0.9830205540661304, + "grad_norm": 0.039794921875, + "learning_rate": 0.01474307417336908, + "loss": 0.7964, + "num_input_tokens_seen": 3829632, + "step": 6600 + }, + { + "epoch": 0.9837652666070896, + "grad_norm": 0.07568359375, + "learning_rate": 0.014754244861483468, + "loss": 0.8394, + "num_input_tokens_seen": 3832384, + "step": 6605 + }, + { + "epoch": 0.9845099791480488, + "grad_norm": 0.0380859375, + "learning_rate": 0.014765415549597854, + "loss": 0.822, + "num_input_tokens_seen": 3835488, + "step": 6610 + }, + { + "epoch": 0.985254691689008, + "grad_norm": 0.0654296875, + "learning_rate": 0.014776586237712242, + "loss": 0.7932, + "num_input_tokens_seen": 3838560, + "step": 6615 + }, + { + "epoch": 0.9859994042299672, + "grad_norm": 0.037109375, + "learning_rate": 0.01478775692582663, + "loss": 0.7911, + "num_input_tokens_seen": 3841376, + "step": 6620 + }, + { + "epoch": 0.9867441167709264, + "grad_norm": 0.083984375, + "learning_rate": 0.014798927613941018, + "loss": 0.8141, + "num_input_tokens_seen": 3844192, + "step": 6625 + }, + { + "epoch": 0.9874888293118856, + "grad_norm": 0.04345703125, + "learning_rate": 0.014810098302055406, + "loss": 0.8215, + "num_input_tokens_seen": 3847424, + "step": 6630 + }, + { + "epoch": 0.9882335418528448, + "grad_norm": 0.0625, + "learning_rate": 0.014821268990169794, + "loss": 0.8119, + "num_input_tokens_seen": 3850048, + "step": 6635 + }, + { + "epoch": 0.988978254393804, + "grad_norm": 0.039794921875, + "learning_rate": 0.014832439678284182, + "loss": 0.8461, + "num_input_tokens_seen": 3852736, + "step": 6640 + }, + { + "epoch": 0.9897229669347631, + "grad_norm": 0.0478515625, + "learning_rate": 0.01484361036639857, + "loss": 0.8075, + "num_input_tokens_seen": 3855456, + "step": 6645 + }, + { + "epoch": 0.9904676794757223, + "grad_norm": 0.04248046875, + "learning_rate": 0.014854781054512958, + "loss": 0.8061, + "num_input_tokens_seen": 3858080, + "step": 6650 + }, + { + "epoch": 0.9912123920166815, + "grad_norm": 0.06640625, + "learning_rate": 0.014865951742627344, + "loss": 0.7976, + "num_input_tokens_seen": 3860864, + "step": 6655 + }, + { + "epoch": 0.9919571045576407, + "grad_norm": 0.04052734375, + "learning_rate": 0.014877122430741734, + "loss": 0.8055, + "num_input_tokens_seen": 3863840, + "step": 6660 + }, + { + "epoch": 0.9927018170986, + "grad_norm": 0.05712890625, + "learning_rate": 0.014888293118856121, + "loss": 0.7923, + "num_input_tokens_seen": 3866720, + "step": 6665 + }, + { + "epoch": 0.9934465296395592, + "grad_norm": 0.06640625, + "learning_rate": 0.01489946380697051, + "loss": 0.8172, + "num_input_tokens_seen": 3869504, + "step": 6670 + }, + { + "epoch": 0.9941912421805184, + "grad_norm": 0.045654296875, + "learning_rate": 0.014910634495084897, + "loss": 0.8171, + "num_input_tokens_seen": 3872608, + "step": 6675 + }, + { + "epoch": 0.9949359547214776, + "grad_norm": 0.0703125, + "learning_rate": 0.014921805183199285, + "loss": 0.8407, + "num_input_tokens_seen": 3875584, + "step": 6680 + }, + { + "epoch": 0.9956806672624366, + "grad_norm": 0.042236328125, + "learning_rate": 0.014932975871313673, + "loss": 0.8208, + "num_input_tokens_seen": 3878336, + "step": 6685 + }, + { + "epoch": 0.9964253798033958, + "grad_norm": 0.036376953125, + "learning_rate": 0.014944146559428061, + "loss": 0.806, + "num_input_tokens_seen": 3881600, + "step": 6690 + }, + { + "epoch": 0.997170092344355, + "grad_norm": 0.045654296875, + "learning_rate": 0.014955317247542448, + "loss": 0.822, + "num_input_tokens_seen": 3884160, + "step": 6695 + }, + { + "epoch": 0.9979148048853143, + "grad_norm": 0.041748046875, + "learning_rate": 0.014966487935656836, + "loss": 0.8004, + "num_input_tokens_seen": 3886944, + "step": 6700 + }, + { + "epoch": 0.9986595174262735, + "grad_norm": 0.0712890625, + "learning_rate": 0.014977658623771224, + "loss": 0.8172, + "num_input_tokens_seen": 3889856, + "step": 6705 + }, + { + "epoch": 0.9994042299672327, + "grad_norm": 0.04345703125, + "learning_rate": 0.014988829311885611, + "loss": 0.8045, + "num_input_tokens_seen": 3892704, + "step": 6710 + }, + { + "epoch": 1.0, + "eval_loss": 0.8024358153343201, + "eval_runtime": 70.6851, + "eval_samples_per_second": 42.215, + "eval_steps_per_second": 10.554, + "num_input_tokens_seen": 3894688, + "step": 6714 + }, + { + "epoch": 1.0001489425081918, + "grad_norm": 0.04833984375, + "learning_rate": 0.015, + "loss": 0.7923, + "num_input_tokens_seen": 3895200, + "step": 6715 + }, + { + "epoch": 1.000893655049151, + "grad_norm": 0.025634765625, + "learning_rate": 0.015011170688114387, + "loss": 0.8033, + "num_input_tokens_seen": 3897984, + "step": 6720 + }, + { + "epoch": 1.0016383675901102, + "grad_norm": 0.0262451171875, + "learning_rate": 0.015022341376228775, + "loss": 0.8, + "num_input_tokens_seen": 3901024, + "step": 6725 + }, + { + "epoch": 1.0023830801310694, + "grad_norm": 0.07080078125, + "learning_rate": 0.015033512064343163, + "loss": 0.8144, + "num_input_tokens_seen": 3904032, + "step": 6730 + }, + { + "epoch": 1.0031277926720286, + "grad_norm": 0.0537109375, + "learning_rate": 0.015044682752457551, + "loss": 0.7985, + "num_input_tokens_seen": 3906912, + "step": 6735 + }, + { + "epoch": 1.0038725052129878, + "grad_norm": 0.044677734375, + "learning_rate": 0.01505585344057194, + "loss": 0.8279, + "num_input_tokens_seen": 3909952, + "step": 6740 + }, + { + "epoch": 1.004617217753947, + "grad_norm": 0.04443359375, + "learning_rate": 0.015067024128686327, + "loss": 0.795, + "num_input_tokens_seen": 3912800, + "step": 6745 + }, + { + "epoch": 1.0053619302949062, + "grad_norm": 0.0274658203125, + "learning_rate": 0.015078194816800715, + "loss": 0.8183, + "num_input_tokens_seen": 3915584, + "step": 6750 + }, + { + "epoch": 1.0061066428358654, + "grad_norm": 0.03662109375, + "learning_rate": 0.015089365504915103, + "loss": 0.8189, + "num_input_tokens_seen": 3918432, + "step": 6755 + }, + { + "epoch": 1.0068513553768246, + "grad_norm": 0.041748046875, + "learning_rate": 0.015100536193029488, + "loss": 0.8139, + "num_input_tokens_seen": 3921216, + "step": 6760 + }, + { + "epoch": 1.0075960679177838, + "grad_norm": 0.050537109375, + "learning_rate": 0.015111706881143876, + "loss": 0.8178, + "num_input_tokens_seen": 3924256, + "step": 6765 + }, + { + "epoch": 1.008340780458743, + "grad_norm": 0.06396484375, + "learning_rate": 0.015122877569258265, + "loss": 0.8088, + "num_input_tokens_seen": 3926976, + "step": 6770 + }, + { + "epoch": 1.0090854929997022, + "grad_norm": 0.02490234375, + "learning_rate": 0.015134048257372653, + "loss": 0.7981, + "num_input_tokens_seen": 3929792, + "step": 6775 + }, + { + "epoch": 1.0098302055406614, + "grad_norm": 0.060546875, + "learning_rate": 0.015145218945487041, + "loss": 0.7922, + "num_input_tokens_seen": 3932768, + "step": 6780 + }, + { + "epoch": 1.0105749180816206, + "grad_norm": 0.04638671875, + "learning_rate": 0.01515638963360143, + "loss": 0.8037, + "num_input_tokens_seen": 3935456, + "step": 6785 + }, + { + "epoch": 1.0113196306225798, + "grad_norm": 0.06494140625, + "learning_rate": 0.015167560321715817, + "loss": 0.8015, + "num_input_tokens_seen": 3938208, + "step": 6790 + }, + { + "epoch": 1.0120643431635388, + "grad_norm": 0.05322265625, + "learning_rate": 0.015178731009830205, + "loss": 0.8209, + "num_input_tokens_seen": 3941088, + "step": 6795 + }, + { + "epoch": 1.012809055704498, + "grad_norm": 0.041015625, + "learning_rate": 0.015189901697944593, + "loss": 0.8025, + "num_input_tokens_seen": 3943904, + "step": 6800 + }, + { + "epoch": 1.0135537682454572, + "grad_norm": 0.050048828125, + "learning_rate": 0.015201072386058981, + "loss": 0.7822, + "num_input_tokens_seen": 3946752, + "step": 6805 + }, + { + "epoch": 1.0142984807864164, + "grad_norm": 0.05322265625, + "learning_rate": 0.015212243074173369, + "loss": 0.7825, + "num_input_tokens_seen": 3949632, + "step": 6810 + }, + { + "epoch": 1.0150431933273756, + "grad_norm": 0.054931640625, + "learning_rate": 0.015223413762287757, + "loss": 0.8204, + "num_input_tokens_seen": 3952864, + "step": 6815 + }, + { + "epoch": 1.0157879058683348, + "grad_norm": 0.058837890625, + "learning_rate": 0.015234584450402145, + "loss": 0.7648, + "num_input_tokens_seen": 3956032, + "step": 6820 + }, + { + "epoch": 1.016532618409294, + "grad_norm": 0.0400390625, + "learning_rate": 0.015245755138516533, + "loss": 0.7856, + "num_input_tokens_seen": 3958944, + "step": 6825 + }, + { + "epoch": 1.0172773309502532, + "grad_norm": 0.05517578125, + "learning_rate": 0.015256925826630921, + "loss": 0.8415, + "num_input_tokens_seen": 3961984, + "step": 6830 + }, + { + "epoch": 1.0180220434912124, + "grad_norm": 0.049072265625, + "learning_rate": 0.015268096514745309, + "loss": 0.8105, + "num_input_tokens_seen": 3964768, + "step": 6835 + }, + { + "epoch": 1.0187667560321716, + "grad_norm": 0.036865234375, + "learning_rate": 0.015279267202859697, + "loss": 0.809, + "num_input_tokens_seen": 3967584, + "step": 6840 + }, + { + "epoch": 1.0195114685731308, + "grad_norm": 0.060302734375, + "learning_rate": 0.015290437890974081, + "loss": 0.8168, + "num_input_tokens_seen": 3970336, + "step": 6845 + }, + { + "epoch": 1.02025618111409, + "grad_norm": 0.03466796875, + "learning_rate": 0.01530160857908847, + "loss": 0.7899, + "num_input_tokens_seen": 3972896, + "step": 6850 + }, + { + "epoch": 1.0210008936550492, + "grad_norm": 0.0220947265625, + "learning_rate": 0.015312779267202857, + "loss": 0.818, + "num_input_tokens_seen": 3975808, + "step": 6855 + }, + { + "epoch": 1.0217456061960084, + "grad_norm": 0.033447265625, + "learning_rate": 0.015323949955317245, + "loss": 0.8074, + "num_input_tokens_seen": 3978688, + "step": 6860 + }, + { + "epoch": 1.0224903187369676, + "grad_norm": 0.0400390625, + "learning_rate": 0.015335120643431633, + "loss": 0.8066, + "num_input_tokens_seen": 3981408, + "step": 6865 + }, + { + "epoch": 1.0232350312779268, + "grad_norm": 0.0380859375, + "learning_rate": 0.015346291331546021, + "loss": 0.823, + "num_input_tokens_seen": 3984384, + "step": 6870 + }, + { + "epoch": 1.023979743818886, + "grad_norm": 0.04248046875, + "learning_rate": 0.01535746201966041, + "loss": 0.813, + "num_input_tokens_seen": 3987040, + "step": 6875 + }, + { + "epoch": 1.024724456359845, + "grad_norm": 0.032958984375, + "learning_rate": 0.015368632707774797, + "loss": 0.7965, + "num_input_tokens_seen": 3990016, + "step": 6880 + }, + { + "epoch": 1.0254691689008042, + "grad_norm": 0.04736328125, + "learning_rate": 0.015379803395889187, + "loss": 0.7969, + "num_input_tokens_seen": 3993248, + "step": 6885 + }, + { + "epoch": 1.0262138814417634, + "grad_norm": 0.054443359375, + "learning_rate": 0.015390974084003575, + "loss": 0.8033, + "num_input_tokens_seen": 3996256, + "step": 6890 + }, + { + "epoch": 1.0269585939827226, + "grad_norm": 0.05517578125, + "learning_rate": 0.015402144772117963, + "loss": 0.8123, + "num_input_tokens_seen": 3999360, + "step": 6895 + }, + { + "epoch": 1.0277033065236818, + "grad_norm": 0.057373046875, + "learning_rate": 0.01541331546023235, + "loss": 0.8015, + "num_input_tokens_seen": 4002400, + "step": 6900 + }, + { + "epoch": 1.028448019064641, + "grad_norm": 0.0673828125, + "learning_rate": 0.015424486148346739, + "loss": 0.8289, + "num_input_tokens_seen": 4005664, + "step": 6905 + }, + { + "epoch": 1.0291927316056002, + "grad_norm": 0.0419921875, + "learning_rate": 0.015435656836461127, + "loss": 0.8076, + "num_input_tokens_seen": 4008736, + "step": 6910 + }, + { + "epoch": 1.0299374441465594, + "grad_norm": 0.0615234375, + "learning_rate": 0.015446827524575515, + "loss": 0.8024, + "num_input_tokens_seen": 4011616, + "step": 6915 + }, + { + "epoch": 1.0306821566875186, + "grad_norm": 0.056396484375, + "learning_rate": 0.015457998212689903, + "loss": 0.8047, + "num_input_tokens_seen": 4014272, + "step": 6920 + }, + { + "epoch": 1.0314268692284778, + "grad_norm": 0.021240234375, + "learning_rate": 0.01546916890080429, + "loss": 0.7993, + "num_input_tokens_seen": 4017248, + "step": 6925 + }, + { + "epoch": 1.032171581769437, + "grad_norm": 0.055419921875, + "learning_rate": 0.015480339588918679, + "loss": 0.8188, + "num_input_tokens_seen": 4019680, + "step": 6930 + }, + { + "epoch": 1.0329162943103962, + "grad_norm": 0.024169921875, + "learning_rate": 0.015491510277033063, + "loss": 0.8175, + "num_input_tokens_seen": 4022336, + "step": 6935 + }, + { + "epoch": 1.0336610068513554, + "grad_norm": 0.037841796875, + "learning_rate": 0.015502680965147451, + "loss": 0.8063, + "num_input_tokens_seen": 4025216, + "step": 6940 + }, + { + "epoch": 1.0344057193923146, + "grad_norm": 0.037841796875, + "learning_rate": 0.015513851653261839, + "loss": 0.7984, + "num_input_tokens_seen": 4028064, + "step": 6945 + }, + { + "epoch": 1.0351504319332738, + "grad_norm": 0.035888671875, + "learning_rate": 0.015525022341376227, + "loss": 0.7955, + "num_input_tokens_seen": 4030912, + "step": 6950 + }, + { + "epoch": 1.035895144474233, + "grad_norm": 0.05712890625, + "learning_rate": 0.015536193029490615, + "loss": 0.7933, + "num_input_tokens_seen": 4033952, + "step": 6955 + }, + { + "epoch": 1.0366398570151922, + "grad_norm": 0.0277099609375, + "learning_rate": 0.015547363717605003, + "loss": 0.8143, + "num_input_tokens_seen": 4036544, + "step": 6960 + }, + { + "epoch": 1.0373845695561514, + "grad_norm": 0.03955078125, + "learning_rate": 0.015558534405719391, + "loss": 0.7787, + "num_input_tokens_seen": 4039296, + "step": 6965 + }, + { + "epoch": 1.0381292820971104, + "grad_norm": 0.033935546875, + "learning_rate": 0.015569705093833779, + "loss": 0.8042, + "num_input_tokens_seen": 4042400, + "step": 6970 + }, + { + "epoch": 1.0388739946380696, + "grad_norm": 0.027099609375, + "learning_rate": 0.015580875781948167, + "loss": 0.8285, + "num_input_tokens_seen": 4045152, + "step": 6975 + }, + { + "epoch": 1.0396187071790288, + "grad_norm": 0.0322265625, + "learning_rate": 0.015592046470062555, + "loss": 0.7893, + "num_input_tokens_seen": 4048032, + "step": 6980 + }, + { + "epoch": 1.040363419719988, + "grad_norm": 0.047607421875, + "learning_rate": 0.015603217158176943, + "loss": 0.8091, + "num_input_tokens_seen": 4050752, + "step": 6985 + }, + { + "epoch": 1.0411081322609472, + "grad_norm": 0.04736328125, + "learning_rate": 0.01561438784629133, + "loss": 0.8025, + "num_input_tokens_seen": 4053600, + "step": 6990 + }, + { + "epoch": 1.0418528448019064, + "grad_norm": 0.054931640625, + "learning_rate": 0.01562555853440572, + "loss": 0.798, + "num_input_tokens_seen": 4056608, + "step": 6995 + }, + { + "epoch": 1.0425975573428656, + "grad_norm": 0.0654296875, + "learning_rate": 0.01563672922252011, + "loss": 0.8131, + "num_input_tokens_seen": 4059616, + "step": 7000 + }, + { + "epoch": 1.0433422698838248, + "grad_norm": 0.021728515625, + "learning_rate": 0.015647899910634495, + "loss": 0.8044, + "num_input_tokens_seen": 4062688, + "step": 7005 + }, + { + "epoch": 1.044086982424784, + "grad_norm": 0.0537109375, + "learning_rate": 0.015659070598748884, + "loss": 0.8073, + "num_input_tokens_seen": 4065408, + "step": 7010 + }, + { + "epoch": 1.0448316949657432, + "grad_norm": 0.037353515625, + "learning_rate": 0.01567024128686327, + "loss": 0.7934, + "num_input_tokens_seen": 4068288, + "step": 7015 + }, + { + "epoch": 1.0455764075067024, + "grad_norm": 0.034423828125, + "learning_rate": 0.015681411974977657, + "loss": 0.8063, + "num_input_tokens_seen": 4070944, + "step": 7020 + }, + { + "epoch": 1.0463211200476616, + "grad_norm": 0.033447265625, + "learning_rate": 0.015692582663092043, + "loss": 0.7997, + "num_input_tokens_seen": 4073696, + "step": 7025 + }, + { + "epoch": 1.0470658325886208, + "grad_norm": 0.033203125, + "learning_rate": 0.015703753351206433, + "loss": 0.795, + "num_input_tokens_seen": 4076576, + "step": 7030 + }, + { + "epoch": 1.04781054512958, + "grad_norm": 0.068359375, + "learning_rate": 0.01571492403932082, + "loss": 0.786, + "num_input_tokens_seen": 4079136, + "step": 7035 + }, + { + "epoch": 1.0485552576705393, + "grad_norm": 0.050048828125, + "learning_rate": 0.01572609472743521, + "loss": 0.818, + "num_input_tokens_seen": 4082208, + "step": 7040 + }, + { + "epoch": 1.0492999702114985, + "grad_norm": 0.050048828125, + "learning_rate": 0.015737265415549595, + "loss": 0.8051, + "num_input_tokens_seen": 4085120, + "step": 7045 + }, + { + "epoch": 1.0500446827524577, + "grad_norm": 0.02001953125, + "learning_rate": 0.015748436103663985, + "loss": 0.8095, + "num_input_tokens_seen": 4087712, + "step": 7050 + }, + { + "epoch": 1.0507893952934166, + "grad_norm": 0.043212890625, + "learning_rate": 0.015759606791778374, + "loss": 0.8034, + "num_input_tokens_seen": 4090464, + "step": 7055 + }, + { + "epoch": 1.0515341078343758, + "grad_norm": 0.0177001953125, + "learning_rate": 0.01577077747989276, + "loss": 0.8131, + "num_input_tokens_seen": 4093184, + "step": 7060 + }, + { + "epoch": 1.052278820375335, + "grad_norm": 0.055908203125, + "learning_rate": 0.01578194816800715, + "loss": 0.8109, + "num_input_tokens_seen": 4095968, + "step": 7065 + }, + { + "epoch": 1.0530235329162942, + "grad_norm": 0.0218505859375, + "learning_rate": 0.015793118856121537, + "loss": 0.787, + "num_input_tokens_seen": 4098912, + "step": 7070 + }, + { + "epoch": 1.0537682454572534, + "grad_norm": 0.0341796875, + "learning_rate": 0.015804289544235926, + "loss": 0.7734, + "num_input_tokens_seen": 4101696, + "step": 7075 + }, + { + "epoch": 1.0545129579982127, + "grad_norm": 0.052001953125, + "learning_rate": 0.015815460232350313, + "loss": 0.8148, + "num_input_tokens_seen": 4104736, + "step": 7080 + }, + { + "epoch": 1.0552576705391719, + "grad_norm": 0.05078125, + "learning_rate": 0.015826630920464702, + "loss": 0.8221, + "num_input_tokens_seen": 4107776, + "step": 7085 + }, + { + "epoch": 1.056002383080131, + "grad_norm": 0.033935546875, + "learning_rate": 0.01583780160857909, + "loss": 0.8062, + "num_input_tokens_seen": 4110592, + "step": 7090 + }, + { + "epoch": 1.0567470956210903, + "grad_norm": 0.0223388671875, + "learning_rate": 0.015848972296693478, + "loss": 0.8328, + "num_input_tokens_seen": 4113568, + "step": 7095 + }, + { + "epoch": 1.0574918081620495, + "grad_norm": 0.05615234375, + "learning_rate": 0.015860142984807864, + "loss": 0.7884, + "num_input_tokens_seen": 4116608, + "step": 7100 + }, + { + "epoch": 1.0582365207030087, + "grad_norm": 0.031494140625, + "learning_rate": 0.01587131367292225, + "loss": 0.8045, + "num_input_tokens_seen": 4119712, + "step": 7105 + }, + { + "epoch": 1.0589812332439679, + "grad_norm": 0.037841796875, + "learning_rate": 0.015882484361036637, + "loss": 0.7895, + "num_input_tokens_seen": 4122336, + "step": 7110 + }, + { + "epoch": 1.059725945784927, + "grad_norm": 0.0693359375, + "learning_rate": 0.015893655049151027, + "loss": 0.8097, + "num_input_tokens_seen": 4125088, + "step": 7115 + }, + { + "epoch": 1.0604706583258863, + "grad_norm": 0.0498046875, + "learning_rate": 0.015904825737265413, + "loss": 0.7902, + "num_input_tokens_seen": 4128448, + "step": 7120 + }, + { + "epoch": 1.0612153708668455, + "grad_norm": 0.035400390625, + "learning_rate": 0.015915996425379803, + "loss": 0.8035, + "num_input_tokens_seen": 4131520, + "step": 7125 + }, + { + "epoch": 1.0619600834078047, + "grad_norm": 0.048095703125, + "learning_rate": 0.01592716711349419, + "loss": 0.8241, + "num_input_tokens_seen": 4134944, + "step": 7130 + }, + { + "epoch": 1.0627047959487639, + "grad_norm": 0.0206298828125, + "learning_rate": 0.01593833780160858, + "loss": 0.8201, + "num_input_tokens_seen": 4138016, + "step": 7135 + }, + { + "epoch": 1.063449508489723, + "grad_norm": 0.04052734375, + "learning_rate": 0.015949508489722965, + "loss": 0.8077, + "num_input_tokens_seen": 4140896, + "step": 7140 + }, + { + "epoch": 1.064194221030682, + "grad_norm": 0.0517578125, + "learning_rate": 0.015960679177837354, + "loss": 0.7862, + "num_input_tokens_seen": 4143584, + "step": 7145 + }, + { + "epoch": 1.0649389335716413, + "grad_norm": 0.03759765625, + "learning_rate": 0.01597184986595174, + "loss": 0.7973, + "num_input_tokens_seen": 4146528, + "step": 7150 + }, + { + "epoch": 1.0656836461126005, + "grad_norm": 0.0223388671875, + "learning_rate": 0.01598302055406613, + "loss": 0.8095, + "num_input_tokens_seen": 4149536, + "step": 7155 + }, + { + "epoch": 1.0664283586535597, + "grad_norm": 0.02099609375, + "learning_rate": 0.015994191242180517, + "loss": 0.8007, + "num_input_tokens_seen": 4152384, + "step": 7160 + }, + { + "epoch": 1.0671730711945189, + "grad_norm": 0.034912109375, + "learning_rate": 0.016005361930294906, + "loss": 0.7997, + "num_input_tokens_seen": 4155200, + "step": 7165 + }, + { + "epoch": 1.067917783735478, + "grad_norm": 0.033203125, + "learning_rate": 0.016016532618409296, + "loss": 0.7964, + "num_input_tokens_seen": 4158112, + "step": 7170 + }, + { + "epoch": 1.0686624962764373, + "grad_norm": 0.040283203125, + "learning_rate": 0.016027703306523682, + "loss": 0.7972, + "num_input_tokens_seen": 4160768, + "step": 7175 + }, + { + "epoch": 1.0694072088173965, + "grad_norm": 0.0220947265625, + "learning_rate": 0.016038873994638072, + "loss": 0.8107, + "num_input_tokens_seen": 4163872, + "step": 7180 + }, + { + "epoch": 1.0701519213583557, + "grad_norm": 0.05810546875, + "learning_rate": 0.016050044682752458, + "loss": 0.8276, + "num_input_tokens_seen": 4166816, + "step": 7185 + }, + { + "epoch": 1.0708966338993149, + "grad_norm": 0.047119140625, + "learning_rate": 0.016061215370866848, + "loss": 0.7928, + "num_input_tokens_seen": 4169568, + "step": 7190 + }, + { + "epoch": 1.071641346440274, + "grad_norm": 0.0390625, + "learning_rate": 0.01607238605898123, + "loss": 0.8006, + "num_input_tokens_seen": 4172480, + "step": 7195 + }, + { + "epoch": 1.0723860589812333, + "grad_norm": 0.040771484375, + "learning_rate": 0.01608355674709562, + "loss": 0.8227, + "num_input_tokens_seen": 4175744, + "step": 7200 + }, + { + "epoch": 1.0731307715221925, + "grad_norm": 0.0206298828125, + "learning_rate": 0.016094727435210007, + "loss": 0.7819, + "num_input_tokens_seen": 4178528, + "step": 7205 + }, + { + "epoch": 1.0738754840631517, + "grad_norm": 0.020751953125, + "learning_rate": 0.016105898123324396, + "loss": 0.8053, + "num_input_tokens_seen": 4181728, + "step": 7210 + }, + { + "epoch": 1.074620196604111, + "grad_norm": 0.031982421875, + "learning_rate": 0.016117068811438783, + "loss": 0.7897, + "num_input_tokens_seen": 4184320, + "step": 7215 + }, + { + "epoch": 1.07536490914507, + "grad_norm": 0.021728515625, + "learning_rate": 0.016128239499553172, + "loss": 0.7937, + "num_input_tokens_seen": 4187040, + "step": 7220 + }, + { + "epoch": 1.076109621686029, + "grad_norm": 0.0439453125, + "learning_rate": 0.01613941018766756, + "loss": 0.8075, + "num_input_tokens_seen": 4190144, + "step": 7225 + }, + { + "epoch": 1.0768543342269883, + "grad_norm": 0.04150390625, + "learning_rate": 0.016150580875781948, + "loss": 0.8057, + "num_input_tokens_seen": 4193152, + "step": 7230 + }, + { + "epoch": 1.0775990467679475, + "grad_norm": 0.032958984375, + "learning_rate": 0.016161751563896334, + "loss": 0.7825, + "num_input_tokens_seen": 4196000, + "step": 7235 + }, + { + "epoch": 1.0783437593089067, + "grad_norm": 0.0223388671875, + "learning_rate": 0.016172922252010724, + "loss": 0.8082, + "num_input_tokens_seen": 4198880, + "step": 7240 + }, + { + "epoch": 1.079088471849866, + "grad_norm": 0.02587890625, + "learning_rate": 0.01618409294012511, + "loss": 0.7915, + "num_input_tokens_seen": 4201664, + "step": 7245 + }, + { + "epoch": 1.079833184390825, + "grad_norm": 0.0458984375, + "learning_rate": 0.0161952636282395, + "loss": 0.8199, + "num_input_tokens_seen": 4204672, + "step": 7250 + }, + { + "epoch": 1.0805778969317843, + "grad_norm": 0.0419921875, + "learning_rate": 0.016206434316353886, + "loss": 0.8064, + "num_input_tokens_seen": 4207552, + "step": 7255 + }, + { + "epoch": 1.0813226094727435, + "grad_norm": 0.0634765625, + "learning_rate": 0.016217605004468276, + "loss": 0.8127, + "num_input_tokens_seen": 4210336, + "step": 7260 + }, + { + "epoch": 1.0820673220137027, + "grad_norm": 0.0308837890625, + "learning_rate": 0.016228775692582662, + "loss": 0.7895, + "num_input_tokens_seen": 4212992, + "step": 7265 + }, + { + "epoch": 1.082812034554662, + "grad_norm": 0.04541015625, + "learning_rate": 0.016239946380697052, + "loss": 0.8101, + "num_input_tokens_seen": 4215712, + "step": 7270 + }, + { + "epoch": 1.083556747095621, + "grad_norm": 0.032470703125, + "learning_rate": 0.016251117068811438, + "loss": 0.7959, + "num_input_tokens_seen": 4219072, + "step": 7275 + }, + { + "epoch": 1.0843014596365803, + "grad_norm": 0.0235595703125, + "learning_rate": 0.016262287756925824, + "loss": 0.8017, + "num_input_tokens_seen": 4221856, + "step": 7280 + }, + { + "epoch": 1.0850461721775395, + "grad_norm": 0.042236328125, + "learning_rate": 0.016273458445040214, + "loss": 0.8312, + "num_input_tokens_seen": 4224896, + "step": 7285 + }, + { + "epoch": 1.0857908847184987, + "grad_norm": 0.039794921875, + "learning_rate": 0.0162846291331546, + "loss": 0.8059, + "num_input_tokens_seen": 4227680, + "step": 7290 + }, + { + "epoch": 1.086535597259458, + "grad_norm": 0.09130859375, + "learning_rate": 0.01629579982126899, + "loss": 0.7765, + "num_input_tokens_seen": 4230432, + "step": 7295 + }, + { + "epoch": 1.0872803098004171, + "grad_norm": 1.3046875, + "learning_rate": 0.016306970509383376, + "loss": 0.9055, + "num_input_tokens_seen": 4233632, + "step": 7300 + }, + { + "epoch": 1.0880250223413763, + "grad_norm": 0.09716796875, + "learning_rate": 0.016318141197497766, + "loss": 0.8277, + "num_input_tokens_seen": 4237024, + "step": 7305 + }, + { + "epoch": 1.0887697348823355, + "grad_norm": 0.0791015625, + "learning_rate": 0.016329311885612152, + "loss": 0.808, + "num_input_tokens_seen": 4240320, + "step": 7310 + }, + { + "epoch": 1.0895144474232945, + "grad_norm": 0.0947265625, + "learning_rate": 0.016340482573726542, + "loss": 0.7882, + "num_input_tokens_seen": 4242976, + "step": 7315 + }, + { + "epoch": 1.0902591599642537, + "grad_norm": 0.0771484375, + "learning_rate": 0.016351653261840928, + "loss": 0.7969, + "num_input_tokens_seen": 4245792, + "step": 7320 + }, + { + "epoch": 1.091003872505213, + "grad_norm": 0.056640625, + "learning_rate": 0.016362823949955318, + "loss": 0.7941, + "num_input_tokens_seen": 4248608, + "step": 7325 + }, + { + "epoch": 1.0917485850461721, + "grad_norm": 0.1162109375, + "learning_rate": 0.016373994638069704, + "loss": 0.7962, + "num_input_tokens_seen": 4251616, + "step": 7330 + }, + { + "epoch": 1.0924932975871313, + "grad_norm": 0.150390625, + "learning_rate": 0.016385165326184094, + "loss": 0.8012, + "num_input_tokens_seen": 4254720, + "step": 7335 + }, + { + "epoch": 1.0932380101280905, + "grad_norm": 0.0400390625, + "learning_rate": 0.01639633601429848, + "loss": 0.8109, + "num_input_tokens_seen": 4257568, + "step": 7340 + }, + { + "epoch": 1.0939827226690497, + "grad_norm": 0.09326171875, + "learning_rate": 0.01640750670241287, + "loss": 0.822, + "num_input_tokens_seen": 4260384, + "step": 7345 + }, + { + "epoch": 1.094727435210009, + "grad_norm": 0.04833984375, + "learning_rate": 0.016418677390527256, + "loss": 0.8082, + "num_input_tokens_seen": 4263616, + "step": 7350 + }, + { + "epoch": 1.0954721477509681, + "grad_norm": 0.0380859375, + "learning_rate": 0.016429848078641646, + "loss": 0.8062, + "num_input_tokens_seen": 4266368, + "step": 7355 + }, + { + "epoch": 1.0962168602919273, + "grad_norm": 0.044189453125, + "learning_rate": 0.016441018766756032, + "loss": 0.8016, + "num_input_tokens_seen": 4269120, + "step": 7360 + }, + { + "epoch": 1.0969615728328865, + "grad_norm": 0.07421875, + "learning_rate": 0.016452189454870418, + "loss": 0.8172, + "num_input_tokens_seen": 4272064, + "step": 7365 + }, + { + "epoch": 1.0977062853738457, + "grad_norm": 0.08251953125, + "learning_rate": 0.016463360142984804, + "loss": 0.7935, + "num_input_tokens_seen": 4275072, + "step": 7370 + }, + { + "epoch": 1.098450997914805, + "grad_norm": 0.049072265625, + "learning_rate": 0.016474530831099194, + "loss": 0.8096, + "num_input_tokens_seen": 4278240, + "step": 7375 + }, + { + "epoch": 1.0991957104557641, + "grad_norm": 0.04345703125, + "learning_rate": 0.01648570151921358, + "loss": 0.7946, + "num_input_tokens_seen": 4281184, + "step": 7380 + }, + { + "epoch": 1.0999404229967233, + "grad_norm": 0.05517578125, + "learning_rate": 0.01649687220732797, + "loss": 0.8011, + "num_input_tokens_seen": 4284032, + "step": 7385 + }, + { + "epoch": 1.1006851355376825, + "grad_norm": 0.08154296875, + "learning_rate": 0.01650804289544236, + "loss": 0.7974, + "num_input_tokens_seen": 4286976, + "step": 7390 + }, + { + "epoch": 1.1014298480786415, + "grad_norm": 0.07958984375, + "learning_rate": 0.016519213583556746, + "loss": 0.7994, + "num_input_tokens_seen": 4289920, + "step": 7395 + }, + { + "epoch": 1.1021745606196007, + "grad_norm": 0.189453125, + "learning_rate": 0.016530384271671136, + "loss": 0.8047, + "num_input_tokens_seen": 4292768, + "step": 7400 + }, + { + "epoch": 1.10291927316056, + "grad_norm": 0.0712890625, + "learning_rate": 0.016541554959785522, + "loss": 0.753, + "num_input_tokens_seen": 4295840, + "step": 7405 + }, + { + "epoch": 1.1036639857015191, + "grad_norm": 0.1328125, + "learning_rate": 0.01655272564789991, + "loss": 0.7888, + "num_input_tokens_seen": 4298976, + "step": 7410 + }, + { + "epoch": 1.1044086982424783, + "grad_norm": 0.265625, + "learning_rate": 0.016563896336014298, + "loss": 0.7777, + "num_input_tokens_seen": 4302016, + "step": 7415 + }, + { + "epoch": 1.1051534107834375, + "grad_norm": 0.076171875, + "learning_rate": 0.016575067024128688, + "loss": 0.8144, + "num_input_tokens_seen": 4305216, + "step": 7420 + }, + { + "epoch": 1.1058981233243967, + "grad_norm": 0.0279541015625, + "learning_rate": 0.016586237712243074, + "loss": 0.8113, + "num_input_tokens_seen": 4308032, + "step": 7425 + }, + { + "epoch": 1.106642835865356, + "grad_norm": 0.020751953125, + "learning_rate": 0.016597408400357463, + "loss": 0.8295, + "num_input_tokens_seen": 4310752, + "step": 7430 + }, + { + "epoch": 1.1073875484063151, + "grad_norm": 0.0194091796875, + "learning_rate": 0.01660857908847185, + "loss": 0.8187, + "num_input_tokens_seen": 4313344, + "step": 7435 + }, + { + "epoch": 1.1081322609472744, + "grad_norm": 0.04931640625, + "learning_rate": 0.01661974977658624, + "loss": 0.8004, + "num_input_tokens_seen": 4316000, + "step": 7440 + }, + { + "epoch": 1.1088769734882336, + "grad_norm": 0.057861328125, + "learning_rate": 0.016630920464700626, + "loss": 0.8225, + "num_input_tokens_seen": 4318688, + "step": 7445 + }, + { + "epoch": 1.1096216860291928, + "grad_norm": 0.0380859375, + "learning_rate": 0.016642091152815015, + "loss": 0.7985, + "num_input_tokens_seen": 4321440, + "step": 7450 + }, + { + "epoch": 1.110366398570152, + "grad_norm": 0.03271484375, + "learning_rate": 0.016653261840929398, + "loss": 0.8119, + "num_input_tokens_seen": 4324224, + "step": 7455 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.044189453125, + "learning_rate": 0.016664432529043788, + "loss": 0.8032, + "num_input_tokens_seen": 4327008, + "step": 7460 + }, + { + "epoch": 1.1118558236520704, + "grad_norm": 0.034912109375, + "learning_rate": 0.016675603217158174, + "loss": 0.795, + "num_input_tokens_seen": 4330112, + "step": 7465 + }, + { + "epoch": 1.1126005361930296, + "grad_norm": 0.0556640625, + "learning_rate": 0.016686773905272564, + "loss": 0.781, + "num_input_tokens_seen": 4332992, + "step": 7470 + }, + { + "epoch": 1.1133452487339888, + "grad_norm": 0.0634765625, + "learning_rate": 0.01669794459338695, + "loss": 0.8071, + "num_input_tokens_seen": 4336128, + "step": 7475 + }, + { + "epoch": 1.114089961274948, + "grad_norm": 0.068359375, + "learning_rate": 0.01670911528150134, + "loss": 0.8279, + "num_input_tokens_seen": 4339040, + "step": 7480 + }, + { + "epoch": 1.1148346738159072, + "grad_norm": 0.068359375, + "learning_rate": 0.016720285969615726, + "loss": 0.8166, + "num_input_tokens_seen": 4341920, + "step": 7485 + }, + { + "epoch": 1.1155793863568662, + "grad_norm": 0.043212890625, + "learning_rate": 0.016731456657730116, + "loss": 0.8095, + "num_input_tokens_seen": 4344960, + "step": 7490 + }, + { + "epoch": 1.1163240988978254, + "grad_norm": 0.038330078125, + "learning_rate": 0.016742627345844502, + "loss": 0.8099, + "num_input_tokens_seen": 4347936, + "step": 7495 + }, + { + "epoch": 1.1170688114387846, + "grad_norm": 0.0361328125, + "learning_rate": 0.01675379803395889, + "loss": 0.8128, + "num_input_tokens_seen": 4350688, + "step": 7500 + }, + { + "epoch": 1.1178135239797438, + "grad_norm": 0.020751953125, + "learning_rate": 0.01676496872207328, + "loss": 0.8191, + "num_input_tokens_seen": 4353664, + "step": 7505 + }, + { + "epoch": 1.118558236520703, + "grad_norm": 0.05322265625, + "learning_rate": 0.016776139410187667, + "loss": 0.8198, + "num_input_tokens_seen": 4356736, + "step": 7510 + }, + { + "epoch": 1.1193029490616622, + "grad_norm": 0.043212890625, + "learning_rate": 0.016787310098302057, + "loss": 0.8036, + "num_input_tokens_seen": 4359808, + "step": 7515 + }, + { + "epoch": 1.1200476616026214, + "grad_norm": 0.04150390625, + "learning_rate": 0.016798480786416443, + "loss": 0.8121, + "num_input_tokens_seen": 4362752, + "step": 7520 + }, + { + "epoch": 1.1207923741435806, + "grad_norm": 0.05859375, + "learning_rate": 0.016809651474530833, + "loss": 0.8064, + "num_input_tokens_seen": 4365632, + "step": 7525 + }, + { + "epoch": 1.1215370866845398, + "grad_norm": 0.041259765625, + "learning_rate": 0.01682082216264522, + "loss": 0.82, + "num_input_tokens_seen": 4368512, + "step": 7530 + }, + { + "epoch": 1.122281799225499, + "grad_norm": 0.0458984375, + "learning_rate": 0.01683199285075961, + "loss": 0.8132, + "num_input_tokens_seen": 4371584, + "step": 7535 + }, + { + "epoch": 1.1230265117664582, + "grad_norm": 0.046875, + "learning_rate": 0.016843163538873992, + "loss": 0.8021, + "num_input_tokens_seen": 4374336, + "step": 7540 + }, + { + "epoch": 1.1237712243074174, + "grad_norm": 0.02001953125, + "learning_rate": 0.01685433422698838, + "loss": 0.8075, + "num_input_tokens_seen": 4377312, + "step": 7545 + }, + { + "epoch": 1.1245159368483766, + "grad_norm": 0.037109375, + "learning_rate": 0.016865504915102768, + "loss": 0.8063, + "num_input_tokens_seen": 4380064, + "step": 7550 + }, + { + "epoch": 1.1252606493893358, + "grad_norm": 0.056884765625, + "learning_rate": 0.016876675603217157, + "loss": 0.7939, + "num_input_tokens_seen": 4382816, + "step": 7555 + }, + { + "epoch": 1.126005361930295, + "grad_norm": 0.036865234375, + "learning_rate": 0.016887846291331544, + "loss": 0.7972, + "num_input_tokens_seen": 4385568, + "step": 7560 + }, + { + "epoch": 1.1267500744712542, + "grad_norm": 0.037841796875, + "learning_rate": 0.016899016979445933, + "loss": 0.8083, + "num_input_tokens_seen": 4388608, + "step": 7565 + }, + { + "epoch": 1.1274947870122132, + "grad_norm": 0.047607421875, + "learning_rate": 0.01691018766756032, + "loss": 0.8395, + "num_input_tokens_seen": 4391744, + "step": 7570 + }, + { + "epoch": 1.1282394995531724, + "grad_norm": 0.044921875, + "learning_rate": 0.01692135835567471, + "loss": 0.8066, + "num_input_tokens_seen": 4394688, + "step": 7575 + }, + { + "epoch": 1.1289842120941316, + "grad_norm": 0.052490234375, + "learning_rate": 0.016932529043789096, + "loss": 0.807, + "num_input_tokens_seen": 4397696, + "step": 7580 + }, + { + "epoch": 1.1297289246350908, + "grad_norm": 0.045654296875, + "learning_rate": 0.016943699731903485, + "loss": 0.8061, + "num_input_tokens_seen": 4400608, + "step": 7585 + }, + { + "epoch": 1.13047363717605, + "grad_norm": 0.0390625, + "learning_rate": 0.01695487042001787, + "loss": 0.8005, + "num_input_tokens_seen": 4403552, + "step": 7590 + }, + { + "epoch": 1.1312183497170092, + "grad_norm": 0.04052734375, + "learning_rate": 0.01696604110813226, + "loss": 0.8161, + "num_input_tokens_seen": 4406496, + "step": 7595 + }, + { + "epoch": 1.1319630622579684, + "grad_norm": 0.038818359375, + "learning_rate": 0.016977211796246647, + "loss": 0.7926, + "num_input_tokens_seen": 4409120, + "step": 7600 + }, + { + "epoch": 1.1327077747989276, + "grad_norm": 0.0208740234375, + "learning_rate": 0.016988382484361037, + "loss": 0.7986, + "num_input_tokens_seen": 4411744, + "step": 7605 + }, + { + "epoch": 1.1334524873398868, + "grad_norm": 0.047119140625, + "learning_rate": 0.016999553172475423, + "loss": 0.8287, + "num_input_tokens_seen": 4414720, + "step": 7610 + }, + { + "epoch": 1.134197199880846, + "grad_norm": 0.021240234375, + "learning_rate": 0.017010723860589813, + "loss": 0.8044, + "num_input_tokens_seen": 4417472, + "step": 7615 + }, + { + "epoch": 1.1349419124218052, + "grad_norm": 0.04296875, + "learning_rate": 0.0170218945487042, + "loss": 0.825, + "num_input_tokens_seen": 4420640, + "step": 7620 + }, + { + "epoch": 1.1356866249627644, + "grad_norm": 0.0693359375, + "learning_rate": 0.017033065236818586, + "loss": 0.7904, + "num_input_tokens_seen": 4423520, + "step": 7625 + }, + { + "epoch": 1.1364313375037236, + "grad_norm": 0.034912109375, + "learning_rate": 0.017044235924932975, + "loss": 0.8013, + "num_input_tokens_seen": 4426496, + "step": 7630 + }, + { + "epoch": 1.1371760500446828, + "grad_norm": 0.0205078125, + "learning_rate": 0.01705540661304736, + "loss": 0.8184, + "num_input_tokens_seen": 4429440, + "step": 7635 + }, + { + "epoch": 1.137920762585642, + "grad_norm": 0.031982421875, + "learning_rate": 0.01706657730116175, + "loss": 0.8104, + "num_input_tokens_seen": 4432320, + "step": 7640 + }, + { + "epoch": 1.1386654751266012, + "grad_norm": 0.037353515625, + "learning_rate": 0.017077747989276137, + "loss": 0.8065, + "num_input_tokens_seen": 4435584, + "step": 7645 + }, + { + "epoch": 1.1394101876675604, + "grad_norm": 0.033447265625, + "learning_rate": 0.017088918677390527, + "loss": 0.8009, + "num_input_tokens_seen": 4438464, + "step": 7650 + }, + { + "epoch": 1.1401549002085196, + "grad_norm": 0.0322265625, + "learning_rate": 0.017100089365504913, + "loss": 0.7967, + "num_input_tokens_seen": 4441120, + "step": 7655 + }, + { + "epoch": 1.1408996127494788, + "grad_norm": 0.0341796875, + "learning_rate": 0.017111260053619303, + "loss": 0.7947, + "num_input_tokens_seen": 4443680, + "step": 7660 + }, + { + "epoch": 1.1416443252904378, + "grad_norm": 0.044921875, + "learning_rate": 0.01712243074173369, + "loss": 0.8023, + "num_input_tokens_seen": 4446528, + "step": 7665 + }, + { + "epoch": 1.142389037831397, + "grad_norm": 0.05078125, + "learning_rate": 0.01713360142984808, + "loss": 0.8091, + "num_input_tokens_seen": 4449504, + "step": 7670 + }, + { + "epoch": 1.1431337503723562, + "grad_norm": 0.0458984375, + "learning_rate": 0.017144772117962465, + "loss": 0.8177, + "num_input_tokens_seen": 4452352, + "step": 7675 + }, + { + "epoch": 1.1438784629133154, + "grad_norm": 0.051513671875, + "learning_rate": 0.017155942806076855, + "loss": 0.8242, + "num_input_tokens_seen": 4455136, + "step": 7680 + }, + { + "epoch": 1.1446231754542746, + "grad_norm": 0.041748046875, + "learning_rate": 0.01716711349419124, + "loss": 0.8139, + "num_input_tokens_seen": 4458144, + "step": 7685 + }, + { + "epoch": 1.1453678879952338, + "grad_norm": 0.0306396484375, + "learning_rate": 0.01717828418230563, + "loss": 0.8093, + "num_input_tokens_seen": 4460800, + "step": 7690 + }, + { + "epoch": 1.146112600536193, + "grad_norm": 0.040283203125, + "learning_rate": 0.017189454870420017, + "loss": 0.7968, + "num_input_tokens_seen": 4463904, + "step": 7695 + }, + { + "epoch": 1.1468573130771522, + "grad_norm": 0.04443359375, + "learning_rate": 0.017200625558534407, + "loss": 0.8186, + "num_input_tokens_seen": 4466816, + "step": 7700 + }, + { + "epoch": 1.1476020256181114, + "grad_norm": 0.02001953125, + "learning_rate": 0.017211796246648793, + "loss": 0.8187, + "num_input_tokens_seen": 4469504, + "step": 7705 + }, + { + "epoch": 1.1483467381590706, + "grad_norm": 0.03662109375, + "learning_rate": 0.01722296693476318, + "loss": 0.7986, + "num_input_tokens_seen": 4472736, + "step": 7710 + }, + { + "epoch": 1.1490914507000298, + "grad_norm": 0.0712890625, + "learning_rate": 0.017234137622877566, + "loss": 0.8093, + "num_input_tokens_seen": 4475552, + "step": 7715 + }, + { + "epoch": 1.149836163240989, + "grad_norm": 0.0390625, + "learning_rate": 0.017245308310991955, + "loss": 0.8104, + "num_input_tokens_seen": 4478432, + "step": 7720 + }, + { + "epoch": 1.1505808757819482, + "grad_norm": 0.01806640625, + "learning_rate": 0.01725647899910634, + "loss": 0.7934, + "num_input_tokens_seen": 4481312, + "step": 7725 + }, + { + "epoch": 1.1513255883229074, + "grad_norm": 0.052978515625, + "learning_rate": 0.01726764968722073, + "loss": 0.801, + "num_input_tokens_seen": 4484608, + "step": 7730 + }, + { + "epoch": 1.1520703008638666, + "grad_norm": 0.034912109375, + "learning_rate": 0.01727882037533512, + "loss": 0.7918, + "num_input_tokens_seen": 4487808, + "step": 7735 + }, + { + "epoch": 1.1528150134048256, + "grad_norm": 0.032470703125, + "learning_rate": 0.017289991063449507, + "loss": 0.7871, + "num_input_tokens_seen": 4490688, + "step": 7740 + }, + { + "epoch": 1.1535597259457848, + "grad_norm": 0.032958984375, + "learning_rate": 0.017301161751563897, + "loss": 0.8181, + "num_input_tokens_seen": 4493920, + "step": 7745 + }, + { + "epoch": 1.154304438486744, + "grad_norm": 0.02294921875, + "learning_rate": 0.017312332439678283, + "loss": 0.8145, + "num_input_tokens_seen": 4496768, + "step": 7750 + }, + { + "epoch": 1.1550491510277032, + "grad_norm": 0.04736328125, + "learning_rate": 0.017323503127792673, + "loss": 0.7917, + "num_input_tokens_seen": 4499840, + "step": 7755 + }, + { + "epoch": 1.1557938635686624, + "grad_norm": 0.0252685546875, + "learning_rate": 0.01733467381590706, + "loss": 0.8015, + "num_input_tokens_seen": 4502912, + "step": 7760 + }, + { + "epoch": 1.1565385761096216, + "grad_norm": 0.04150390625, + "learning_rate": 0.01734584450402145, + "loss": 0.7961, + "num_input_tokens_seen": 4505856, + "step": 7765 + }, + { + "epoch": 1.1572832886505808, + "grad_norm": 0.0191650390625, + "learning_rate": 0.017357015192135835, + "loss": 0.8117, + "num_input_tokens_seen": 4508928, + "step": 7770 + }, + { + "epoch": 1.15802800119154, + "grad_norm": 0.04736328125, + "learning_rate": 0.017368185880250225, + "loss": 0.786, + "num_input_tokens_seen": 4512064, + "step": 7775 + }, + { + "epoch": 1.1587727137324992, + "grad_norm": 0.034912109375, + "learning_rate": 0.01737935656836461, + "loss": 0.7695, + "num_input_tokens_seen": 4514880, + "step": 7780 + }, + { + "epoch": 1.1595174262734584, + "grad_norm": 0.0341796875, + "learning_rate": 0.017390527256479, + "loss": 0.8151, + "num_input_tokens_seen": 4517632, + "step": 7785 + }, + { + "epoch": 1.1602621388144176, + "grad_norm": 0.0240478515625, + "learning_rate": 0.017401697944593387, + "loss": 0.8238, + "num_input_tokens_seen": 4520416, + "step": 7790 + }, + { + "epoch": 1.1610068513553768, + "grad_norm": 0.03125, + "learning_rate": 0.017412868632707777, + "loss": 0.8142, + "num_input_tokens_seen": 4523296, + "step": 7795 + }, + { + "epoch": 1.161751563896336, + "grad_norm": 0.0341796875, + "learning_rate": 0.01742403932082216, + "loss": 0.772, + "num_input_tokens_seen": 4526880, + "step": 7800 + }, + { + "epoch": 1.1624962764372953, + "grad_norm": 0.017333984375, + "learning_rate": 0.01743521000893655, + "loss": 0.7983, + "num_input_tokens_seen": 4529504, + "step": 7805 + }, + { + "epoch": 1.1632409889782545, + "grad_norm": 0.03662109375, + "learning_rate": 0.017446380697050935, + "loss": 0.7803, + "num_input_tokens_seen": 4532320, + "step": 7810 + }, + { + "epoch": 1.1639857015192137, + "grad_norm": 0.033935546875, + "learning_rate": 0.017457551385165325, + "loss": 0.798, + "num_input_tokens_seen": 4535104, + "step": 7815 + }, + { + "epoch": 1.1647304140601729, + "grad_norm": 0.032958984375, + "learning_rate": 0.01746872207327971, + "loss": 0.813, + "num_input_tokens_seen": 4537600, + "step": 7820 + }, + { + "epoch": 1.165475126601132, + "grad_norm": 0.03466796875, + "learning_rate": 0.0174798927613941, + "loss": 0.8204, + "num_input_tokens_seen": 4540512, + "step": 7825 + }, + { + "epoch": 1.1662198391420913, + "grad_norm": 0.01806640625, + "learning_rate": 0.017491063449508487, + "loss": 0.7994, + "num_input_tokens_seen": 4543328, + "step": 7830 + }, + { + "epoch": 1.1669645516830505, + "grad_norm": 0.0169677734375, + "learning_rate": 0.017502234137622877, + "loss": 0.797, + "num_input_tokens_seen": 4546176, + "step": 7835 + }, + { + "epoch": 1.1677092642240094, + "grad_norm": 0.059814453125, + "learning_rate": 0.017513404825737263, + "loss": 0.8166, + "num_input_tokens_seen": 4549344, + "step": 7840 + }, + { + "epoch": 1.1684539767649686, + "grad_norm": 0.0164794921875, + "learning_rate": 0.017524575513851653, + "loss": 0.8188, + "num_input_tokens_seen": 4552160, + "step": 7845 + }, + { + "epoch": 1.1691986893059279, + "grad_norm": 0.031982421875, + "learning_rate": 0.017535746201966042, + "loss": 0.8256, + "num_input_tokens_seen": 4554880, + "step": 7850 + }, + { + "epoch": 1.169943401846887, + "grad_norm": 0.044677734375, + "learning_rate": 0.01754691689008043, + "loss": 0.7967, + "num_input_tokens_seen": 4557696, + "step": 7855 + }, + { + "epoch": 1.1706881143878463, + "grad_norm": 0.048583984375, + "learning_rate": 0.01755808757819482, + "loss": 0.7922, + "num_input_tokens_seen": 4560224, + "step": 7860 + }, + { + "epoch": 1.1714328269288055, + "grad_norm": 0.031982421875, + "learning_rate": 0.017569258266309205, + "loss": 0.7961, + "num_input_tokens_seen": 4563168, + "step": 7865 + }, + { + "epoch": 1.1721775394697647, + "grad_norm": 0.0174560546875, + "learning_rate": 0.017580428954423594, + "loss": 0.7987, + "num_input_tokens_seen": 4565856, + "step": 7870 + }, + { + "epoch": 1.1729222520107239, + "grad_norm": 0.046875, + "learning_rate": 0.01759159964253798, + "loss": 0.8026, + "num_input_tokens_seen": 4569216, + "step": 7875 + }, + { + "epoch": 1.173666964551683, + "grad_norm": 0.033203125, + "learning_rate": 0.01760277033065237, + "loss": 0.8038, + "num_input_tokens_seen": 4572064, + "step": 7880 + }, + { + "epoch": 1.1744116770926423, + "grad_norm": 0.041015625, + "learning_rate": 0.017613941018766753, + "loss": 0.8147, + "num_input_tokens_seen": 4575136, + "step": 7885 + }, + { + "epoch": 1.1751563896336015, + "grad_norm": 0.042724609375, + "learning_rate": 0.017625111706881143, + "loss": 0.8055, + "num_input_tokens_seen": 4577856, + "step": 7890 + }, + { + "epoch": 1.1759011021745607, + "grad_norm": 0.032958984375, + "learning_rate": 0.01763628239499553, + "loss": 0.8287, + "num_input_tokens_seen": 4580608, + "step": 7895 + }, + { + "epoch": 1.1766458147155199, + "grad_norm": 0.0289306640625, + "learning_rate": 0.01764745308310992, + "loss": 0.8042, + "num_input_tokens_seen": 4583200, + "step": 7900 + }, + { + "epoch": 1.177390527256479, + "grad_norm": 0.03076171875, + "learning_rate": 0.017658623771224305, + "loss": 0.7864, + "num_input_tokens_seen": 4585952, + "step": 7905 + }, + { + "epoch": 1.1781352397974383, + "grad_norm": 0.0194091796875, + "learning_rate": 0.017669794459338695, + "loss": 0.8252, + "num_input_tokens_seen": 4589056, + "step": 7910 + }, + { + "epoch": 1.1788799523383973, + "grad_norm": 0.0556640625, + "learning_rate": 0.01768096514745308, + "loss": 0.807, + "num_input_tokens_seen": 4592096, + "step": 7915 + }, + { + "epoch": 1.1796246648793565, + "grad_norm": 0.01708984375, + "learning_rate": 0.01769213583556747, + "loss": 0.8059, + "num_input_tokens_seen": 4594784, + "step": 7920 + }, + { + "epoch": 1.1803693774203157, + "grad_norm": 0.0203857421875, + "learning_rate": 0.017703306523681857, + "loss": 0.8071, + "num_input_tokens_seen": 4597856, + "step": 7925 + }, + { + "epoch": 1.1811140899612749, + "grad_norm": 0.031982421875, + "learning_rate": 0.017714477211796247, + "loss": 0.7971, + "num_input_tokens_seen": 4600512, + "step": 7930 + }, + { + "epoch": 1.181858802502234, + "grad_norm": 0.032958984375, + "learning_rate": 0.017725647899910633, + "loss": 0.8098, + "num_input_tokens_seen": 4603328, + "step": 7935 + }, + { + "epoch": 1.1826035150431933, + "grad_norm": 0.03369140625, + "learning_rate": 0.017736818588025022, + "loss": 0.7903, + "num_input_tokens_seen": 4606144, + "step": 7940 + }, + { + "epoch": 1.1833482275841525, + "grad_norm": 0.039306640625, + "learning_rate": 0.01774798927613941, + "loss": 0.7985, + "num_input_tokens_seen": 4609088, + "step": 7945 + }, + { + "epoch": 1.1840929401251117, + "grad_norm": 0.048828125, + "learning_rate": 0.0177591599642538, + "loss": 0.8039, + "num_input_tokens_seen": 4612160, + "step": 7950 + }, + { + "epoch": 1.1848376526660709, + "grad_norm": 0.03271484375, + "learning_rate": 0.017770330652368185, + "loss": 0.8053, + "num_input_tokens_seen": 4615072, + "step": 7955 + }, + { + "epoch": 1.18558236520703, + "grad_norm": 0.04248046875, + "learning_rate": 0.017781501340482574, + "loss": 0.8059, + "num_input_tokens_seen": 4617600, + "step": 7960 + }, + { + "epoch": 1.1863270777479893, + "grad_norm": 0.03271484375, + "learning_rate": 0.017792672028596964, + "loss": 0.7983, + "num_input_tokens_seen": 4620320, + "step": 7965 + }, + { + "epoch": 1.1870717902889485, + "grad_norm": 0.0693359375, + "learning_rate": 0.017803842716711347, + "loss": 0.8106, + "num_input_tokens_seen": 4623520, + "step": 7970 + }, + { + "epoch": 1.1878165028299077, + "grad_norm": 0.0234375, + "learning_rate": 0.017815013404825737, + "loss": 0.8165, + "num_input_tokens_seen": 4626656, + "step": 7975 + }, + { + "epoch": 1.188561215370867, + "grad_norm": 0.0498046875, + "learning_rate": 0.017826184092940123, + "loss": 0.806, + "num_input_tokens_seen": 4629408, + "step": 7980 + }, + { + "epoch": 1.189305927911826, + "grad_norm": 0.0279541015625, + "learning_rate": 0.017837354781054512, + "loss": 0.7847, + "num_input_tokens_seen": 4632320, + "step": 7985 + }, + { + "epoch": 1.1900506404527853, + "grad_norm": 0.033935546875, + "learning_rate": 0.0178485254691689, + "loss": 0.8236, + "num_input_tokens_seen": 4635040, + "step": 7990 + }, + { + "epoch": 1.1907953529937445, + "grad_norm": 0.040771484375, + "learning_rate": 0.01785969615728329, + "loss": 0.834, + "num_input_tokens_seen": 4638048, + "step": 7995 + }, + { + "epoch": 1.1915400655347037, + "grad_norm": 0.0185546875, + "learning_rate": 0.017870866845397675, + "loss": 0.8111, + "num_input_tokens_seen": 4640864, + "step": 8000 + }, + { + "epoch": 1.192284778075663, + "grad_norm": 0.0478515625, + "learning_rate": 0.017882037533512064, + "loss": 0.7837, + "num_input_tokens_seen": 4643488, + "step": 8005 + }, + { + "epoch": 1.193029490616622, + "grad_norm": 0.0283203125, + "learning_rate": 0.01789320822162645, + "loss": 0.8101, + "num_input_tokens_seen": 4646432, + "step": 8010 + }, + { + "epoch": 1.193774203157581, + "grad_norm": 0.033447265625, + "learning_rate": 0.01790437890974084, + "loss": 0.8026, + "num_input_tokens_seen": 4649280, + "step": 8015 + }, + { + "epoch": 1.1945189156985403, + "grad_norm": 0.0184326171875, + "learning_rate": 0.017915549597855226, + "loss": 0.8145, + "num_input_tokens_seen": 4652192, + "step": 8020 + }, + { + "epoch": 1.1952636282394995, + "grad_norm": 0.03125, + "learning_rate": 0.017926720285969616, + "loss": 0.7982, + "num_input_tokens_seen": 4654848, + "step": 8025 + }, + { + "epoch": 1.1960083407804587, + "grad_norm": 0.04833984375, + "learning_rate": 0.017937890974084002, + "loss": 0.8189, + "num_input_tokens_seen": 4657600, + "step": 8030 + }, + { + "epoch": 1.196753053321418, + "grad_norm": 0.036376953125, + "learning_rate": 0.017949061662198392, + "loss": 0.7872, + "num_input_tokens_seen": 4660352, + "step": 8035 + }, + { + "epoch": 1.197497765862377, + "grad_norm": 0.0302734375, + "learning_rate": 0.01796023235031278, + "loss": 0.7923, + "num_input_tokens_seen": 4663296, + "step": 8040 + }, + { + "epoch": 1.1982424784033363, + "grad_norm": 0.0556640625, + "learning_rate": 0.017971403038427168, + "loss": 0.7954, + "num_input_tokens_seen": 4666240, + "step": 8045 + }, + { + "epoch": 1.1989871909442955, + "grad_norm": 0.0400390625, + "learning_rate": 0.017982573726541554, + "loss": 0.825, + "num_input_tokens_seen": 4668992, + "step": 8050 + }, + { + "epoch": 1.1997319034852547, + "grad_norm": 0.039794921875, + "learning_rate": 0.017993744414655944, + "loss": 0.811, + "num_input_tokens_seen": 4671584, + "step": 8055 + }, + { + "epoch": 1.200476616026214, + "grad_norm": 0.0308837890625, + "learning_rate": 0.018004915102770327, + "loss": 0.796, + "num_input_tokens_seen": 4674144, + "step": 8060 + }, + { + "epoch": 1.2012213285671731, + "grad_norm": 0.0208740234375, + "learning_rate": 0.018016085790884716, + "loss": 0.8021, + "num_input_tokens_seen": 4676864, + "step": 8065 + }, + { + "epoch": 1.2019660411081323, + "grad_norm": 0.050537109375, + "learning_rate": 0.018027256478999106, + "loss": 0.8315, + "num_input_tokens_seen": 4679744, + "step": 8070 + }, + { + "epoch": 1.2027107536490915, + "grad_norm": 0.045166015625, + "learning_rate": 0.018038427167113492, + "loss": 0.8024, + "num_input_tokens_seen": 4682592, + "step": 8075 + }, + { + "epoch": 1.2034554661900507, + "grad_norm": 0.035400390625, + "learning_rate": 0.018049597855227882, + "loss": 0.818, + "num_input_tokens_seen": 4685344, + "step": 8080 + }, + { + "epoch": 1.2042001787310097, + "grad_norm": 0.033447265625, + "learning_rate": 0.01806076854334227, + "loss": 0.8042, + "num_input_tokens_seen": 4688000, + "step": 8085 + }, + { + "epoch": 1.204944891271969, + "grad_norm": 0.052734375, + "learning_rate": 0.018071939231456658, + "loss": 0.7952, + "num_input_tokens_seen": 4691456, + "step": 8090 + }, + { + "epoch": 1.2056896038129281, + "grad_norm": 0.038818359375, + "learning_rate": 0.018083109919571044, + "loss": 0.8064, + "num_input_tokens_seen": 4694432, + "step": 8095 + }, + { + "epoch": 1.2064343163538873, + "grad_norm": 0.047119140625, + "learning_rate": 0.018094280607685434, + "loss": 0.8013, + "num_input_tokens_seen": 4697376, + "step": 8100 + }, + { + "epoch": 1.2071790288948465, + "grad_norm": 0.032958984375, + "learning_rate": 0.01810545129579982, + "loss": 0.8019, + "num_input_tokens_seen": 4700320, + "step": 8105 + }, + { + "epoch": 1.2079237414358057, + "grad_norm": 0.0284423828125, + "learning_rate": 0.01811662198391421, + "loss": 0.8116, + "num_input_tokens_seen": 4703232, + "step": 8110 + }, + { + "epoch": 1.208668453976765, + "grad_norm": 0.047607421875, + "learning_rate": 0.018127792672028596, + "loss": 0.8193, + "num_input_tokens_seen": 4706208, + "step": 8115 + }, + { + "epoch": 1.2094131665177241, + "grad_norm": 0.0322265625, + "learning_rate": 0.018138963360142986, + "loss": 0.7847, + "num_input_tokens_seen": 4709056, + "step": 8120 + }, + { + "epoch": 1.2101578790586833, + "grad_norm": 0.04833984375, + "learning_rate": 0.018150134048257372, + "loss": 0.8142, + "num_input_tokens_seen": 4711904, + "step": 8125 + }, + { + "epoch": 1.2109025915996425, + "grad_norm": 0.0303955078125, + "learning_rate": 0.018161304736371762, + "loss": 0.8016, + "num_input_tokens_seen": 4714560, + "step": 8130 + }, + { + "epoch": 1.2116473041406017, + "grad_norm": 0.0294189453125, + "learning_rate": 0.018172475424486148, + "loss": 0.7886, + "num_input_tokens_seen": 4717472, + "step": 8135 + }, + { + "epoch": 1.212392016681561, + "grad_norm": 0.01904296875, + "learning_rate": 0.018183646112600538, + "loss": 0.8101, + "num_input_tokens_seen": 4720288, + "step": 8140 + }, + { + "epoch": 1.2131367292225201, + "grad_norm": 0.034423828125, + "learning_rate": 0.01819481680071492, + "loss": 0.7955, + "num_input_tokens_seen": 4723360, + "step": 8145 + }, + { + "epoch": 1.2138814417634793, + "grad_norm": 0.03515625, + "learning_rate": 0.01820598748882931, + "loss": 0.8059, + "num_input_tokens_seen": 4726208, + "step": 8150 + }, + { + "epoch": 1.2146261543044385, + "grad_norm": 0.048583984375, + "learning_rate": 0.018217158176943696, + "loss": 0.8093, + "num_input_tokens_seen": 4729024, + "step": 8155 + }, + { + "epoch": 1.2153708668453977, + "grad_norm": 0.0517578125, + "learning_rate": 0.018228328865058086, + "loss": 0.7906, + "num_input_tokens_seen": 4731904, + "step": 8160 + }, + { + "epoch": 1.216115579386357, + "grad_norm": 0.038330078125, + "learning_rate": 0.018239499553172472, + "loss": 0.7944, + "num_input_tokens_seen": 4734432, + "step": 8165 + }, + { + "epoch": 1.2168602919273162, + "grad_norm": 0.025146484375, + "learning_rate": 0.018250670241286862, + "loss": 0.7999, + "num_input_tokens_seen": 4737216, + "step": 8170 + }, + { + "epoch": 1.2176050044682754, + "grad_norm": 0.0498046875, + "learning_rate": 0.01826184092940125, + "loss": 0.8191, + "num_input_tokens_seen": 4739936, + "step": 8175 + }, + { + "epoch": 1.2183497170092346, + "grad_norm": 0.0322265625, + "learning_rate": 0.018273011617515638, + "loss": 0.8053, + "num_input_tokens_seen": 4743072, + "step": 8180 + }, + { + "epoch": 1.2190944295501935, + "grad_norm": 0.04833984375, + "learning_rate": 0.018284182305630028, + "loss": 0.8188, + "num_input_tokens_seen": 4745984, + "step": 8185 + }, + { + "epoch": 1.2198391420911527, + "grad_norm": 0.037841796875, + "learning_rate": 0.018295352993744414, + "loss": 0.7919, + "num_input_tokens_seen": 4748896, + "step": 8190 + }, + { + "epoch": 1.220583854632112, + "grad_norm": 0.019775390625, + "learning_rate": 0.018306523681858804, + "loss": 0.8043, + "num_input_tokens_seen": 4751808, + "step": 8195 + }, + { + "epoch": 1.2213285671730711, + "grad_norm": 0.032470703125, + "learning_rate": 0.01831769436997319, + "loss": 0.8138, + "num_input_tokens_seen": 4754496, + "step": 8200 + }, + { + "epoch": 1.2220732797140303, + "grad_norm": 0.0240478515625, + "learning_rate": 0.01832886505808758, + "loss": 0.7999, + "num_input_tokens_seen": 4757344, + "step": 8205 + }, + { + "epoch": 1.2228179922549895, + "grad_norm": 0.07421875, + "learning_rate": 0.018340035746201966, + "loss": 0.7868, + "num_input_tokens_seen": 4760480, + "step": 8210 + }, + { + "epoch": 1.2235627047959488, + "grad_norm": 0.0220947265625, + "learning_rate": 0.018351206434316356, + "loss": 0.8184, + "num_input_tokens_seen": 4763200, + "step": 8215 + }, + { + "epoch": 1.224307417336908, + "grad_norm": 0.03857421875, + "learning_rate": 0.018362377122430742, + "loss": 0.8288, + "num_input_tokens_seen": 4766144, + "step": 8220 + }, + { + "epoch": 1.2250521298778672, + "grad_norm": 0.04150390625, + "learning_rate": 0.01837354781054513, + "loss": 0.8345, + "num_input_tokens_seen": 4769440, + "step": 8225 + }, + { + "epoch": 1.2257968424188264, + "grad_norm": 0.0186767578125, + "learning_rate": 0.018384718498659514, + "loss": 0.7958, + "num_input_tokens_seen": 4772320, + "step": 8230 + }, + { + "epoch": 1.2265415549597856, + "grad_norm": 0.0189208984375, + "learning_rate": 0.018395889186773904, + "loss": 0.8015, + "num_input_tokens_seen": 4775232, + "step": 8235 + }, + { + "epoch": 1.2272862675007448, + "grad_norm": 0.0299072265625, + "learning_rate": 0.01840705987488829, + "loss": 0.7855, + "num_input_tokens_seen": 4778368, + "step": 8240 + }, + { + "epoch": 1.228030980041704, + "grad_norm": 0.030517578125, + "learning_rate": 0.01841823056300268, + "loss": 0.8219, + "num_input_tokens_seen": 4781312, + "step": 8245 + }, + { + "epoch": 1.2287756925826632, + "grad_norm": 0.032470703125, + "learning_rate": 0.018429401251117066, + "loss": 0.7849, + "num_input_tokens_seen": 4784128, + "step": 8250 + }, + { + "epoch": 1.2295204051236224, + "grad_norm": 0.0361328125, + "learning_rate": 0.018440571939231456, + "loss": 0.7923, + "num_input_tokens_seen": 4786976, + "step": 8255 + }, + { + "epoch": 1.2302651176645814, + "grad_norm": 0.0322265625, + "learning_rate": 0.018451742627345842, + "loss": 0.7982, + "num_input_tokens_seen": 4789856, + "step": 8260 + }, + { + "epoch": 1.2310098302055406, + "grad_norm": 0.029541015625, + "learning_rate": 0.018462913315460232, + "loss": 0.8214, + "num_input_tokens_seen": 4792704, + "step": 8265 + }, + { + "epoch": 1.2317545427464998, + "grad_norm": 0.0302734375, + "learning_rate": 0.018474084003574618, + "loss": 0.7971, + "num_input_tokens_seen": 4795712, + "step": 8270 + }, + { + "epoch": 1.232499255287459, + "grad_norm": 0.0235595703125, + "learning_rate": 0.018485254691689008, + "loss": 0.8228, + "num_input_tokens_seen": 4798560, + "step": 8275 + }, + { + "epoch": 1.2332439678284182, + "grad_norm": 0.033935546875, + "learning_rate": 0.018496425379803394, + "loss": 0.7764, + "num_input_tokens_seen": 4801312, + "step": 8280 + }, + { + "epoch": 1.2339886803693774, + "grad_norm": 0.03173828125, + "learning_rate": 0.018507596067917784, + "loss": 0.8115, + "num_input_tokens_seen": 4804192, + "step": 8285 + }, + { + "epoch": 1.2347333929103366, + "grad_norm": 0.052001953125, + "learning_rate": 0.01851876675603217, + "loss": 0.7947, + "num_input_tokens_seen": 4807072, + "step": 8290 + }, + { + "epoch": 1.2354781054512958, + "grad_norm": 0.0194091796875, + "learning_rate": 0.01852993744414656, + "loss": 0.8086, + "num_input_tokens_seen": 4810048, + "step": 8295 + }, + { + "epoch": 1.236222817992255, + "grad_norm": 0.045166015625, + "learning_rate": 0.01854110813226095, + "loss": 0.8308, + "num_input_tokens_seen": 4813120, + "step": 8300 + }, + { + "epoch": 1.2369675305332142, + "grad_norm": 0.0299072265625, + "learning_rate": 0.018552278820375336, + "loss": 0.7936, + "num_input_tokens_seen": 4816128, + "step": 8305 + }, + { + "epoch": 1.2377122430741734, + "grad_norm": 0.030029296875, + "learning_rate": 0.018563449508489725, + "loss": 0.8048, + "num_input_tokens_seen": 4818976, + "step": 8310 + }, + { + "epoch": 1.2384569556151326, + "grad_norm": 0.038330078125, + "learning_rate": 0.01857462019660411, + "loss": 0.8372, + "num_input_tokens_seen": 4821760, + "step": 8315 + }, + { + "epoch": 1.2392016681560918, + "grad_norm": 0.018310546875, + "learning_rate": 0.018585790884718498, + "loss": 0.8023, + "num_input_tokens_seen": 4824544, + "step": 8320 + }, + { + "epoch": 1.239946380697051, + "grad_norm": 0.0322265625, + "learning_rate": 0.018596961572832884, + "loss": 0.8073, + "num_input_tokens_seen": 4827648, + "step": 8325 + }, + { + "epoch": 1.2406910932380102, + "grad_norm": 0.029541015625, + "learning_rate": 0.018608132260947274, + "loss": 0.7913, + "num_input_tokens_seen": 4830336, + "step": 8330 + }, + { + "epoch": 1.2414358057789694, + "grad_norm": 0.0322265625, + "learning_rate": 0.01861930294906166, + "loss": 0.7997, + "num_input_tokens_seen": 4833056, + "step": 8335 + }, + { + "epoch": 1.2421805183199286, + "grad_norm": 0.017822265625, + "learning_rate": 0.01863047363717605, + "loss": 0.792, + "num_input_tokens_seen": 4836096, + "step": 8340 + }, + { + "epoch": 1.2429252308608878, + "grad_norm": 0.029052734375, + "learning_rate": 0.018641644325290436, + "loss": 0.8226, + "num_input_tokens_seen": 4838912, + "step": 8345 + }, + { + "epoch": 1.243669943401847, + "grad_norm": 0.0233154296875, + "learning_rate": 0.018652815013404826, + "loss": 0.8057, + "num_input_tokens_seen": 4841728, + "step": 8350 + }, + { + "epoch": 1.244414655942806, + "grad_norm": 0.0303955078125, + "learning_rate": 0.018663985701519212, + "loss": 0.8072, + "num_input_tokens_seen": 4844928, + "step": 8355 + }, + { + "epoch": 1.2451593684837652, + "grad_norm": 0.0341796875, + "learning_rate": 0.0186751563896336, + "loss": 0.8047, + "num_input_tokens_seen": 4848096, + "step": 8360 + }, + { + "epoch": 1.2459040810247244, + "grad_norm": 0.036376953125, + "learning_rate": 0.018686327077747988, + "loss": 0.7866, + "num_input_tokens_seen": 4851296, + "step": 8365 + }, + { + "epoch": 1.2466487935656836, + "grad_norm": 0.0419921875, + "learning_rate": 0.018697497765862377, + "loss": 0.8094, + "num_input_tokens_seen": 4854208, + "step": 8370 + }, + { + "epoch": 1.2473935061066428, + "grad_norm": 0.058837890625, + "learning_rate": 0.018708668453976764, + "loss": 0.8027, + "num_input_tokens_seen": 4856896, + "step": 8375 + }, + { + "epoch": 1.248138218647602, + "grad_norm": 0.046142578125, + "learning_rate": 0.018719839142091153, + "loss": 0.7879, + "num_input_tokens_seen": 4859744, + "step": 8380 + }, + { + "epoch": 1.2488829311885612, + "grad_norm": 0.021240234375, + "learning_rate": 0.01873100983020554, + "loss": 0.8164, + "num_input_tokens_seen": 4862560, + "step": 8385 + }, + { + "epoch": 1.2496276437295204, + "grad_norm": 0.02978515625, + "learning_rate": 0.01874218051831993, + "loss": 0.7958, + "num_input_tokens_seen": 4865568, + "step": 8390 + }, + { + "epoch": 1.2503723562704796, + "grad_norm": 0.03076171875, + "learning_rate": 0.018753351206434316, + "loss": 0.8184, + "num_input_tokens_seen": 4868384, + "step": 8395 + }, + { + "epoch": 1.2511170688114388, + "grad_norm": 0.0181884765625, + "learning_rate": 0.018764521894548705, + "loss": 0.8075, + "num_input_tokens_seen": 4870912, + "step": 8400 + }, + { + "epoch": 1.251861781352398, + "grad_norm": 0.041259765625, + "learning_rate": 0.01877569258266309, + "loss": 0.8224, + "num_input_tokens_seen": 4873792, + "step": 8405 + }, + { + "epoch": 1.2526064938933572, + "grad_norm": 0.0303955078125, + "learning_rate": 0.018786863270777478, + "loss": 0.8077, + "num_input_tokens_seen": 4876864, + "step": 8410 + }, + { + "epoch": 1.2533512064343164, + "grad_norm": 0.03173828125, + "learning_rate": 0.018798033958891867, + "loss": 0.7884, + "num_input_tokens_seen": 4879872, + "step": 8415 + }, + { + "epoch": 1.2540959189752756, + "grad_norm": 0.036865234375, + "learning_rate": 0.018809204647006254, + "loss": 0.8193, + "num_input_tokens_seen": 4882944, + "step": 8420 + }, + { + "epoch": 1.2548406315162346, + "grad_norm": 0.0225830078125, + "learning_rate": 0.018820375335120643, + "loss": 0.8278, + "num_input_tokens_seen": 4885952, + "step": 8425 + }, + { + "epoch": 1.2555853440571938, + "grad_norm": 0.03173828125, + "learning_rate": 0.01883154602323503, + "loss": 0.8239, + "num_input_tokens_seen": 4888768, + "step": 8430 + }, + { + "epoch": 1.256330056598153, + "grad_norm": 0.02978515625, + "learning_rate": 0.01884271671134942, + "loss": 0.8052, + "num_input_tokens_seen": 4891584, + "step": 8435 + }, + { + "epoch": 1.2570747691391122, + "grad_norm": 0.03857421875, + "learning_rate": 0.018853887399463806, + "loss": 0.8131, + "num_input_tokens_seen": 4894432, + "step": 8440 + }, + { + "epoch": 1.2578194816800714, + "grad_norm": 0.037841796875, + "learning_rate": 0.018865058087578195, + "loss": 0.8125, + "num_input_tokens_seen": 4897152, + "step": 8445 + }, + { + "epoch": 1.2585641942210306, + "grad_norm": 0.033935546875, + "learning_rate": 0.01887622877569258, + "loss": 0.8003, + "num_input_tokens_seen": 4900224, + "step": 8450 + }, + { + "epoch": 1.2593089067619898, + "grad_norm": 0.031982421875, + "learning_rate": 0.01888739946380697, + "loss": 0.7999, + "num_input_tokens_seen": 4903072, + "step": 8455 + }, + { + "epoch": 1.260053619302949, + "grad_norm": 0.018310546875, + "learning_rate": 0.018898570151921357, + "loss": 0.7907, + "num_input_tokens_seen": 4905792, + "step": 8460 + }, + { + "epoch": 1.2607983318439082, + "grad_norm": 0.032958984375, + "learning_rate": 0.018909740840035747, + "loss": 0.7997, + "num_input_tokens_seen": 4908608, + "step": 8465 + }, + { + "epoch": 1.2615430443848674, + "grad_norm": 0.0191650390625, + "learning_rate": 0.018920911528150133, + "loss": 0.8166, + "num_input_tokens_seen": 4911648, + "step": 8470 + }, + { + "epoch": 1.2622877569258266, + "grad_norm": 0.048828125, + "learning_rate": 0.018932082216264523, + "loss": 0.8038, + "num_input_tokens_seen": 4914816, + "step": 8475 + }, + { + "epoch": 1.2630324694667858, + "grad_norm": 0.031982421875, + "learning_rate": 0.01894325290437891, + "loss": 0.7954, + "num_input_tokens_seen": 4917728, + "step": 8480 + }, + { + "epoch": 1.263777182007745, + "grad_norm": 0.031494140625, + "learning_rate": 0.0189544235924933, + "loss": 0.8171, + "num_input_tokens_seen": 4920704, + "step": 8485 + }, + { + "epoch": 1.2645218945487042, + "grad_norm": 0.0299072265625, + "learning_rate": 0.018965594280607682, + "loss": 0.7981, + "num_input_tokens_seen": 4923392, + "step": 8490 + }, + { + "epoch": 1.2652666070896634, + "grad_norm": 0.04150390625, + "learning_rate": 0.01897676496872207, + "loss": 0.7808, + "num_input_tokens_seen": 4926432, + "step": 8495 + }, + { + "epoch": 1.2660113196306226, + "grad_norm": 0.02685546875, + "learning_rate": 0.018987935656836458, + "loss": 0.79, + "num_input_tokens_seen": 4929600, + "step": 8500 + }, + { + "epoch": 1.2667560321715818, + "grad_norm": 0.02587890625, + "learning_rate": 0.018999106344950847, + "loss": 0.8064, + "num_input_tokens_seen": 4932288, + "step": 8505 + }, + { + "epoch": 1.267500744712541, + "grad_norm": 0.02978515625, + "learning_rate": 0.019010277033065234, + "loss": 0.8144, + "num_input_tokens_seen": 4935168, + "step": 8510 + }, + { + "epoch": 1.2682454572535002, + "grad_norm": 0.028076171875, + "learning_rate": 0.019021447721179623, + "loss": 0.7943, + "num_input_tokens_seen": 4937920, + "step": 8515 + }, + { + "epoch": 1.2689901697944594, + "grad_norm": 0.035400390625, + "learning_rate": 0.019032618409294013, + "loss": 0.8092, + "num_input_tokens_seen": 4940992, + "step": 8520 + }, + { + "epoch": 1.2697348823354186, + "grad_norm": 0.0177001953125, + "learning_rate": 0.0190437890974084, + "loss": 0.831, + "num_input_tokens_seen": 4944128, + "step": 8525 + }, + { + "epoch": 1.2704795948763778, + "grad_norm": 0.0257568359375, + "learning_rate": 0.01905495978552279, + "loss": 0.7806, + "num_input_tokens_seen": 4947072, + "step": 8530 + }, + { + "epoch": 1.2712243074173368, + "grad_norm": 0.031494140625, + "learning_rate": 0.019066130473637175, + "loss": 0.8111, + "num_input_tokens_seen": 4949760, + "step": 8535 + }, + { + "epoch": 1.271969019958296, + "grad_norm": 0.027099609375, + "learning_rate": 0.019077301161751565, + "loss": 0.8269, + "num_input_tokens_seen": 4952544, + "step": 8540 + }, + { + "epoch": 1.2727137324992552, + "grad_norm": 0.042724609375, + "learning_rate": 0.01908847184986595, + "loss": 0.8218, + "num_input_tokens_seen": 4955520, + "step": 8545 + }, + { + "epoch": 1.2734584450402144, + "grad_norm": 0.0400390625, + "learning_rate": 0.01909964253798034, + "loss": 0.824, + "num_input_tokens_seen": 4958272, + "step": 8550 + }, + { + "epoch": 1.2742031575811736, + "grad_norm": 0.048583984375, + "learning_rate": 0.019110813226094727, + "loss": 0.7906, + "num_input_tokens_seen": 4961280, + "step": 8555 + }, + { + "epoch": 1.2749478701221328, + "grad_norm": 0.033447265625, + "learning_rate": 0.019121983914209117, + "loss": 0.8156, + "num_input_tokens_seen": 4964032, + "step": 8560 + }, + { + "epoch": 1.275692582663092, + "grad_norm": 0.046630859375, + "learning_rate": 0.019133154602323503, + "loss": 0.7996, + "num_input_tokens_seen": 4967008, + "step": 8565 + }, + { + "epoch": 1.2764372952040512, + "grad_norm": 0.0400390625, + "learning_rate": 0.019144325290437893, + "loss": 0.7925, + "num_input_tokens_seen": 4969696, + "step": 8570 + }, + { + "epoch": 1.2771820077450105, + "grad_norm": 0.0458984375, + "learning_rate": 0.01915549597855228, + "loss": 0.8159, + "num_input_tokens_seen": 4973280, + "step": 8575 + }, + { + "epoch": 1.2779267202859697, + "grad_norm": 0.03857421875, + "learning_rate": 0.019166666666666665, + "loss": 0.8047, + "num_input_tokens_seen": 4976032, + "step": 8580 + }, + { + "epoch": 1.2786714328269289, + "grad_norm": 0.031494140625, + "learning_rate": 0.01917783735478105, + "loss": 0.8126, + "num_input_tokens_seen": 4979168, + "step": 8585 + }, + { + "epoch": 1.279416145367888, + "grad_norm": 0.033447265625, + "learning_rate": 0.01918900804289544, + "loss": 0.7915, + "num_input_tokens_seen": 4982112, + "step": 8590 + }, + { + "epoch": 1.2801608579088473, + "grad_norm": 0.0189208984375, + "learning_rate": 0.019200178731009827, + "loss": 0.8151, + "num_input_tokens_seen": 4984800, + "step": 8595 + }, + { + "epoch": 1.2809055704498062, + "grad_norm": 0.03955078125, + "learning_rate": 0.019211349419124217, + "loss": 0.7866, + "num_input_tokens_seen": 4987584, + "step": 8600 + }, + { + "epoch": 1.2816502829907654, + "grad_norm": 0.028564453125, + "learning_rate": 0.019222520107238603, + "loss": 0.8034, + "num_input_tokens_seen": 4990592, + "step": 8605 + }, + { + "epoch": 1.2823949955317246, + "grad_norm": 0.030029296875, + "learning_rate": 0.019233690795352993, + "loss": 0.8082, + "num_input_tokens_seen": 4993696, + "step": 8610 + }, + { + "epoch": 1.2831397080726838, + "grad_norm": 0.03271484375, + "learning_rate": 0.01924486148346738, + "loss": 0.8092, + "num_input_tokens_seen": 4996864, + "step": 8615 + }, + { + "epoch": 1.283884420613643, + "grad_norm": 0.032470703125, + "learning_rate": 0.01925603217158177, + "loss": 0.7959, + "num_input_tokens_seen": 4999744, + "step": 8620 + }, + { + "epoch": 1.2846291331546023, + "grad_norm": 0.037841796875, + "learning_rate": 0.019267202859696155, + "loss": 0.8046, + "num_input_tokens_seen": 5002656, + "step": 8625 + }, + { + "epoch": 1.2853738456955615, + "grad_norm": 0.048828125, + "learning_rate": 0.019278373547810545, + "loss": 0.8014, + "num_input_tokens_seen": 5005280, + "step": 8630 + }, + { + "epoch": 1.2861185582365207, + "grad_norm": 0.033203125, + "learning_rate": 0.019289544235924935, + "loss": 0.809, + "num_input_tokens_seen": 5008352, + "step": 8635 + }, + { + "epoch": 1.2868632707774799, + "grad_norm": 0.0517578125, + "learning_rate": 0.01930071492403932, + "loss": 0.8048, + "num_input_tokens_seen": 5011200, + "step": 8640 + }, + { + "epoch": 1.287607983318439, + "grad_norm": 0.04736328125, + "learning_rate": 0.01931188561215371, + "loss": 0.7993, + "num_input_tokens_seen": 5014016, + "step": 8645 + }, + { + "epoch": 1.2883526958593983, + "grad_norm": 0.04931640625, + "learning_rate": 0.019323056300268097, + "loss": 0.807, + "num_input_tokens_seen": 5016832, + "step": 8650 + }, + { + "epoch": 1.2890974084003575, + "grad_norm": 0.045166015625, + "learning_rate": 0.019334226988382486, + "loss": 0.7973, + "num_input_tokens_seen": 5019904, + "step": 8655 + }, + { + "epoch": 1.2898421209413167, + "grad_norm": 0.044189453125, + "learning_rate": 0.019345397676496873, + "loss": 0.8205, + "num_input_tokens_seen": 5022784, + "step": 8660 + }, + { + "epoch": 1.2905868334822759, + "grad_norm": 0.04833984375, + "learning_rate": 0.01935656836461126, + "loss": 0.7961, + "num_input_tokens_seen": 5025696, + "step": 8665 + }, + { + "epoch": 1.291331546023235, + "grad_norm": 0.039306640625, + "learning_rate": 0.019367739052725645, + "loss": 0.8103, + "num_input_tokens_seen": 5028704, + "step": 8670 + }, + { + "epoch": 1.2920762585641943, + "grad_norm": 0.041748046875, + "learning_rate": 0.019378909740840035, + "loss": 0.8216, + "num_input_tokens_seen": 5031648, + "step": 8675 + }, + { + "epoch": 1.2928209711051535, + "grad_norm": 0.041015625, + "learning_rate": 0.01939008042895442, + "loss": 0.7888, + "num_input_tokens_seen": 5034752, + "step": 8680 + }, + { + "epoch": 1.2935656836461127, + "grad_norm": 0.04931640625, + "learning_rate": 0.01940125111706881, + "loss": 0.8106, + "num_input_tokens_seen": 5037568, + "step": 8685 + }, + { + "epoch": 1.2943103961870719, + "grad_norm": 0.0308837890625, + "learning_rate": 0.019412421805183197, + "loss": 0.8073, + "num_input_tokens_seen": 5040000, + "step": 8690 + }, + { + "epoch": 1.295055108728031, + "grad_norm": 0.024169921875, + "learning_rate": 0.019423592493297587, + "loss": 0.7804, + "num_input_tokens_seen": 5042976, + "step": 8695 + }, + { + "epoch": 1.2957998212689903, + "grad_norm": 0.052001953125, + "learning_rate": 0.019434763181411973, + "loss": 0.8031, + "num_input_tokens_seen": 5045792, + "step": 8700 + }, + { + "epoch": 1.2965445338099495, + "grad_norm": 0.0301513671875, + "learning_rate": 0.019445933869526363, + "loss": 0.8071, + "num_input_tokens_seen": 5048288, + "step": 8705 + }, + { + "epoch": 1.2972892463509085, + "grad_norm": 0.035888671875, + "learning_rate": 0.01945710455764075, + "loss": 0.812, + "num_input_tokens_seen": 5051200, + "step": 8710 + }, + { + "epoch": 1.2980339588918677, + "grad_norm": 0.04833984375, + "learning_rate": 0.01946827524575514, + "loss": 0.8101, + "num_input_tokens_seen": 5054048, + "step": 8715 + }, + { + "epoch": 1.2987786714328269, + "grad_norm": 0.0302734375, + "learning_rate": 0.019479445933869525, + "loss": 0.7875, + "num_input_tokens_seen": 5056736, + "step": 8720 + }, + { + "epoch": 1.299523383973786, + "grad_norm": 0.0302734375, + "learning_rate": 0.019490616621983915, + "loss": 0.8112, + "num_input_tokens_seen": 5059936, + "step": 8725 + }, + { + "epoch": 1.3002680965147453, + "grad_norm": 0.03564453125, + "learning_rate": 0.0195017873100983, + "loss": 0.7896, + "num_input_tokens_seen": 5062688, + "step": 8730 + }, + { + "epoch": 1.3010128090557045, + "grad_norm": 0.03173828125, + "learning_rate": 0.01951295799821269, + "loss": 0.797, + "num_input_tokens_seen": 5065792, + "step": 8735 + }, + { + "epoch": 1.3017575215966637, + "grad_norm": 0.032958984375, + "learning_rate": 0.019524128686327077, + "loss": 0.791, + "num_input_tokens_seen": 5068768, + "step": 8740 + }, + { + "epoch": 1.302502234137623, + "grad_norm": 0.032958984375, + "learning_rate": 0.019535299374441466, + "loss": 0.8038, + "num_input_tokens_seen": 5071584, + "step": 8745 + }, + { + "epoch": 1.303246946678582, + "grad_norm": 0.042236328125, + "learning_rate": 0.019546470062555853, + "loss": 0.8173, + "num_input_tokens_seen": 5074400, + "step": 8750 + }, + { + "epoch": 1.3039916592195413, + "grad_norm": 0.033935546875, + "learning_rate": 0.01955764075067024, + "loss": 0.821, + "num_input_tokens_seen": 5077120, + "step": 8755 + }, + { + "epoch": 1.3047363717605005, + "grad_norm": 0.03466796875, + "learning_rate": 0.01956881143878463, + "loss": 0.7973, + "num_input_tokens_seen": 5079872, + "step": 8760 + }, + { + "epoch": 1.3054810843014597, + "grad_norm": 0.02978515625, + "learning_rate": 0.019579982126899015, + "loss": 0.8149, + "num_input_tokens_seen": 5082720, + "step": 8765 + }, + { + "epoch": 1.306225796842419, + "grad_norm": 0.07275390625, + "learning_rate": 0.019591152815013405, + "loss": 0.8352, + "num_input_tokens_seen": 5085504, + "step": 8770 + }, + { + "epoch": 1.3069705093833779, + "grad_norm": 0.029052734375, + "learning_rate": 0.01960232350312779, + "loss": 0.8177, + "num_input_tokens_seen": 5088000, + "step": 8775 + }, + { + "epoch": 1.307715221924337, + "grad_norm": 0.0279541015625, + "learning_rate": 0.01961349419124218, + "loss": 0.8196, + "num_input_tokens_seen": 5090752, + "step": 8780 + }, + { + "epoch": 1.3084599344652963, + "grad_norm": 0.0294189453125, + "learning_rate": 0.019624664879356567, + "loss": 0.8319, + "num_input_tokens_seen": 5093440, + "step": 8785 + }, + { + "epoch": 1.3092046470062555, + "grad_norm": 0.0419921875, + "learning_rate": 0.019635835567470956, + "loss": 0.8065, + "num_input_tokens_seen": 5096288, + "step": 8790 + }, + { + "epoch": 1.3099493595472147, + "grad_norm": 0.0169677734375, + "learning_rate": 0.019647006255585343, + "loss": 0.8145, + "num_input_tokens_seen": 5099072, + "step": 8795 + }, + { + "epoch": 1.310694072088174, + "grad_norm": 0.0194091796875, + "learning_rate": 0.019658176943699732, + "loss": 0.7977, + "num_input_tokens_seen": 5102080, + "step": 8800 + }, + { + "epoch": 1.311438784629133, + "grad_norm": 0.026611328125, + "learning_rate": 0.01966934763181412, + "loss": 0.7927, + "num_input_tokens_seen": 5105024, + "step": 8805 + }, + { + "epoch": 1.3121834971700923, + "grad_norm": 0.04248046875, + "learning_rate": 0.01968051831992851, + "loss": 0.7866, + "num_input_tokens_seen": 5107904, + "step": 8810 + }, + { + "epoch": 1.3129282097110515, + "grad_norm": 0.03369140625, + "learning_rate": 0.019691689008042895, + "loss": 0.8044, + "num_input_tokens_seen": 5110880, + "step": 8815 + }, + { + "epoch": 1.3136729222520107, + "grad_norm": 0.026123046875, + "learning_rate": 0.019702859696157284, + "loss": 0.8088, + "num_input_tokens_seen": 5113472, + "step": 8820 + }, + { + "epoch": 1.31441763479297, + "grad_norm": 0.0179443359375, + "learning_rate": 0.01971403038427167, + "loss": 0.7942, + "num_input_tokens_seen": 5116544, + "step": 8825 + }, + { + "epoch": 1.3151623473339291, + "grad_norm": 0.01953125, + "learning_rate": 0.01972520107238606, + "loss": 0.8029, + "num_input_tokens_seen": 5119136, + "step": 8830 + }, + { + "epoch": 1.3159070598748883, + "grad_norm": 0.044921875, + "learning_rate": 0.019736371760500443, + "loss": 0.796, + "num_input_tokens_seen": 5122016, + "step": 8835 + }, + { + "epoch": 1.3166517724158475, + "grad_norm": 0.0263671875, + "learning_rate": 0.019747542448614833, + "loss": 0.8188, + "num_input_tokens_seen": 5124960, + "step": 8840 + }, + { + "epoch": 1.3173964849568067, + "grad_norm": 0.0274658203125, + "learning_rate": 0.01975871313672922, + "loss": 0.795, + "num_input_tokens_seen": 5128000, + "step": 8845 + }, + { + "epoch": 1.318141197497766, + "grad_norm": 0.0439453125, + "learning_rate": 0.01976988382484361, + "loss": 0.7871, + "num_input_tokens_seen": 5131008, + "step": 8850 + }, + { + "epoch": 1.3188859100387251, + "grad_norm": 0.03466796875, + "learning_rate": 0.019781054512958, + "loss": 0.7987, + "num_input_tokens_seen": 5133824, + "step": 8855 + }, + { + "epoch": 1.3196306225796843, + "grad_norm": 0.034912109375, + "learning_rate": 0.019792225201072385, + "loss": 0.7866, + "num_input_tokens_seen": 5136608, + "step": 8860 + }, + { + "epoch": 1.3203753351206435, + "grad_norm": 0.025146484375, + "learning_rate": 0.019803395889186774, + "loss": 0.8155, + "num_input_tokens_seen": 5139360, + "step": 8865 + }, + { + "epoch": 1.3211200476616027, + "grad_norm": 0.0296630859375, + "learning_rate": 0.01981456657730116, + "loss": 0.7787, + "num_input_tokens_seen": 5142208, + "step": 8870 + }, + { + "epoch": 1.321864760202562, + "grad_norm": 0.046875, + "learning_rate": 0.01982573726541555, + "loss": 0.8186, + "num_input_tokens_seen": 5145280, + "step": 8875 + }, + { + "epoch": 1.322609472743521, + "grad_norm": 0.01806640625, + "learning_rate": 0.019836907953529936, + "loss": 0.7946, + "num_input_tokens_seen": 5148224, + "step": 8880 + }, + { + "epoch": 1.3233541852844801, + "grad_norm": 0.02978515625, + "learning_rate": 0.019848078641644326, + "loss": 0.8042, + "num_input_tokens_seen": 5150976, + "step": 8885 + }, + { + "epoch": 1.3240988978254393, + "grad_norm": 0.016357421875, + "learning_rate": 0.019859249329758712, + "loss": 0.8023, + "num_input_tokens_seen": 5153504, + "step": 8890 + }, + { + "epoch": 1.3248436103663985, + "grad_norm": 0.0311279296875, + "learning_rate": 0.019870420017873102, + "loss": 0.8286, + "num_input_tokens_seen": 5156512, + "step": 8895 + }, + { + "epoch": 1.3255883229073577, + "grad_norm": 0.04248046875, + "learning_rate": 0.01988159070598749, + "loss": 0.8347, + "num_input_tokens_seen": 5159328, + "step": 8900 + }, + { + "epoch": 1.326333035448317, + "grad_norm": 0.018310546875, + "learning_rate": 0.019892761394101878, + "loss": 0.8058, + "num_input_tokens_seen": 5162048, + "step": 8905 + }, + { + "epoch": 1.3270777479892761, + "grad_norm": 0.04345703125, + "learning_rate": 0.019903932082216264, + "loss": 0.7808, + "num_input_tokens_seen": 5165056, + "step": 8910 + }, + { + "epoch": 1.3278224605302353, + "grad_norm": 0.0308837890625, + "learning_rate": 0.019915102770330654, + "loss": 0.8163, + "num_input_tokens_seen": 5168000, + "step": 8915 + }, + { + "epoch": 1.3285671730711945, + "grad_norm": 0.062255859375, + "learning_rate": 0.01992627345844504, + "loss": 0.8305, + "num_input_tokens_seen": 5170784, + "step": 8920 + }, + { + "epoch": 1.3293118856121537, + "grad_norm": 0.038818359375, + "learning_rate": 0.019937444146559426, + "loss": 0.7843, + "num_input_tokens_seen": 5174016, + "step": 8925 + }, + { + "epoch": 1.330056598153113, + "grad_norm": 0.03076171875, + "learning_rate": 0.019948614834673813, + "loss": 0.8133, + "num_input_tokens_seen": 5177120, + "step": 8930 + }, + { + "epoch": 1.3308013106940721, + "grad_norm": 0.0419921875, + "learning_rate": 0.019959785522788202, + "loss": 0.7967, + "num_input_tokens_seen": 5179904, + "step": 8935 + }, + { + "epoch": 1.3315460232350314, + "grad_norm": 0.0498046875, + "learning_rate": 0.01997095621090259, + "loss": 0.7754, + "num_input_tokens_seen": 5182912, + "step": 8940 + }, + { + "epoch": 1.3322907357759903, + "grad_norm": 0.0162353515625, + "learning_rate": 0.01998212689901698, + "loss": 0.8011, + "num_input_tokens_seen": 5185888, + "step": 8945 + }, + { + "epoch": 1.3330354483169495, + "grad_norm": 0.038818359375, + "learning_rate": 0.019993297587131365, + "loss": 0.8013, + "num_input_tokens_seen": 5188864, + "step": 8950 + }, + { + "epoch": 1.3337801608579087, + "grad_norm": 0.037109375, + "learning_rate": 0.020004468275245754, + "loss": 0.7879, + "num_input_tokens_seen": 5191392, + "step": 8955 + }, + { + "epoch": 1.334524873398868, + "grad_norm": 0.0294189453125, + "learning_rate": 0.02001563896336014, + "loss": 0.7738, + "num_input_tokens_seen": 5193984, + "step": 8960 + }, + { + "epoch": 1.3352695859398271, + "grad_norm": 0.06494140625, + "learning_rate": 0.02002680965147453, + "loss": 0.799, + "num_input_tokens_seen": 5196960, + "step": 8965 + }, + { + "epoch": 1.3360142984807863, + "grad_norm": 0.03759765625, + "learning_rate": 0.02003798033958892, + "loss": 0.753, + "num_input_tokens_seen": 5199776, + "step": 8970 + }, + { + "epoch": 1.3367590110217455, + "grad_norm": 0.040283203125, + "learning_rate": 0.020049151027703306, + "loss": 0.8538, + "num_input_tokens_seen": 5203040, + "step": 8975 + }, + { + "epoch": 1.3375037235627047, + "grad_norm": 0.0205078125, + "learning_rate": 0.020060321715817696, + "loss": 0.7857, + "num_input_tokens_seen": 5205952, + "step": 8980 + }, + { + "epoch": 1.338248436103664, + "grad_norm": 0.029052734375, + "learning_rate": 0.020071492403932082, + "loss": 0.7913, + "num_input_tokens_seen": 5208768, + "step": 8985 + }, + { + "epoch": 1.3389931486446232, + "grad_norm": 0.045166015625, + "learning_rate": 0.020082663092046472, + "loss": 0.8059, + "num_input_tokens_seen": 5211584, + "step": 8990 + }, + { + "epoch": 1.3397378611855824, + "grad_norm": 0.046142578125, + "learning_rate": 0.020093833780160858, + "loss": 0.8582, + "num_input_tokens_seen": 5214752, + "step": 8995 + }, + { + "epoch": 1.3404825737265416, + "grad_norm": 0.0302734375, + "learning_rate": 0.020105004468275248, + "loss": 0.8108, + "num_input_tokens_seen": 5217536, + "step": 9000 + }, + { + "epoch": 1.3412272862675008, + "grad_norm": 0.040771484375, + "learning_rate": 0.020116175156389634, + "loss": 0.787, + "num_input_tokens_seen": 5220480, + "step": 9005 + }, + { + "epoch": 1.34197199880846, + "grad_norm": 0.035888671875, + "learning_rate": 0.02012734584450402, + "loss": 0.8246, + "num_input_tokens_seen": 5223520, + "step": 9010 + }, + { + "epoch": 1.3427167113494192, + "grad_norm": 0.0264892578125, + "learning_rate": 0.020138516532618406, + "loss": 0.7788, + "num_input_tokens_seen": 5227744, + "step": 9015 + }, + { + "epoch": 1.3434614238903784, + "grad_norm": 0.0306396484375, + "learning_rate": 0.020149687220732796, + "loss": 0.8091, + "num_input_tokens_seen": 5230400, + "step": 9020 + }, + { + "epoch": 1.3442061364313376, + "grad_norm": 0.0380859375, + "learning_rate": 0.020160857908847182, + "loss": 0.8237, + "num_input_tokens_seen": 5233312, + "step": 9025 + }, + { + "epoch": 1.3449508489722968, + "grad_norm": 0.03515625, + "learning_rate": 0.020172028596961572, + "loss": 0.8144, + "num_input_tokens_seen": 5236256, + "step": 9030 + }, + { + "epoch": 1.345695561513256, + "grad_norm": 0.029052734375, + "learning_rate": 0.020183199285075958, + "loss": 0.8097, + "num_input_tokens_seen": 5239296, + "step": 9035 + }, + { + "epoch": 1.3464402740542152, + "grad_norm": 0.031494140625, + "learning_rate": 0.020194369973190348, + "loss": 0.8111, + "num_input_tokens_seen": 5242560, + "step": 9040 + }, + { + "epoch": 1.3471849865951744, + "grad_norm": 0.06005859375, + "learning_rate": 0.020205540661304734, + "loss": 0.8068, + "num_input_tokens_seen": 5245408, + "step": 9045 + }, + { + "epoch": 1.3479296991361336, + "grad_norm": 0.0284423828125, + "learning_rate": 0.020216711349419124, + "loss": 0.804, + "num_input_tokens_seen": 5248448, + "step": 9050 + }, + { + "epoch": 1.3486744116770926, + "grad_norm": 0.024658203125, + "learning_rate": 0.02022788203753351, + "loss": 0.8207, + "num_input_tokens_seen": 5251616, + "step": 9055 + }, + { + "epoch": 1.3494191242180518, + "grad_norm": 0.03564453125, + "learning_rate": 0.0202390527256479, + "loss": 0.8011, + "num_input_tokens_seen": 5254752, + "step": 9060 + }, + { + "epoch": 1.350163836759011, + "grad_norm": 0.030517578125, + "learning_rate": 0.020250223413762286, + "loss": 0.8051, + "num_input_tokens_seen": 5257696, + "step": 9065 + }, + { + "epoch": 1.3509085492999702, + "grad_norm": 0.03955078125, + "learning_rate": 0.020261394101876676, + "loss": 0.7975, + "num_input_tokens_seen": 5260800, + "step": 9070 + }, + { + "epoch": 1.3516532618409294, + "grad_norm": 0.052001953125, + "learning_rate": 0.020272564789991062, + "loss": 0.8052, + "num_input_tokens_seen": 5263488, + "step": 9075 + }, + { + "epoch": 1.3523979743818886, + "grad_norm": 0.046875, + "learning_rate": 0.02028373547810545, + "loss": 0.801, + "num_input_tokens_seen": 5266272, + "step": 9080 + }, + { + "epoch": 1.3531426869228478, + "grad_norm": 0.03369140625, + "learning_rate": 0.02029490616621984, + "loss": 0.8049, + "num_input_tokens_seen": 5269248, + "step": 9085 + }, + { + "epoch": 1.353887399463807, + "grad_norm": 0.04931640625, + "learning_rate": 0.020306076854334228, + "loss": 0.8055, + "num_input_tokens_seen": 5272448, + "step": 9090 + }, + { + "epoch": 1.3546321120047662, + "grad_norm": 0.03564453125, + "learning_rate": 0.020317247542448614, + "loss": 0.8148, + "num_input_tokens_seen": 5275200, + "step": 9095 + }, + { + "epoch": 1.3553768245457254, + "grad_norm": 0.0299072265625, + "learning_rate": 0.020328418230563, + "loss": 0.7711, + "num_input_tokens_seen": 5277984, + "step": 9100 + }, + { + "epoch": 1.3561215370866846, + "grad_norm": 0.033447265625, + "learning_rate": 0.02033958891867739, + "loss": 0.7912, + "num_input_tokens_seen": 5281024, + "step": 9105 + }, + { + "epoch": 1.3568662496276438, + "grad_norm": 0.03662109375, + "learning_rate": 0.020350759606791776, + "loss": 0.8236, + "num_input_tokens_seen": 5284000, + "step": 9110 + }, + { + "epoch": 1.357610962168603, + "grad_norm": 0.0198974609375, + "learning_rate": 0.020361930294906166, + "loss": 0.8221, + "num_input_tokens_seen": 5286624, + "step": 9115 + }, + { + "epoch": 1.358355674709562, + "grad_norm": 0.045654296875, + "learning_rate": 0.020373100983020552, + "loss": 0.7948, + "num_input_tokens_seen": 5289408, + "step": 9120 + }, + { + "epoch": 1.3591003872505212, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02038427167113494, + "loss": 0.8165, + "num_input_tokens_seen": 5292480, + "step": 9125 + }, + { + "epoch": 1.3598450997914804, + "grad_norm": 0.03515625, + "learning_rate": 0.020395442359249328, + "loss": 0.8064, + "num_input_tokens_seen": 5295776, + "step": 9130 + }, + { + "epoch": 1.3605898123324396, + "grad_norm": 0.03173828125, + "learning_rate": 0.020406613047363718, + "loss": 0.8289, + "num_input_tokens_seen": 5298400, + "step": 9135 + }, + { + "epoch": 1.3613345248733988, + "grad_norm": 0.0289306640625, + "learning_rate": 0.020417783735478104, + "loss": 0.8162, + "num_input_tokens_seen": 5301152, + "step": 9140 + }, + { + "epoch": 1.362079237414358, + "grad_norm": 0.03369140625, + "learning_rate": 0.020428954423592494, + "loss": 0.8081, + "num_input_tokens_seen": 5304480, + "step": 9145 + }, + { + "epoch": 1.3628239499553172, + "grad_norm": 0.0308837890625, + "learning_rate": 0.02044012511170688, + "loss": 0.7994, + "num_input_tokens_seen": 5307520, + "step": 9150 + }, + { + "epoch": 1.3635686624962764, + "grad_norm": 0.0301513671875, + "learning_rate": 0.02045129579982127, + "loss": 0.8007, + "num_input_tokens_seen": 5310240, + "step": 9155 + }, + { + "epoch": 1.3643133750372356, + "grad_norm": 0.0341796875, + "learning_rate": 0.020462466487935656, + "loss": 0.8088, + "num_input_tokens_seen": 5312928, + "step": 9160 + }, + { + "epoch": 1.3650580875781948, + "grad_norm": 0.039306640625, + "learning_rate": 0.020473637176050045, + "loss": 0.8033, + "num_input_tokens_seen": 5316000, + "step": 9165 + }, + { + "epoch": 1.365802800119154, + "grad_norm": 0.0294189453125, + "learning_rate": 0.02048480786416443, + "loss": 0.803, + "num_input_tokens_seen": 5319232, + "step": 9170 + }, + { + "epoch": 1.3665475126601132, + "grad_norm": 0.0308837890625, + "learning_rate": 0.02049597855227882, + "loss": 0.8124, + "num_input_tokens_seen": 5322080, + "step": 9175 + }, + { + "epoch": 1.3672922252010724, + "grad_norm": 0.057861328125, + "learning_rate": 0.020507149240393208, + "loss": 0.8075, + "num_input_tokens_seen": 5324928, + "step": 9180 + }, + { + "epoch": 1.3680369377420316, + "grad_norm": 0.049560546875, + "learning_rate": 0.020518319928507594, + "loss": 0.8081, + "num_input_tokens_seen": 5327744, + "step": 9185 + }, + { + "epoch": 1.3687816502829908, + "grad_norm": 0.02880859375, + "learning_rate": 0.020529490616621984, + "loss": 0.7946, + "num_input_tokens_seen": 5330400, + "step": 9190 + }, + { + "epoch": 1.36952636282395, + "grad_norm": 0.0311279296875, + "learning_rate": 0.02054066130473637, + "loss": 0.799, + "num_input_tokens_seen": 5333248, + "step": 9195 + }, + { + "epoch": 1.3702710753649092, + "grad_norm": 0.0252685546875, + "learning_rate": 0.02055183199285076, + "loss": 0.7926, + "num_input_tokens_seen": 5336064, + "step": 9200 + }, + { + "epoch": 1.3710157879058684, + "grad_norm": 0.0260009765625, + "learning_rate": 0.020563002680965146, + "loss": 0.8, + "num_input_tokens_seen": 5339072, + "step": 9205 + }, + { + "epoch": 1.3717605004468276, + "grad_norm": 0.050537109375, + "learning_rate": 0.020574173369079535, + "loss": 0.8, + "num_input_tokens_seen": 5342016, + "step": 9210 + }, + { + "epoch": 1.3725052129877868, + "grad_norm": 0.04638671875, + "learning_rate": 0.02058534405719392, + "loss": 0.8038, + "num_input_tokens_seen": 5344896, + "step": 9215 + }, + { + "epoch": 1.373249925528746, + "grad_norm": 0.03662109375, + "learning_rate": 0.02059651474530831, + "loss": 0.8146, + "num_input_tokens_seen": 5347648, + "step": 9220 + }, + { + "epoch": 1.3739946380697052, + "grad_norm": 0.0279541015625, + "learning_rate": 0.020607685433422698, + "loss": 0.8031, + "num_input_tokens_seen": 5350624, + "step": 9225 + }, + { + "epoch": 1.3747393506106642, + "grad_norm": 0.016845703125, + "learning_rate": 0.020618856121537087, + "loss": 0.8098, + "num_input_tokens_seen": 5353728, + "step": 9230 + }, + { + "epoch": 1.3754840631516234, + "grad_norm": 0.030517578125, + "learning_rate": 0.020630026809651474, + "loss": 0.7693, + "num_input_tokens_seen": 5356640, + "step": 9235 + }, + { + "epoch": 1.3762287756925826, + "grad_norm": 0.017578125, + "learning_rate": 0.020641197497765863, + "loss": 0.8064, + "num_input_tokens_seen": 5359552, + "step": 9240 + }, + { + "epoch": 1.3769734882335418, + "grad_norm": 0.03125, + "learning_rate": 0.02065236818588025, + "loss": 0.7906, + "num_input_tokens_seen": 5362528, + "step": 9245 + }, + { + "epoch": 1.377718200774501, + "grad_norm": 0.0179443359375, + "learning_rate": 0.02066353887399464, + "loss": 0.8307, + "num_input_tokens_seen": 5365600, + "step": 9250 + }, + { + "epoch": 1.3784629133154602, + "grad_norm": 0.02734375, + "learning_rate": 0.020674709562109025, + "loss": 0.8043, + "num_input_tokens_seen": 5368288, + "step": 9255 + }, + { + "epoch": 1.3792076258564194, + "grad_norm": 0.04931640625, + "learning_rate": 0.020685880250223415, + "loss": 0.8125, + "num_input_tokens_seen": 5371296, + "step": 9260 + }, + { + "epoch": 1.3799523383973786, + "grad_norm": 0.028564453125, + "learning_rate": 0.0206970509383378, + "loss": 0.7883, + "num_input_tokens_seen": 5374240, + "step": 9265 + }, + { + "epoch": 1.3806970509383378, + "grad_norm": 0.1435546875, + "learning_rate": 0.020708221626452188, + "loss": 0.8076, + "num_input_tokens_seen": 5377312, + "step": 9270 + }, + { + "epoch": 1.381441763479297, + "grad_norm": 0.06201171875, + "learning_rate": 0.020719392314566574, + "loss": 0.8076, + "num_input_tokens_seen": 5380064, + "step": 9275 + }, + { + "epoch": 1.3821864760202562, + "grad_norm": 0.034912109375, + "learning_rate": 0.020730563002680964, + "loss": 0.8206, + "num_input_tokens_seen": 5383072, + "step": 9280 + }, + { + "epoch": 1.3829311885612154, + "grad_norm": 0.031494140625, + "learning_rate": 0.02074173369079535, + "loss": 0.8166, + "num_input_tokens_seen": 5385792, + "step": 9285 + }, + { + "epoch": 1.3836759011021746, + "grad_norm": 0.03369140625, + "learning_rate": 0.02075290437890974, + "loss": 0.8103, + "num_input_tokens_seen": 5388640, + "step": 9290 + }, + { + "epoch": 1.3844206136431336, + "grad_norm": 0.0361328125, + "learning_rate": 0.020764075067024126, + "loss": 0.7924, + "num_input_tokens_seen": 5391360, + "step": 9295 + }, + { + "epoch": 1.3851653261840928, + "grad_norm": 0.03173828125, + "learning_rate": 0.020775245755138515, + "loss": 0.7954, + "num_input_tokens_seen": 5394240, + "step": 9300 + }, + { + "epoch": 1.385910038725052, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0207864164432529, + "loss": 0.8171, + "num_input_tokens_seen": 5396960, + "step": 9305 + }, + { + "epoch": 1.3866547512660112, + "grad_norm": 0.040771484375, + "learning_rate": 0.02079758713136729, + "loss": 0.7909, + "num_input_tokens_seen": 5399616, + "step": 9310 + }, + { + "epoch": 1.3873994638069704, + "grad_norm": 0.046630859375, + "learning_rate": 0.02080875781948168, + "loss": 0.804, + "num_input_tokens_seen": 5402144, + "step": 9315 + }, + { + "epoch": 1.3881441763479296, + "grad_norm": 0.03369140625, + "learning_rate": 0.020819928507596067, + "loss": 0.7887, + "num_input_tokens_seen": 5404992, + "step": 9320 + }, + { + "epoch": 1.3888888888888888, + "grad_norm": 0.0390625, + "learning_rate": 0.020831099195710457, + "loss": 0.8093, + "num_input_tokens_seen": 5407808, + "step": 9325 + }, + { + "epoch": 1.389633601429848, + "grad_norm": 0.04052734375, + "learning_rate": 0.020842269883824843, + "loss": 0.8135, + "num_input_tokens_seen": 5410752, + "step": 9330 + }, + { + "epoch": 1.3903783139708072, + "grad_norm": 0.035888671875, + "learning_rate": 0.020853440571939233, + "loss": 0.8262, + "num_input_tokens_seen": 5413504, + "step": 9335 + }, + { + "epoch": 1.3911230265117664, + "grad_norm": 0.041015625, + "learning_rate": 0.02086461126005362, + "loss": 0.8177, + "num_input_tokens_seen": 5416672, + "step": 9340 + }, + { + "epoch": 1.3918677390527256, + "grad_norm": 0.036376953125, + "learning_rate": 0.02087578194816801, + "loss": 0.8126, + "num_input_tokens_seen": 5419840, + "step": 9345 + }, + { + "epoch": 1.3926124515936849, + "grad_norm": 0.033935546875, + "learning_rate": 0.020886952636282395, + "loss": 0.7995, + "num_input_tokens_seen": 5422624, + "step": 9350 + }, + { + "epoch": 1.393357164134644, + "grad_norm": 0.03515625, + "learning_rate": 0.02089812332439678, + "loss": 0.8233, + "num_input_tokens_seen": 5425568, + "step": 9355 + }, + { + "epoch": 1.3941018766756033, + "grad_norm": 0.05224609375, + "learning_rate": 0.020909294012511168, + "loss": 0.7996, + "num_input_tokens_seen": 5428416, + "step": 9360 + }, + { + "epoch": 1.3948465892165625, + "grad_norm": 0.035400390625, + "learning_rate": 0.020920464700625557, + "loss": 0.8144, + "num_input_tokens_seen": 5431264, + "step": 9365 + }, + { + "epoch": 1.3955913017575217, + "grad_norm": 0.03369140625, + "learning_rate": 0.020931635388739944, + "loss": 0.8034, + "num_input_tokens_seen": 5434048, + "step": 9370 + }, + { + "epoch": 1.3963360142984809, + "grad_norm": 0.0208740234375, + "learning_rate": 0.020942806076854333, + "loss": 0.7984, + "num_input_tokens_seen": 5437056, + "step": 9375 + }, + { + "epoch": 1.39708072683944, + "grad_norm": 0.04296875, + "learning_rate": 0.02095397676496872, + "loss": 0.8214, + "num_input_tokens_seen": 5439840, + "step": 9380 + }, + { + "epoch": 1.3978254393803993, + "grad_norm": 0.0400390625, + "learning_rate": 0.02096514745308311, + "loss": 0.7973, + "num_input_tokens_seen": 5442816, + "step": 9385 + }, + { + "epoch": 1.3985701519213585, + "grad_norm": 0.029296875, + "learning_rate": 0.020976318141197495, + "loss": 0.7971, + "num_input_tokens_seen": 5445664, + "step": 9390 + }, + { + "epoch": 1.3993148644623177, + "grad_norm": 0.0419921875, + "learning_rate": 0.020987488829311885, + "loss": 0.7844, + "num_input_tokens_seen": 5448704, + "step": 9395 + }, + { + "epoch": 1.4000595770032767, + "grad_norm": 0.03466796875, + "learning_rate": 0.02099865951742627, + "loss": 0.776, + "num_input_tokens_seen": 5451648, + "step": 9400 + }, + { + "epoch": 1.4008042895442359, + "grad_norm": 0.033447265625, + "learning_rate": 0.02100983020554066, + "loss": 0.779, + "num_input_tokens_seen": 5454624, + "step": 9405 + }, + { + "epoch": 1.401549002085195, + "grad_norm": 0.0294189453125, + "learning_rate": 0.021021000893655047, + "loss": 0.8175, + "num_input_tokens_seen": 5457248, + "step": 9410 + }, + { + "epoch": 1.4022937146261543, + "grad_norm": 0.03271484375, + "learning_rate": 0.021032171581769437, + "loss": 0.7906, + "num_input_tokens_seen": 5460032, + "step": 9415 + }, + { + "epoch": 1.4030384271671135, + "grad_norm": 0.046875, + "learning_rate": 0.021043342269883823, + "loss": 0.8023, + "num_input_tokens_seen": 5462816, + "step": 9420 + }, + { + "epoch": 1.4037831397080727, + "grad_norm": 0.0703125, + "learning_rate": 0.021054512957998213, + "loss": 0.8601, + "num_input_tokens_seen": 5465632, + "step": 9425 + }, + { + "epoch": 1.4045278522490319, + "grad_norm": 0.0751953125, + "learning_rate": 0.021065683646112603, + "loss": 0.7979, + "num_input_tokens_seen": 5468544, + "step": 9430 + }, + { + "epoch": 1.405272564789991, + "grad_norm": 0.197265625, + "learning_rate": 0.02107685433422699, + "loss": 0.7797, + "num_input_tokens_seen": 5471360, + "step": 9435 + }, + { + "epoch": 1.4060172773309503, + "grad_norm": 0.06640625, + "learning_rate": 0.02108802502234138, + "loss": 0.8382, + "num_input_tokens_seen": 5474080, + "step": 9440 + }, + { + "epoch": 1.4067619898719095, + "grad_norm": 0.359375, + "learning_rate": 0.02109919571045576, + "loss": 0.8207, + "num_input_tokens_seen": 5477152, + "step": 9445 + }, + { + "epoch": 1.4075067024128687, + "grad_norm": 0.625, + "learning_rate": 0.02111036639857015, + "loss": 0.9157, + "num_input_tokens_seen": 5480064, + "step": 9450 + }, + { + "epoch": 1.4082514149538279, + "grad_norm": 9.5625, + "learning_rate": 0.021121537086684537, + "loss": 0.8139, + "num_input_tokens_seen": 5483136, + "step": 9455 + }, + { + "epoch": 1.408996127494787, + "grad_norm": 0.038818359375, + "learning_rate": 0.021132707774798927, + "loss": 0.8108, + "num_input_tokens_seen": 5486368, + "step": 9460 + }, + { + "epoch": 1.409740840035746, + "grad_norm": 0.0771484375, + "learning_rate": 0.021143878462913313, + "loss": 0.8154, + "num_input_tokens_seen": 5489024, + "step": 9465 + }, + { + "epoch": 1.4104855525767053, + "grad_norm": 0.0791015625, + "learning_rate": 0.021155049151027703, + "loss": 0.8034, + "num_input_tokens_seen": 5491616, + "step": 9470 + }, + { + "epoch": 1.4112302651176645, + "grad_norm": 0.0771484375, + "learning_rate": 0.02116621983914209, + "loss": 0.7808, + "num_input_tokens_seen": 5494464, + "step": 9475 + }, + { + "epoch": 1.4119749776586237, + "grad_norm": 0.09423828125, + "learning_rate": 0.02117739052725648, + "loss": 0.7728, + "num_input_tokens_seen": 5497376, + "step": 9480 + }, + { + "epoch": 1.4127196901995829, + "grad_norm": 0.08251953125, + "learning_rate": 0.021188561215370865, + "loss": 0.7847, + "num_input_tokens_seen": 5500128, + "step": 9485 + }, + { + "epoch": 1.413464402740542, + "grad_norm": 0.6484375, + "learning_rate": 0.021199731903485255, + "loss": 0.8152, + "num_input_tokens_seen": 5503136, + "step": 9490 + }, + { + "epoch": 1.4142091152815013, + "grad_norm": 0.053955078125, + "learning_rate": 0.02121090259159964, + "loss": 0.8548, + "num_input_tokens_seen": 5506048, + "step": 9495 + }, + { + "epoch": 1.4149538278224605, + "grad_norm": 0.05517578125, + "learning_rate": 0.02122207327971403, + "loss": 0.789, + "num_input_tokens_seen": 5508736, + "step": 9500 + }, + { + "epoch": 1.4156985403634197, + "grad_norm": 0.05712890625, + "learning_rate": 0.021233243967828417, + "loss": 0.8385, + "num_input_tokens_seen": 5511776, + "step": 9505 + }, + { + "epoch": 1.416443252904379, + "grad_norm": 0.11328125, + "learning_rate": 0.021244414655942807, + "loss": 0.7992, + "num_input_tokens_seen": 5514848, + "step": 9510 + }, + { + "epoch": 1.417187965445338, + "grad_norm": 0.1005859375, + "learning_rate": 0.021255585344057193, + "loss": 1.1079, + "num_input_tokens_seen": 5517600, + "step": 9515 + }, + { + "epoch": 1.4179326779862973, + "grad_norm": 0.08154296875, + "learning_rate": 0.021266756032171583, + "loss": 0.805, + "num_input_tokens_seen": 5520384, + "step": 9520 + }, + { + "epoch": 1.4186773905272565, + "grad_norm": 0.0517578125, + "learning_rate": 0.02127792672028597, + "loss": 0.8075, + "num_input_tokens_seen": 5523392, + "step": 9525 + }, + { + "epoch": 1.4194221030682157, + "grad_norm": 0.05322265625, + "learning_rate": 0.021289097408400355, + "loss": 0.8202, + "num_input_tokens_seen": 5526496, + "step": 9530 + }, + { + "epoch": 1.420166815609175, + "grad_norm": 0.10302734375, + "learning_rate": 0.021300268096514745, + "loss": 0.7948, + "num_input_tokens_seen": 5529600, + "step": 9535 + }, + { + "epoch": 1.420911528150134, + "grad_norm": 0.142578125, + "learning_rate": 0.02131143878462913, + "loss": 0.8255, + "num_input_tokens_seen": 5532352, + "step": 9540 + }, + { + "epoch": 1.4216562406910933, + "grad_norm": 0.039794921875, + "learning_rate": 0.02132260947274352, + "loss": 0.8182, + "num_input_tokens_seen": 5535456, + "step": 9545 + }, + { + "epoch": 1.4224009532320525, + "grad_norm": 0.046142578125, + "learning_rate": 0.021333780160857907, + "loss": 0.8046, + "num_input_tokens_seen": 5538464, + "step": 9550 + }, + { + "epoch": 1.4231456657730117, + "grad_norm": 0.02197265625, + "learning_rate": 0.021344950848972297, + "loss": 0.8067, + "num_input_tokens_seen": 5541344, + "step": 9555 + }, + { + "epoch": 1.423890378313971, + "grad_norm": 0.06591796875, + "learning_rate": 0.021356121537086683, + "loss": 0.8266, + "num_input_tokens_seen": 5544320, + "step": 9560 + }, + { + "epoch": 1.4246350908549301, + "grad_norm": 0.0458984375, + "learning_rate": 0.021367292225201073, + "loss": 0.7949, + "num_input_tokens_seen": 5547008, + "step": 9565 + }, + { + "epoch": 1.4253798033958893, + "grad_norm": 0.029052734375, + "learning_rate": 0.02137846291331546, + "loss": 0.8122, + "num_input_tokens_seen": 5549920, + "step": 9570 + }, + { + "epoch": 1.4261245159368483, + "grad_norm": 0.07861328125, + "learning_rate": 0.02138963360142985, + "loss": 0.7807, + "num_input_tokens_seen": 5552960, + "step": 9575 + }, + { + "epoch": 1.4268692284778075, + "grad_norm": 0.02490234375, + "learning_rate": 0.021400804289544235, + "loss": 0.794, + "num_input_tokens_seen": 5555840, + "step": 9580 + }, + { + "epoch": 1.4276139410187667, + "grad_norm": 0.0673828125, + "learning_rate": 0.021411974977658624, + "loss": 0.7859, + "num_input_tokens_seen": 5558912, + "step": 9585 + }, + { + "epoch": 1.428358653559726, + "grad_norm": 0.05126953125, + "learning_rate": 0.02142314566577301, + "loss": 0.7643, + "num_input_tokens_seen": 5561792, + "step": 9590 + }, + { + "epoch": 1.4291033661006851, + "grad_norm": 0.061279296875, + "learning_rate": 0.0214343163538874, + "loss": 0.8261, + "num_input_tokens_seen": 5564480, + "step": 9595 + }, + { + "epoch": 1.4298480786416443, + "grad_norm": 0.07861328125, + "learning_rate": 0.021445487042001787, + "loss": 0.8146, + "num_input_tokens_seen": 5567264, + "step": 9600 + }, + { + "epoch": 1.4305927911826035, + "grad_norm": 0.03125, + "learning_rate": 0.021456657730116176, + "loss": 0.8052, + "num_input_tokens_seen": 5569984, + "step": 9605 + }, + { + "epoch": 1.4313375037235627, + "grad_norm": 0.04736328125, + "learning_rate": 0.021467828418230563, + "loss": 0.8312, + "num_input_tokens_seen": 5573472, + "step": 9610 + }, + { + "epoch": 1.432082216264522, + "grad_norm": 0.046875, + "learning_rate": 0.02147899910634495, + "loss": 0.8076, + "num_input_tokens_seen": 5576416, + "step": 9615 + }, + { + "epoch": 1.4328269288054811, + "grad_norm": 0.024658203125, + "learning_rate": 0.021490169794459335, + "loss": 0.7883, + "num_input_tokens_seen": 5579584, + "step": 9620 + }, + { + "epoch": 1.4335716413464403, + "grad_norm": 0.0693359375, + "learning_rate": 0.021501340482573725, + "loss": 0.8022, + "num_input_tokens_seen": 5582464, + "step": 9625 + }, + { + "epoch": 1.4343163538873995, + "grad_norm": 0.0498046875, + "learning_rate": 0.02151251117068811, + "loss": 0.7902, + "num_input_tokens_seen": 5585056, + "step": 9630 + }, + { + "epoch": 1.4350610664283587, + "grad_norm": 0.05224609375, + "learning_rate": 0.0215236818588025, + "loss": 0.8092, + "num_input_tokens_seen": 5587968, + "step": 9635 + }, + { + "epoch": 1.4358057789693177, + "grad_norm": 0.031982421875, + "learning_rate": 0.021534852546916887, + "loss": 0.8146, + "num_input_tokens_seen": 5591136, + "step": 9640 + }, + { + "epoch": 1.436550491510277, + "grad_norm": 0.02734375, + "learning_rate": 0.021546023235031277, + "loss": 0.7889, + "num_input_tokens_seen": 5594016, + "step": 9645 + }, + { + "epoch": 1.4372952040512361, + "grad_norm": 0.05322265625, + "learning_rate": 0.021557193923145666, + "loss": 0.7713, + "num_input_tokens_seen": 5596736, + "step": 9650 + }, + { + "epoch": 1.4380399165921953, + "grad_norm": 0.10888671875, + "learning_rate": 0.021568364611260053, + "loss": 0.8128, + "num_input_tokens_seen": 5599552, + "step": 9655 + }, + { + "epoch": 1.4387846291331545, + "grad_norm": 0.052490234375, + "learning_rate": 0.021579535299374442, + "loss": 0.7509, + "num_input_tokens_seen": 5602752, + "step": 9660 + }, + { + "epoch": 1.4395293416741137, + "grad_norm": 0.06640625, + "learning_rate": 0.02159070598748883, + "loss": 0.816, + "num_input_tokens_seen": 5605568, + "step": 9665 + }, + { + "epoch": 1.440274054215073, + "grad_norm": 0.038330078125, + "learning_rate": 0.021601876675603218, + "loss": 0.7872, + "num_input_tokens_seen": 5608448, + "step": 9670 + }, + { + "epoch": 1.4410187667560321, + "grad_norm": 0.0439453125, + "learning_rate": 0.021613047363717604, + "loss": 0.8099, + "num_input_tokens_seen": 5611584, + "step": 9675 + }, + { + "epoch": 1.4417634792969913, + "grad_norm": 0.06787109375, + "learning_rate": 0.021624218051831994, + "loss": 0.7527, + "num_input_tokens_seen": 5614080, + "step": 9680 + }, + { + "epoch": 1.4425081918379505, + "grad_norm": 0.0810546875, + "learning_rate": 0.02163538873994638, + "loss": 0.8037, + "num_input_tokens_seen": 5617088, + "step": 9685 + }, + { + "epoch": 1.4432529043789097, + "grad_norm": 0.0458984375, + "learning_rate": 0.02164655942806077, + "loss": 0.7943, + "num_input_tokens_seen": 5619872, + "step": 9690 + }, + { + "epoch": 1.443997616919869, + "grad_norm": 0.033203125, + "learning_rate": 0.021657730116175156, + "loss": 0.7797, + "num_input_tokens_seen": 5622496, + "step": 9695 + }, + { + "epoch": 1.4447423294608281, + "grad_norm": 0.048095703125, + "learning_rate": 0.021668900804289543, + "loss": 0.8429, + "num_input_tokens_seen": 5625408, + "step": 9700 + }, + { + "epoch": 1.4454870420017873, + "grad_norm": 0.053955078125, + "learning_rate": 0.02168007149240393, + "loss": 0.8188, + "num_input_tokens_seen": 5628064, + "step": 9705 + }, + { + "epoch": 1.4462317545427466, + "grad_norm": 0.036376953125, + "learning_rate": 0.02169124218051832, + "loss": 0.7953, + "num_input_tokens_seen": 5631104, + "step": 9710 + }, + { + "epoch": 1.4469764670837058, + "grad_norm": 0.0419921875, + "learning_rate": 0.021702412868632705, + "loss": 0.8216, + "num_input_tokens_seen": 5633920, + "step": 9715 + }, + { + "epoch": 1.447721179624665, + "grad_norm": 0.0242919921875, + "learning_rate": 0.021713583556747094, + "loss": 0.7726, + "num_input_tokens_seen": 5636672, + "step": 9720 + }, + { + "epoch": 1.4484658921656242, + "grad_norm": 0.037109375, + "learning_rate": 0.02172475424486148, + "loss": 0.7927, + "num_input_tokens_seen": 5639616, + "step": 9725 + }, + { + "epoch": 1.4492106047065834, + "grad_norm": 0.041259765625, + "learning_rate": 0.02173592493297587, + "loss": 0.836, + "num_input_tokens_seen": 5642880, + "step": 9730 + }, + { + "epoch": 1.4499553172475426, + "grad_norm": 0.035888671875, + "learning_rate": 0.021747095621090257, + "loss": 0.806, + "num_input_tokens_seen": 5645728, + "step": 9735 + }, + { + "epoch": 1.4507000297885018, + "grad_norm": 0.01708984375, + "learning_rate": 0.021758266309204646, + "loss": 0.8072, + "num_input_tokens_seen": 5648928, + "step": 9740 + }, + { + "epoch": 1.4514447423294607, + "grad_norm": 0.0159912109375, + "learning_rate": 0.021769436997319033, + "loss": 0.803, + "num_input_tokens_seen": 5651552, + "step": 9745 + }, + { + "epoch": 1.45218945487042, + "grad_norm": 0.038330078125, + "learning_rate": 0.021780607685433422, + "loss": 0.7837, + "num_input_tokens_seen": 5654496, + "step": 9750 + }, + { + "epoch": 1.4529341674113792, + "grad_norm": 0.0245361328125, + "learning_rate": 0.02179177837354781, + "loss": 0.8001, + "num_input_tokens_seen": 5657280, + "step": 9755 + }, + { + "epoch": 1.4536788799523384, + "grad_norm": 0.038818359375, + "learning_rate": 0.021802949061662198, + "loss": 0.8069, + "num_input_tokens_seen": 5659872, + "step": 9760 + }, + { + "epoch": 1.4544235924932976, + "grad_norm": 0.060546875, + "learning_rate": 0.021814119749776588, + "loss": 0.817, + "num_input_tokens_seen": 5662752, + "step": 9765 + }, + { + "epoch": 1.4551683050342568, + "grad_norm": 0.027587890625, + "learning_rate": 0.021825290437890974, + "loss": 0.7762, + "num_input_tokens_seen": 5665568, + "step": 9770 + }, + { + "epoch": 1.455913017575216, + "grad_norm": 0.033203125, + "learning_rate": 0.021836461126005364, + "loss": 0.8044, + "num_input_tokens_seen": 5668448, + "step": 9775 + }, + { + "epoch": 1.4566577301161752, + "grad_norm": 0.0419921875, + "learning_rate": 0.02184763181411975, + "loss": 0.8387, + "num_input_tokens_seen": 5671200, + "step": 9780 + }, + { + "epoch": 1.4574024426571344, + "grad_norm": 0.03271484375, + "learning_rate": 0.02185880250223414, + "loss": 0.7931, + "num_input_tokens_seen": 5674144, + "step": 9785 + }, + { + "epoch": 1.4581471551980936, + "grad_norm": 0.07373046875, + "learning_rate": 0.021869973190348523, + "loss": 0.8012, + "num_input_tokens_seen": 5677152, + "step": 9790 + }, + { + "epoch": 1.4588918677390528, + "grad_norm": 0.032470703125, + "learning_rate": 0.021881143878462912, + "loss": 0.809, + "num_input_tokens_seen": 5680448, + "step": 9795 + }, + { + "epoch": 1.459636580280012, + "grad_norm": 0.039306640625, + "learning_rate": 0.0218923145665773, + "loss": 0.7979, + "num_input_tokens_seen": 5683424, + "step": 9800 + }, + { + "epoch": 1.4603812928209712, + "grad_norm": 0.03515625, + "learning_rate": 0.021903485254691688, + "loss": 0.7865, + "num_input_tokens_seen": 5686336, + "step": 9805 + }, + { + "epoch": 1.4611260053619302, + "grad_norm": 0.037109375, + "learning_rate": 0.021914655942806074, + "loss": 0.8182, + "num_input_tokens_seen": 5689120, + "step": 9810 + }, + { + "epoch": 1.4618707179028894, + "grad_norm": 0.042236328125, + "learning_rate": 0.021925826630920464, + "loss": 0.8129, + "num_input_tokens_seen": 5691648, + "step": 9815 + }, + { + "epoch": 1.4626154304438486, + "grad_norm": 0.041259765625, + "learning_rate": 0.02193699731903485, + "loss": 0.7991, + "num_input_tokens_seen": 5694368, + "step": 9820 + }, + { + "epoch": 1.4633601429848078, + "grad_norm": 0.035400390625, + "learning_rate": 0.02194816800714924, + "loss": 0.8092, + "num_input_tokens_seen": 5697280, + "step": 9825 + }, + { + "epoch": 1.464104855525767, + "grad_norm": 0.0299072265625, + "learning_rate": 0.021959338695263626, + "loss": 0.7967, + "num_input_tokens_seen": 5700448, + "step": 9830 + }, + { + "epoch": 1.4648495680667262, + "grad_norm": 0.04443359375, + "learning_rate": 0.021970509383378016, + "loss": 0.8058, + "num_input_tokens_seen": 5703584, + "step": 9835 + }, + { + "epoch": 1.4655942806076854, + "grad_norm": 0.04833984375, + "learning_rate": 0.021981680071492402, + "loss": 0.8067, + "num_input_tokens_seen": 5706400, + "step": 9840 + }, + { + "epoch": 1.4663389931486446, + "grad_norm": 0.052734375, + "learning_rate": 0.021992850759606792, + "loss": 0.8097, + "num_input_tokens_seen": 5709184, + "step": 9845 + }, + { + "epoch": 1.4670837056896038, + "grad_norm": 0.0306396484375, + "learning_rate": 0.022004021447721178, + "loss": 0.7842, + "num_input_tokens_seen": 5711712, + "step": 9850 + }, + { + "epoch": 1.467828418230563, + "grad_norm": 0.034423828125, + "learning_rate": 0.022015192135835568, + "loss": 0.8094, + "num_input_tokens_seen": 5714464, + "step": 9855 + }, + { + "epoch": 1.4685731307715222, + "grad_norm": 0.0556640625, + "learning_rate": 0.022026362823949954, + "loss": 0.8076, + "num_input_tokens_seen": 5717216, + "step": 9860 + }, + { + "epoch": 1.4693178433124814, + "grad_norm": 0.04931640625, + "learning_rate": 0.022037533512064344, + "loss": 0.7929, + "num_input_tokens_seen": 5720608, + "step": 9865 + }, + { + "epoch": 1.4700625558534406, + "grad_norm": 0.07373046875, + "learning_rate": 0.02204870420017873, + "loss": 0.8132, + "num_input_tokens_seen": 5723488, + "step": 9870 + }, + { + "epoch": 1.4708072683943998, + "grad_norm": 0.0341796875, + "learning_rate": 0.022059874888293116, + "loss": 0.8231, + "num_input_tokens_seen": 5726624, + "step": 9875 + }, + { + "epoch": 1.471551980935359, + "grad_norm": 0.03955078125, + "learning_rate": 0.022071045576407506, + "loss": 0.809, + "num_input_tokens_seen": 5729408, + "step": 9880 + }, + { + "epoch": 1.4722966934763182, + "grad_norm": 0.03369140625, + "learning_rate": 0.022082216264521892, + "loss": 0.7895, + "num_input_tokens_seen": 5732352, + "step": 9885 + }, + { + "epoch": 1.4730414060172774, + "grad_norm": 0.042236328125, + "learning_rate": 0.022093386952636282, + "loss": 0.8233, + "num_input_tokens_seen": 5735072, + "step": 9890 + }, + { + "epoch": 1.4737861185582366, + "grad_norm": 0.034423828125, + "learning_rate": 0.022104557640750668, + "loss": 0.7971, + "num_input_tokens_seen": 5738272, + "step": 9895 + }, + { + "epoch": 1.4745308310991958, + "grad_norm": 0.0185546875, + "learning_rate": 0.022115728328865058, + "loss": 0.7795, + "num_input_tokens_seen": 5741024, + "step": 9900 + }, + { + "epoch": 1.475275543640155, + "grad_norm": 0.04345703125, + "learning_rate": 0.022126899016979444, + "loss": 0.8002, + "num_input_tokens_seen": 5743968, + "step": 9905 + }, + { + "epoch": 1.4760202561811142, + "grad_norm": 0.0390625, + "learning_rate": 0.022138069705093834, + "loss": 0.8096, + "num_input_tokens_seen": 5746816, + "step": 9910 + }, + { + "epoch": 1.4767649687220734, + "grad_norm": 0.09033203125, + "learning_rate": 0.02214924039320822, + "loss": 0.8151, + "num_input_tokens_seen": 5749312, + "step": 9915 + }, + { + "epoch": 1.4775096812630324, + "grad_norm": 0.03369140625, + "learning_rate": 0.02216041108132261, + "loss": 0.79, + "num_input_tokens_seen": 5751776, + "step": 9920 + }, + { + "epoch": 1.4782543938039916, + "grad_norm": 0.04345703125, + "learning_rate": 0.022171581769436996, + "loss": 0.8015, + "num_input_tokens_seen": 5754432, + "step": 9925 + }, + { + "epoch": 1.4789991063449508, + "grad_norm": 0.051025390625, + "learning_rate": 0.022182752457551386, + "loss": 0.7875, + "num_input_tokens_seen": 5757152, + "step": 9930 + }, + { + "epoch": 1.47974381888591, + "grad_norm": 0.05517578125, + "learning_rate": 0.022193923145665772, + "loss": 0.8074, + "num_input_tokens_seen": 5760096, + "step": 9935 + }, + { + "epoch": 1.4804885314268692, + "grad_norm": 0.032470703125, + "learning_rate": 0.02220509383378016, + "loss": 0.7856, + "num_input_tokens_seen": 5762880, + "step": 9940 + }, + { + "epoch": 1.4812332439678284, + "grad_norm": 0.033203125, + "learning_rate": 0.022216264521894548, + "loss": 0.7793, + "num_input_tokens_seen": 5765696, + "step": 9945 + }, + { + "epoch": 1.4819779565087876, + "grad_norm": 0.04736328125, + "learning_rate": 0.022227435210008938, + "loss": 0.8054, + "num_input_tokens_seen": 5768448, + "step": 9950 + }, + { + "epoch": 1.4827226690497468, + "grad_norm": 0.035888671875, + "learning_rate": 0.022238605898123324, + "loss": 0.8354, + "num_input_tokens_seen": 5771520, + "step": 9955 + }, + { + "epoch": 1.483467381590706, + "grad_norm": 0.056640625, + "learning_rate": 0.02224977658623771, + "loss": 0.8282, + "num_input_tokens_seen": 5774400, + "step": 9960 + }, + { + "epoch": 1.4842120941316652, + "grad_norm": 0.037109375, + "learning_rate": 0.022260947274352096, + "loss": 0.7934, + "num_input_tokens_seen": 5777216, + "step": 9965 + }, + { + "epoch": 1.4849568066726244, + "grad_norm": 0.044921875, + "learning_rate": 0.022272117962466486, + "loss": 0.7901, + "num_input_tokens_seen": 5779744, + "step": 9970 + }, + { + "epoch": 1.4857015192135836, + "grad_norm": 0.03564453125, + "learning_rate": 0.022283288650580872, + "loss": 0.8087, + "num_input_tokens_seen": 5782720, + "step": 9975 + }, + { + "epoch": 1.4864462317545428, + "grad_norm": 0.0322265625, + "learning_rate": 0.022294459338695262, + "loss": 0.8176, + "num_input_tokens_seen": 5785344, + "step": 9980 + }, + { + "epoch": 1.4871909442955018, + "grad_norm": 0.0289306640625, + "learning_rate": 0.02230563002680965, + "loss": 0.7723, + "num_input_tokens_seen": 5787936, + "step": 9985 + }, + { + "epoch": 1.487935656836461, + "grad_norm": 0.03466796875, + "learning_rate": 0.022316800714924038, + "loss": 0.8339, + "num_input_tokens_seen": 5790880, + "step": 9990 + }, + { + "epoch": 1.4886803693774202, + "grad_norm": 0.036865234375, + "learning_rate": 0.022327971403038428, + "loss": 0.8205, + "num_input_tokens_seen": 5793920, + "step": 9995 + }, + { + "epoch": 1.4894250819183794, + "grad_norm": 0.03173828125, + "learning_rate": 0.022339142091152814, + "loss": 0.8047, + "num_input_tokens_seen": 5796768, + "step": 10000 + }, + { + "epoch": 1.4901697944593386, + "grad_norm": 0.02783203125, + "learning_rate": 0.022350312779267204, + "loss": 0.8058, + "num_input_tokens_seen": 5799520, + "step": 10005 + }, + { + "epoch": 1.4909145070002978, + "grad_norm": 0.0184326171875, + "learning_rate": 0.02236148346738159, + "loss": 0.8062, + "num_input_tokens_seen": 5802336, + "step": 10010 + }, + { + "epoch": 1.491659219541257, + "grad_norm": 0.03466796875, + "learning_rate": 0.02237265415549598, + "loss": 0.7878, + "num_input_tokens_seen": 5805600, + "step": 10015 + }, + { + "epoch": 1.4924039320822162, + "grad_norm": 0.0294189453125, + "learning_rate": 0.022383824843610366, + "loss": 0.8056, + "num_input_tokens_seen": 5808320, + "step": 10020 + }, + { + "epoch": 1.4931486446231754, + "grad_norm": 0.036865234375, + "learning_rate": 0.022394995531724755, + "loss": 0.8029, + "num_input_tokens_seen": 5811264, + "step": 10025 + }, + { + "epoch": 1.4938933571641346, + "grad_norm": 0.0361328125, + "learning_rate": 0.02240616621983914, + "loss": 0.8179, + "num_input_tokens_seen": 5813920, + "step": 10030 + }, + { + "epoch": 1.4946380697050938, + "grad_norm": 0.036865234375, + "learning_rate": 0.02241733690795353, + "loss": 0.8145, + "num_input_tokens_seen": 5817024, + "step": 10035 + }, + { + "epoch": 1.495382782246053, + "grad_norm": 0.03271484375, + "learning_rate": 0.022428507596067918, + "loss": 0.8031, + "num_input_tokens_seen": 5820000, + "step": 10040 + }, + { + "epoch": 1.4961274947870122, + "grad_norm": 0.018798828125, + "learning_rate": 0.022439678284182307, + "loss": 0.7921, + "num_input_tokens_seen": 5822784, + "step": 10045 + }, + { + "epoch": 1.4968722073279714, + "grad_norm": 0.0269775390625, + "learning_rate": 0.02245084897229669, + "loss": 0.7767, + "num_input_tokens_seen": 5825568, + "step": 10050 + }, + { + "epoch": 1.4976169198689306, + "grad_norm": 0.033935546875, + "learning_rate": 0.02246201966041108, + "loss": 0.7997, + "num_input_tokens_seen": 5828288, + "step": 10055 + }, + { + "epoch": 1.4983616324098898, + "grad_norm": 0.0185546875, + "learning_rate": 0.022473190348525466, + "loss": 0.7986, + "num_input_tokens_seen": 5831360, + "step": 10060 + }, + { + "epoch": 1.499106344950849, + "grad_norm": 0.034423828125, + "learning_rate": 0.022484361036639856, + "loss": 0.8334, + "num_input_tokens_seen": 5834208, + "step": 10065 + }, + { + "epoch": 1.4998510574918082, + "grad_norm": 0.016357421875, + "learning_rate": 0.022495531724754242, + "loss": 0.8209, + "num_input_tokens_seen": 5837184, + "step": 10070 + }, + { + "epoch": 1.5005957700327675, + "grad_norm": 0.0361328125, + "learning_rate": 0.02250670241286863, + "loss": 0.8168, + "num_input_tokens_seen": 5840096, + "step": 10075 + }, + { + "epoch": 1.5013404825737267, + "grad_norm": 0.0286865234375, + "learning_rate": 0.022517873100983018, + "loss": 0.8106, + "num_input_tokens_seen": 5843584, + "step": 10080 + }, + { + "epoch": 1.5020851951146859, + "grad_norm": 0.0289306640625, + "learning_rate": 0.022529043789097408, + "loss": 0.7996, + "num_input_tokens_seen": 5846144, + "step": 10085 + }, + { + "epoch": 1.502829907655645, + "grad_norm": 0.017578125, + "learning_rate": 0.022540214477211794, + "loss": 0.8053, + "num_input_tokens_seen": 5848960, + "step": 10090 + }, + { + "epoch": 1.5035746201966043, + "grad_norm": 0.05126953125, + "learning_rate": 0.022551385165326183, + "loss": 0.8197, + "num_input_tokens_seen": 5851744, + "step": 10095 + }, + { + "epoch": 1.5043193327375635, + "grad_norm": 0.0291748046875, + "learning_rate": 0.022562555853440573, + "loss": 0.8019, + "num_input_tokens_seen": 5854848, + "step": 10100 + }, + { + "epoch": 1.5050640452785224, + "grad_norm": 0.021484375, + "learning_rate": 0.02257372654155496, + "loss": 0.8079, + "num_input_tokens_seen": 5857792, + "step": 10105 + }, + { + "epoch": 1.5058087578194816, + "grad_norm": 0.029296875, + "learning_rate": 0.02258489722966935, + "loss": 0.8005, + "num_input_tokens_seen": 5860672, + "step": 10110 + }, + { + "epoch": 1.5065534703604408, + "grad_norm": 0.032958984375, + "learning_rate": 0.022596067917783735, + "loss": 0.8121, + "num_input_tokens_seen": 5863552, + "step": 10115 + }, + { + "epoch": 1.5072981829014, + "grad_norm": 0.01904296875, + "learning_rate": 0.022607238605898125, + "loss": 0.8057, + "num_input_tokens_seen": 5866592, + "step": 10120 + }, + { + "epoch": 1.5080428954423593, + "grad_norm": 0.0277099609375, + "learning_rate": 0.02261840929401251, + "loss": 0.7915, + "num_input_tokens_seen": 5869408, + "step": 10125 + }, + { + "epoch": 1.5087876079833185, + "grad_norm": 0.032470703125, + "learning_rate": 0.0226295799821269, + "loss": 0.8001, + "num_input_tokens_seen": 5872128, + "step": 10130 + }, + { + "epoch": 1.5095323205242777, + "grad_norm": 0.0177001953125, + "learning_rate": 0.022640750670241284, + "loss": 0.7877, + "num_input_tokens_seen": 5875104, + "step": 10135 + }, + { + "epoch": 1.5102770330652369, + "grad_norm": 0.041748046875, + "learning_rate": 0.022651921358355673, + "loss": 0.7987, + "num_input_tokens_seen": 5878176, + "step": 10140 + }, + { + "epoch": 1.5110217456061958, + "grad_norm": 0.0244140625, + "learning_rate": 0.02266309204647006, + "loss": 0.8177, + "num_input_tokens_seen": 5881088, + "step": 10145 + }, + { + "epoch": 1.511766458147155, + "grad_norm": 0.045166015625, + "learning_rate": 0.02267426273458445, + "loss": 0.7949, + "num_input_tokens_seen": 5883968, + "step": 10150 + }, + { + "epoch": 1.5125111706881142, + "grad_norm": 0.0177001953125, + "learning_rate": 0.022685433422698836, + "loss": 0.7735, + "num_input_tokens_seen": 5887104, + "step": 10155 + }, + { + "epoch": 1.5132558832290735, + "grad_norm": 0.053466796875, + "learning_rate": 0.022696604110813225, + "loss": 0.7843, + "num_input_tokens_seen": 5889952, + "step": 10160 + }, + { + "epoch": 1.5140005957700327, + "grad_norm": 0.056640625, + "learning_rate": 0.02270777479892761, + "loss": 0.8286, + "num_input_tokens_seen": 5892896, + "step": 10165 + }, + { + "epoch": 1.5147453083109919, + "grad_norm": 0.02978515625, + "learning_rate": 0.022718945487042, + "loss": 0.7698, + "num_input_tokens_seen": 5895680, + "step": 10170 + }, + { + "epoch": 1.515490020851951, + "grad_norm": 0.029052734375, + "learning_rate": 0.022730116175156388, + "loss": 0.7947, + "num_input_tokens_seen": 5898560, + "step": 10175 + }, + { + "epoch": 1.5162347333929103, + "grad_norm": 0.04248046875, + "learning_rate": 0.022741286863270777, + "loss": 0.7885, + "num_input_tokens_seen": 5901696, + "step": 10180 + }, + { + "epoch": 1.5169794459338695, + "grad_norm": 0.040771484375, + "learning_rate": 0.022752457551385163, + "loss": 0.8151, + "num_input_tokens_seen": 5904384, + "step": 10185 + }, + { + "epoch": 1.5177241584748287, + "grad_norm": 0.046142578125, + "learning_rate": 0.022763628239499553, + "loss": 0.7851, + "num_input_tokens_seen": 5907424, + "step": 10190 + }, + { + "epoch": 1.5184688710157879, + "grad_norm": 0.06494140625, + "learning_rate": 0.02277479892761394, + "loss": 0.7708, + "num_input_tokens_seen": 5910464, + "step": 10195 + }, + { + "epoch": 1.519213583556747, + "grad_norm": 0.057373046875, + "learning_rate": 0.02278596961572833, + "loss": 0.8063, + "num_input_tokens_seen": 5913504, + "step": 10200 + }, + { + "epoch": 1.5199582960977063, + "grad_norm": 0.048828125, + "learning_rate": 0.022797140303842715, + "loss": 0.7989, + "num_input_tokens_seen": 5916544, + "step": 10205 + }, + { + "epoch": 1.5207030086386655, + "grad_norm": 0.040283203125, + "learning_rate": 0.022808310991957105, + "loss": 0.8429, + "num_input_tokens_seen": 5919744, + "step": 10210 + }, + { + "epoch": 1.5214477211796247, + "grad_norm": 0.0634765625, + "learning_rate": 0.022819481680071495, + "loss": 0.8228, + "num_input_tokens_seen": 5922656, + "step": 10215 + }, + { + "epoch": 1.5221924337205839, + "grad_norm": 0.02978515625, + "learning_rate": 0.022830652368185878, + "loss": 0.7976, + "num_input_tokens_seen": 5925376, + "step": 10220 + }, + { + "epoch": 1.522937146261543, + "grad_norm": 0.0184326171875, + "learning_rate": 0.022841823056300267, + "loss": 0.7982, + "num_input_tokens_seen": 5928160, + "step": 10225 + }, + { + "epoch": 1.5236818588025023, + "grad_norm": 0.017333984375, + "learning_rate": 0.022852993744414653, + "loss": 0.799, + "num_input_tokens_seen": 5930848, + "step": 10230 + }, + { + "epoch": 1.5244265713434615, + "grad_norm": 0.0517578125, + "learning_rate": 0.022864164432529043, + "loss": 0.8075, + "num_input_tokens_seen": 5933760, + "step": 10235 + }, + { + "epoch": 1.5251712838844207, + "grad_norm": 0.0306396484375, + "learning_rate": 0.02287533512064343, + "loss": 0.7915, + "num_input_tokens_seen": 5936544, + "step": 10240 + }, + { + "epoch": 1.52591599642538, + "grad_norm": 0.061279296875, + "learning_rate": 0.02288650580875782, + "loss": 0.7914, + "num_input_tokens_seen": 5939520, + "step": 10245 + }, + { + "epoch": 1.526660708966339, + "grad_norm": 0.0286865234375, + "learning_rate": 0.022897676496872205, + "loss": 0.7904, + "num_input_tokens_seen": 5942528, + "step": 10250 + }, + { + "epoch": 1.5274054215072983, + "grad_norm": 0.03515625, + "learning_rate": 0.022908847184986595, + "loss": 0.7925, + "num_input_tokens_seen": 5945888, + "step": 10255 + }, + { + "epoch": 1.5281501340482575, + "grad_norm": 0.022216796875, + "learning_rate": 0.02292001787310098, + "loss": 0.783, + "num_input_tokens_seen": 5949088, + "step": 10260 + }, + { + "epoch": 1.5288948465892167, + "grad_norm": 0.0299072265625, + "learning_rate": 0.02293118856121537, + "loss": 0.8166, + "num_input_tokens_seen": 5952064, + "step": 10265 + }, + { + "epoch": 1.529639559130176, + "grad_norm": 0.040283203125, + "learning_rate": 0.022942359249329757, + "loss": 0.8442, + "num_input_tokens_seen": 5954944, + "step": 10270 + }, + { + "epoch": 1.5303842716711349, + "grad_norm": 0.03857421875, + "learning_rate": 0.022953529937444147, + "loss": 0.7922, + "num_input_tokens_seen": 5957728, + "step": 10275 + }, + { + "epoch": 1.531128984212094, + "grad_norm": 0.0289306640625, + "learning_rate": 0.022964700625558533, + "loss": 0.8029, + "num_input_tokens_seen": 5960736, + "step": 10280 + }, + { + "epoch": 1.5318736967530533, + "grad_norm": 0.02978515625, + "learning_rate": 0.022975871313672923, + "loss": 0.8142, + "num_input_tokens_seen": 5963584, + "step": 10285 + }, + { + "epoch": 1.5326184092940125, + "grad_norm": 0.056640625, + "learning_rate": 0.02298704200178731, + "loss": 0.7864, + "num_input_tokens_seen": 5966432, + "step": 10290 + }, + { + "epoch": 1.5333631218349717, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0229982126899017, + "loss": 0.7923, + "num_input_tokens_seen": 5969248, + "step": 10295 + }, + { + "epoch": 1.534107834375931, + "grad_norm": 0.033447265625, + "learning_rate": 0.023009383378016085, + "loss": 0.7892, + "num_input_tokens_seen": 5972064, + "step": 10300 + }, + { + "epoch": 1.53485254691689, + "grad_norm": 0.0152587890625, + "learning_rate": 0.023020554066130475, + "loss": 0.7987, + "num_input_tokens_seen": 5974976, + "step": 10305 + }, + { + "epoch": 1.5355972594578493, + "grad_norm": 0.01904296875, + "learning_rate": 0.023031724754244857, + "loss": 0.8282, + "num_input_tokens_seen": 5977728, + "step": 10310 + }, + { + "epoch": 1.5363419719988085, + "grad_norm": 0.026123046875, + "learning_rate": 0.023042895442359247, + "loss": 0.8141, + "num_input_tokens_seen": 5980512, + "step": 10315 + }, + { + "epoch": 1.5370866845397675, + "grad_norm": 0.0274658203125, + "learning_rate": 0.023054066130473637, + "loss": 0.7931, + "num_input_tokens_seen": 5983616, + "step": 10320 + }, + { + "epoch": 1.5378313970807267, + "grad_norm": 0.026611328125, + "learning_rate": 0.023065236818588023, + "loss": 0.7989, + "num_input_tokens_seen": 5986464, + "step": 10325 + }, + { + "epoch": 1.538576109621686, + "grad_norm": 0.03125, + "learning_rate": 0.023076407506702413, + "loss": 0.8023, + "num_input_tokens_seen": 5989632, + "step": 10330 + }, + { + "epoch": 1.539320822162645, + "grad_norm": 0.0174560546875, + "learning_rate": 0.0230875781948168, + "loss": 0.8054, + "num_input_tokens_seen": 5992736, + "step": 10335 + }, + { + "epoch": 1.5400655347036043, + "grad_norm": 0.02783203125, + "learning_rate": 0.02309874888293119, + "loss": 0.787, + "num_input_tokens_seen": 5995424, + "step": 10340 + }, + { + "epoch": 1.5408102472445635, + "grad_norm": 0.035400390625, + "learning_rate": 0.023109919571045575, + "loss": 0.8307, + "num_input_tokens_seen": 5998176, + "step": 10345 + }, + { + "epoch": 1.5415549597855227, + "grad_norm": 0.043701171875, + "learning_rate": 0.023121090259159965, + "loss": 0.7937, + "num_input_tokens_seen": 6001408, + "step": 10350 + }, + { + "epoch": 1.542299672326482, + "grad_norm": 0.0174560546875, + "learning_rate": 0.02313226094727435, + "loss": 0.819, + "num_input_tokens_seen": 6004064, + "step": 10355 + }, + { + "epoch": 1.543044384867441, + "grad_norm": 0.0301513671875, + "learning_rate": 0.02314343163538874, + "loss": 0.8026, + "num_input_tokens_seen": 6006944, + "step": 10360 + }, + { + "epoch": 1.5437890974084003, + "grad_norm": 0.0299072265625, + "learning_rate": 0.023154602323503127, + "loss": 0.8106, + "num_input_tokens_seen": 6009984, + "step": 10365 + }, + { + "epoch": 1.5445338099493595, + "grad_norm": 0.0159912109375, + "learning_rate": 0.023165773011617517, + "loss": 0.7798, + "num_input_tokens_seen": 6012864, + "step": 10370 + }, + { + "epoch": 1.5452785224903187, + "grad_norm": 0.04052734375, + "learning_rate": 0.023176943699731903, + "loss": 0.8059, + "num_input_tokens_seen": 6015680, + "step": 10375 + }, + { + "epoch": 1.546023235031278, + "grad_norm": 0.0164794921875, + "learning_rate": 0.023188114387846293, + "loss": 0.7783, + "num_input_tokens_seen": 6018496, + "step": 10380 + }, + { + "epoch": 1.5467679475722371, + "grad_norm": 0.0267333984375, + "learning_rate": 0.02319928507596068, + "loss": 0.7957, + "num_input_tokens_seen": 6021600, + "step": 10385 + }, + { + "epoch": 1.5475126601131963, + "grad_norm": 0.0205078125, + "learning_rate": 0.02321045576407507, + "loss": 0.8155, + "num_input_tokens_seen": 6024480, + "step": 10390 + }, + { + "epoch": 1.5482573726541555, + "grad_norm": 0.0274658203125, + "learning_rate": 0.02322162645218945, + "loss": 0.7903, + "num_input_tokens_seen": 6027456, + "step": 10395 + }, + { + "epoch": 1.5490020851951147, + "grad_norm": 0.0257568359375, + "learning_rate": 0.02323279714030384, + "loss": 0.806, + "num_input_tokens_seen": 6030432, + "step": 10400 + }, + { + "epoch": 1.549746797736074, + "grad_norm": 0.0218505859375, + "learning_rate": 0.023243967828418227, + "loss": 0.824, + "num_input_tokens_seen": 6033376, + "step": 10405 + }, + { + "epoch": 1.5504915102770331, + "grad_norm": 0.0242919921875, + "learning_rate": 0.023255138516532617, + "loss": 0.8097, + "num_input_tokens_seen": 6036256, + "step": 10410 + }, + { + "epoch": 1.5512362228179923, + "grad_norm": 0.024169921875, + "learning_rate": 0.023266309204647003, + "loss": 0.7824, + "num_input_tokens_seen": 6039232, + "step": 10415 + }, + { + "epoch": 1.5519809353589515, + "grad_norm": 0.028564453125, + "learning_rate": 0.023277479892761393, + "loss": 0.8344, + "num_input_tokens_seen": 6042240, + "step": 10420 + }, + { + "epoch": 1.5527256478999107, + "grad_norm": 0.028076171875, + "learning_rate": 0.02328865058087578, + "loss": 0.7857, + "num_input_tokens_seen": 6045024, + "step": 10425 + }, + { + "epoch": 1.55347036044087, + "grad_norm": 0.056884765625, + "learning_rate": 0.02329982126899017, + "loss": 0.8067, + "num_input_tokens_seen": 6047968, + "step": 10430 + }, + { + "epoch": 1.5542150729818291, + "grad_norm": 0.016357421875, + "learning_rate": 0.02331099195710456, + "loss": 0.7747, + "num_input_tokens_seen": 6051360, + "step": 10435 + }, + { + "epoch": 1.5549597855227884, + "grad_norm": 0.0272216796875, + "learning_rate": 0.023322162645218945, + "loss": 0.7967, + "num_input_tokens_seen": 6054080, + "step": 10440 + }, + { + "epoch": 1.5557044980637476, + "grad_norm": 0.017333984375, + "learning_rate": 0.023333333333333334, + "loss": 0.8117, + "num_input_tokens_seen": 6056672, + "step": 10445 + }, + { + "epoch": 1.5564492106047065, + "grad_norm": 0.03125, + "learning_rate": 0.02334450402144772, + "loss": 0.8189, + "num_input_tokens_seen": 6059648, + "step": 10450 + }, + { + "epoch": 1.5571939231456657, + "grad_norm": 0.02685546875, + "learning_rate": 0.02335567470956211, + "loss": 0.8168, + "num_input_tokens_seen": 6062496, + "step": 10455 + }, + { + "epoch": 1.557938635686625, + "grad_norm": 0.02783203125, + "learning_rate": 0.023366845397676497, + "loss": 0.8028, + "num_input_tokens_seen": 6065344, + "step": 10460 + }, + { + "epoch": 1.5586833482275841, + "grad_norm": 0.0164794921875, + "learning_rate": 0.023378016085790886, + "loss": 0.806, + "num_input_tokens_seen": 6068064, + "step": 10465 + }, + { + "epoch": 1.5594280607685433, + "grad_norm": 0.03271484375, + "learning_rate": 0.023389186773905273, + "loss": 0.7943, + "num_input_tokens_seen": 6070944, + "step": 10470 + }, + { + "epoch": 1.5601727733095025, + "grad_norm": 0.05859375, + "learning_rate": 0.023400357462019662, + "loss": 0.7999, + "num_input_tokens_seen": 6073856, + "step": 10475 + }, + { + "epoch": 1.5609174858504618, + "grad_norm": 0.0390625, + "learning_rate": 0.023411528150134045, + "loss": 0.8105, + "num_input_tokens_seen": 6076608, + "step": 10480 + }, + { + "epoch": 1.561662198391421, + "grad_norm": 0.026611328125, + "learning_rate": 0.023422698838248435, + "loss": 0.7808, + "num_input_tokens_seen": 6079424, + "step": 10485 + }, + { + "epoch": 1.5624069109323802, + "grad_norm": 0.01531982421875, + "learning_rate": 0.02343386952636282, + "loss": 0.7896, + "num_input_tokens_seen": 6082432, + "step": 10490 + }, + { + "epoch": 1.5631516234733391, + "grad_norm": 0.03076171875, + "learning_rate": 0.02344504021447721, + "loss": 0.804, + "num_input_tokens_seen": 6085472, + "step": 10495 + }, + { + "epoch": 1.5638963360142983, + "grad_norm": 0.0159912109375, + "learning_rate": 0.023456210902591597, + "loss": 0.7699, + "num_input_tokens_seen": 6088448, + "step": 10500 + }, + { + "epoch": 1.5646410485552575, + "grad_norm": 0.0244140625, + "learning_rate": 0.023467381590705987, + "loss": 0.8267, + "num_input_tokens_seen": 6091552, + "step": 10505 + }, + { + "epoch": 1.5653857610962167, + "grad_norm": 0.064453125, + "learning_rate": 0.023478552278820373, + "loss": 0.8066, + "num_input_tokens_seen": 6094496, + "step": 10510 + }, + { + "epoch": 1.566130473637176, + "grad_norm": 0.017822265625, + "learning_rate": 0.023489722966934762, + "loss": 0.7898, + "num_input_tokens_seen": 6097248, + "step": 10515 + }, + { + "epoch": 1.5668751861781351, + "grad_norm": 0.03271484375, + "learning_rate": 0.02350089365504915, + "loss": 0.8241, + "num_input_tokens_seen": 6100000, + "step": 10520 + }, + { + "epoch": 1.5676198987190944, + "grad_norm": 0.02880859375, + "learning_rate": 0.02351206434316354, + "loss": 0.8103, + "num_input_tokens_seen": 6103104, + "step": 10525 + }, + { + "epoch": 1.5683646112600536, + "grad_norm": 0.03466796875, + "learning_rate": 0.023523235031277925, + "loss": 0.7883, + "num_input_tokens_seen": 6105984, + "step": 10530 + }, + { + "epoch": 1.5691093238010128, + "grad_norm": 0.038330078125, + "learning_rate": 0.023534405719392314, + "loss": 0.7994, + "num_input_tokens_seen": 6108736, + "step": 10535 + }, + { + "epoch": 1.569854036341972, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0235455764075067, + "loss": 0.7936, + "num_input_tokens_seen": 6111840, + "step": 10540 + }, + { + "epoch": 1.5705987488829312, + "grad_norm": 0.033935546875, + "learning_rate": 0.02355674709562109, + "loss": 0.772, + "num_input_tokens_seen": 6114848, + "step": 10545 + }, + { + "epoch": 1.5713434614238904, + "grad_norm": 0.031982421875, + "learning_rate": 0.02356791778373548, + "loss": 0.7828, + "num_input_tokens_seen": 6117600, + "step": 10550 + }, + { + "epoch": 1.5720881739648496, + "grad_norm": 0.041015625, + "learning_rate": 0.023579088471849866, + "loss": 0.7732, + "num_input_tokens_seen": 6120160, + "step": 10555 + }, + { + "epoch": 1.5728328865058088, + "grad_norm": 0.0140380859375, + "learning_rate": 0.023590259159964256, + "loss": 0.7904, + "num_input_tokens_seen": 6122848, + "step": 10560 + }, + { + "epoch": 1.573577599046768, + "grad_norm": 0.02978515625, + "learning_rate": 0.023601429848078642, + "loss": 0.8061, + "num_input_tokens_seen": 6125728, + "step": 10565 + }, + { + "epoch": 1.5743223115877272, + "grad_norm": 0.0284423828125, + "learning_rate": 0.02361260053619303, + "loss": 0.7849, + "num_input_tokens_seen": 6128448, + "step": 10570 + }, + { + "epoch": 1.5750670241286864, + "grad_norm": 0.0281982421875, + "learning_rate": 0.023623771224307415, + "loss": 0.8014, + "num_input_tokens_seen": 6131584, + "step": 10575 + }, + { + "epoch": 1.5758117366696456, + "grad_norm": 0.029296875, + "learning_rate": 0.023634941912421804, + "loss": 0.7901, + "num_input_tokens_seen": 6134400, + "step": 10580 + }, + { + "epoch": 1.5765564492106048, + "grad_norm": 0.0322265625, + "learning_rate": 0.02364611260053619, + "loss": 0.7793, + "num_input_tokens_seen": 6137056, + "step": 10585 + }, + { + "epoch": 1.577301161751564, + "grad_norm": 0.02001953125, + "learning_rate": 0.02365728328865058, + "loss": 0.7644, + "num_input_tokens_seen": 6139744, + "step": 10590 + }, + { + "epoch": 1.5780458742925232, + "grad_norm": 0.033203125, + "learning_rate": 0.023668453976764967, + "loss": 0.8439, + "num_input_tokens_seen": 6142432, + "step": 10595 + }, + { + "epoch": 1.5787905868334824, + "grad_norm": 0.03466796875, + "learning_rate": 0.023679624664879356, + "loss": 0.8101, + "num_input_tokens_seen": 6145248, + "step": 10600 + }, + { + "epoch": 1.5795352993744416, + "grad_norm": 0.04296875, + "learning_rate": 0.023690795352993742, + "loss": 0.7701, + "num_input_tokens_seen": 6148256, + "step": 10605 + }, + { + "epoch": 1.5802800119154008, + "grad_norm": 0.0255126953125, + "learning_rate": 0.023701966041108132, + "loss": 0.7709, + "num_input_tokens_seen": 6151232, + "step": 10610 + }, + { + "epoch": 1.58102472445636, + "grad_norm": 0.0294189453125, + "learning_rate": 0.02371313672922252, + "loss": 0.8297, + "num_input_tokens_seen": 6154144, + "step": 10615 + }, + { + "epoch": 1.5817694369973192, + "grad_norm": 0.037353515625, + "learning_rate": 0.023724307417336908, + "loss": 0.8267, + "num_input_tokens_seen": 6157216, + "step": 10620 + }, + { + "epoch": 1.5825141495382782, + "grad_norm": 0.028076171875, + "learning_rate": 0.023735478105451294, + "loss": 0.8087, + "num_input_tokens_seen": 6160128, + "step": 10625 + }, + { + "epoch": 1.5832588620792374, + "grad_norm": 0.032470703125, + "learning_rate": 0.023746648793565684, + "loss": 0.8305, + "num_input_tokens_seen": 6163040, + "step": 10630 + }, + { + "epoch": 1.5840035746201966, + "grad_norm": 0.036376953125, + "learning_rate": 0.02375781948168007, + "loss": 0.8215, + "num_input_tokens_seen": 6165824, + "step": 10635 + }, + { + "epoch": 1.5847482871611558, + "grad_norm": 0.0299072265625, + "learning_rate": 0.02376899016979446, + "loss": 0.7737, + "num_input_tokens_seen": 6168800, + "step": 10640 + }, + { + "epoch": 1.585492999702115, + "grad_norm": 0.0235595703125, + "learning_rate": 0.023780160857908846, + "loss": 0.8204, + "num_input_tokens_seen": 6171648, + "step": 10645 + }, + { + "epoch": 1.5862377122430742, + "grad_norm": 0.03857421875, + "learning_rate": 0.023791331546023236, + "loss": 0.7875, + "num_input_tokens_seen": 6174688, + "step": 10650 + }, + { + "epoch": 1.5869824247840334, + "grad_norm": 0.03271484375, + "learning_rate": 0.023802502234137622, + "loss": 0.8316, + "num_input_tokens_seen": 6177664, + "step": 10655 + }, + { + "epoch": 1.5877271373249926, + "grad_norm": 0.0361328125, + "learning_rate": 0.02381367292225201, + "loss": 0.7973, + "num_input_tokens_seen": 6180672, + "step": 10660 + }, + { + "epoch": 1.5884718498659516, + "grad_norm": 0.02490234375, + "learning_rate": 0.023824843610366398, + "loss": 0.8004, + "num_input_tokens_seen": 6183616, + "step": 10665 + }, + { + "epoch": 1.5892165624069108, + "grad_norm": 0.02490234375, + "learning_rate": 0.023836014298480784, + "loss": 0.7961, + "num_input_tokens_seen": 6186368, + "step": 10670 + }, + { + "epoch": 1.58996127494787, + "grad_norm": 0.037841796875, + "learning_rate": 0.023847184986595174, + "loss": 0.7928, + "num_input_tokens_seen": 6189344, + "step": 10675 + }, + { + "epoch": 1.5907059874888292, + "grad_norm": 0.02783203125, + "learning_rate": 0.02385835567470956, + "loss": 0.8009, + "num_input_tokens_seen": 6192128, + "step": 10680 + }, + { + "epoch": 1.5914507000297884, + "grad_norm": 0.0177001953125, + "learning_rate": 0.02386952636282395, + "loss": 0.8063, + "num_input_tokens_seen": 6195264, + "step": 10685 + }, + { + "epoch": 1.5921954125707476, + "grad_norm": 0.05126953125, + "learning_rate": 0.023880697050938336, + "loss": 0.8136, + "num_input_tokens_seen": 6197920, + "step": 10690 + }, + { + "epoch": 1.5929401251117068, + "grad_norm": 0.03125, + "learning_rate": 0.023891867739052726, + "loss": 0.8191, + "num_input_tokens_seen": 6201088, + "step": 10695 + }, + { + "epoch": 1.593684837652666, + "grad_norm": 0.02880859375, + "learning_rate": 0.023903038427167112, + "loss": 0.8257, + "num_input_tokens_seen": 6204224, + "step": 10700 + }, + { + "epoch": 1.5944295501936252, + "grad_norm": 0.04541015625, + "learning_rate": 0.023914209115281502, + "loss": 0.8079, + "num_input_tokens_seen": 6207040, + "step": 10705 + }, + { + "epoch": 1.5951742627345844, + "grad_norm": 0.0167236328125, + "learning_rate": 0.023925379803395888, + "loss": 0.816, + "num_input_tokens_seen": 6209952, + "step": 10710 + }, + { + "epoch": 1.5959189752755436, + "grad_norm": 0.045654296875, + "learning_rate": 0.023936550491510278, + "loss": 0.8112, + "num_input_tokens_seen": 6212896, + "step": 10715 + }, + { + "epoch": 1.5966636878165028, + "grad_norm": 0.025634765625, + "learning_rate": 0.023947721179624664, + "loss": 0.82, + "num_input_tokens_seen": 6216448, + "step": 10720 + }, + { + "epoch": 1.597408400357462, + "grad_norm": 0.024169921875, + "learning_rate": 0.023958891867739054, + "loss": 0.7993, + "num_input_tokens_seen": 6219296, + "step": 10725 + }, + { + "epoch": 1.5981531128984212, + "grad_norm": 0.027587890625, + "learning_rate": 0.02397006255585344, + "loss": 0.7961, + "num_input_tokens_seen": 6222528, + "step": 10730 + }, + { + "epoch": 1.5988978254393804, + "grad_norm": 0.03173828125, + "learning_rate": 0.02398123324396783, + "loss": 0.7906, + "num_input_tokens_seen": 6225184, + "step": 10735 + }, + { + "epoch": 1.5996425379803396, + "grad_norm": 0.02490234375, + "learning_rate": 0.023992403932082212, + "loss": 0.8065, + "num_input_tokens_seen": 6227936, + "step": 10740 + }, + { + "epoch": 1.6003872505212988, + "grad_norm": 0.0157470703125, + "learning_rate": 0.024003574620196602, + "loss": 0.8425, + "num_input_tokens_seen": 6230688, + "step": 10745 + }, + { + "epoch": 1.601131963062258, + "grad_norm": 0.02685546875, + "learning_rate": 0.02401474530831099, + "loss": 0.7937, + "num_input_tokens_seen": 6233632, + "step": 10750 + }, + { + "epoch": 1.6018766756032172, + "grad_norm": 0.02587890625, + "learning_rate": 0.024025915996425378, + "loss": 0.7963, + "num_input_tokens_seen": 6236480, + "step": 10755 + }, + { + "epoch": 1.6026213881441764, + "grad_norm": 0.0244140625, + "learning_rate": 0.024037086684539764, + "loss": 0.824, + "num_input_tokens_seen": 6239456, + "step": 10760 + }, + { + "epoch": 1.6033661006851356, + "grad_norm": 0.02685546875, + "learning_rate": 0.024048257372654154, + "loss": 0.8091, + "num_input_tokens_seen": 6242368, + "step": 10765 + }, + { + "epoch": 1.6041108132260948, + "grad_norm": 0.025634765625, + "learning_rate": 0.024059428060768544, + "loss": 0.7941, + "num_input_tokens_seen": 6245152, + "step": 10770 + }, + { + "epoch": 1.604855525767054, + "grad_norm": 0.026123046875, + "learning_rate": 0.02407059874888293, + "loss": 0.8091, + "num_input_tokens_seen": 6248032, + "step": 10775 + }, + { + "epoch": 1.6056002383080132, + "grad_norm": 0.035400390625, + "learning_rate": 0.02408176943699732, + "loss": 0.8025, + "num_input_tokens_seen": 6251008, + "step": 10780 + }, + { + "epoch": 1.6063449508489724, + "grad_norm": 0.02392578125, + "learning_rate": 0.024092940125111706, + "loss": 0.7897, + "num_input_tokens_seen": 6254240, + "step": 10785 + }, + { + "epoch": 1.6070896633899316, + "grad_norm": 0.0341796875, + "learning_rate": 0.024104110813226096, + "loss": 0.7923, + "num_input_tokens_seen": 6257376, + "step": 10790 + }, + { + "epoch": 1.6078343759308906, + "grad_norm": 0.017333984375, + "learning_rate": 0.024115281501340482, + "loss": 0.8181, + "num_input_tokens_seen": 6260608, + "step": 10795 + }, + { + "epoch": 1.6085790884718498, + "grad_norm": 0.016845703125, + "learning_rate": 0.02412645218945487, + "loss": 0.7893, + "num_input_tokens_seen": 6263200, + "step": 10800 + }, + { + "epoch": 1.609323801012809, + "grad_norm": 0.0257568359375, + "learning_rate": 0.024137622877569258, + "loss": 0.7931, + "num_input_tokens_seen": 6266208, + "step": 10805 + }, + { + "epoch": 1.6100685135537682, + "grad_norm": 0.033935546875, + "learning_rate": 0.024148793565683647, + "loss": 0.7882, + "num_input_tokens_seen": 6269024, + "step": 10810 + }, + { + "epoch": 1.6108132260947274, + "grad_norm": 0.025634765625, + "learning_rate": 0.024159964253798034, + "loss": 0.7912, + "num_input_tokens_seen": 6272096, + "step": 10815 + }, + { + "epoch": 1.6115579386356866, + "grad_norm": 0.0189208984375, + "learning_rate": 0.024171134941912423, + "loss": 0.794, + "num_input_tokens_seen": 6274944, + "step": 10820 + }, + { + "epoch": 1.6123026511766458, + "grad_norm": 0.029541015625, + "learning_rate": 0.024182305630026806, + "loss": 0.8065, + "num_input_tokens_seen": 6277856, + "step": 10825 + }, + { + "epoch": 1.613047363717605, + "grad_norm": 0.0322265625, + "learning_rate": 0.024193476318141196, + "loss": 0.8087, + "num_input_tokens_seen": 6280544, + "step": 10830 + }, + { + "epoch": 1.6137920762585642, + "grad_norm": 0.0208740234375, + "learning_rate": 0.024204647006255582, + "loss": 0.8172, + "num_input_tokens_seen": 6283232, + "step": 10835 + }, + { + "epoch": 1.6145367887995232, + "grad_norm": 0.0245361328125, + "learning_rate": 0.024215817694369972, + "loss": 0.805, + "num_input_tokens_seen": 6286144, + "step": 10840 + }, + { + "epoch": 1.6152815013404824, + "grad_norm": 0.028564453125, + "learning_rate": 0.024226988382484358, + "loss": 0.8142, + "num_input_tokens_seen": 6288864, + "step": 10845 + }, + { + "epoch": 1.6160262138814416, + "grad_norm": 0.04052734375, + "learning_rate": 0.024238159070598748, + "loss": 0.812, + "num_input_tokens_seen": 6291872, + "step": 10850 + }, + { + "epoch": 1.6167709264224008, + "grad_norm": 0.034912109375, + "learning_rate": 0.024249329758713134, + "loss": 0.8114, + "num_input_tokens_seen": 6294880, + "step": 10855 + }, + { + "epoch": 1.61751563896336, + "grad_norm": 0.03857421875, + "learning_rate": 0.024260500446827524, + "loss": 0.8147, + "num_input_tokens_seen": 6297824, + "step": 10860 + }, + { + "epoch": 1.6182603515043192, + "grad_norm": 0.03564453125, + "learning_rate": 0.02427167113494191, + "loss": 0.7999, + "num_input_tokens_seen": 6300576, + "step": 10865 + }, + { + "epoch": 1.6190050640452784, + "grad_norm": 0.0283203125, + "learning_rate": 0.0242828418230563, + "loss": 0.8137, + "num_input_tokens_seen": 6303616, + "step": 10870 + }, + { + "epoch": 1.6197497765862376, + "grad_norm": 0.03173828125, + "learning_rate": 0.024294012511170686, + "loss": 0.8174, + "num_input_tokens_seen": 6306592, + "step": 10875 + }, + { + "epoch": 1.6204944891271968, + "grad_norm": 0.0172119140625, + "learning_rate": 0.024305183199285076, + "loss": 0.7783, + "num_input_tokens_seen": 6309536, + "step": 10880 + }, + { + "epoch": 1.621239201668156, + "grad_norm": 0.0252685546875, + "learning_rate": 0.024316353887399465, + "loss": 0.782, + "num_input_tokens_seen": 6312448, + "step": 10885 + }, + { + "epoch": 1.6219839142091153, + "grad_norm": 0.0296630859375, + "learning_rate": 0.02432752457551385, + "loss": 0.8037, + "num_input_tokens_seen": 6315168, + "step": 10890 + }, + { + "epoch": 1.6227286267500745, + "grad_norm": 0.01806640625, + "learning_rate": 0.02433869526362824, + "loss": 0.7965, + "num_input_tokens_seen": 6317952, + "step": 10895 + }, + { + "epoch": 1.6234733392910337, + "grad_norm": 0.023193359375, + "learning_rate": 0.024349865951742627, + "loss": 0.795, + "num_input_tokens_seen": 6320832, + "step": 10900 + }, + { + "epoch": 1.6242180518319929, + "grad_norm": 0.03662109375, + "learning_rate": 0.024361036639857017, + "loss": 0.8126, + "num_input_tokens_seen": 6323872, + "step": 10905 + }, + { + "epoch": 1.624962764372952, + "grad_norm": 0.057373046875, + "learning_rate": 0.024372207327971403, + "loss": 0.7742, + "num_input_tokens_seen": 6326560, + "step": 10910 + }, + { + "epoch": 1.6257074769139113, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02438337801608579, + "loss": 0.8153, + "num_input_tokens_seen": 6329504, + "step": 10915 + }, + { + "epoch": 1.6264521894548705, + "grad_norm": 0.041259765625, + "learning_rate": 0.024394548704200176, + "loss": 0.8113, + "num_input_tokens_seen": 6332608, + "step": 10920 + }, + { + "epoch": 1.6271969019958297, + "grad_norm": 0.0255126953125, + "learning_rate": 0.024405719392314566, + "loss": 0.8145, + "num_input_tokens_seen": 6335328, + "step": 10925 + }, + { + "epoch": 1.6279416145367889, + "grad_norm": 0.052490234375, + "learning_rate": 0.024416890080428952, + "loss": 0.8144, + "num_input_tokens_seen": 6338304, + "step": 10930 + }, + { + "epoch": 1.628686327077748, + "grad_norm": 0.032470703125, + "learning_rate": 0.02442806076854334, + "loss": 0.8242, + "num_input_tokens_seen": 6341632, + "step": 10935 + }, + { + "epoch": 1.6294310396187073, + "grad_norm": 0.047119140625, + "learning_rate": 0.024439231456657728, + "loss": 0.8008, + "num_input_tokens_seen": 6344672, + "step": 10940 + }, + { + "epoch": 1.6301757521596665, + "grad_norm": 0.03369140625, + "learning_rate": 0.024450402144772117, + "loss": 0.8032, + "num_input_tokens_seen": 6347520, + "step": 10945 + }, + { + "epoch": 1.6309204647006257, + "grad_norm": 0.033203125, + "learning_rate": 0.024461572832886504, + "loss": 0.8135, + "num_input_tokens_seen": 6350080, + "step": 10950 + }, + { + "epoch": 1.6316651772415849, + "grad_norm": 0.0301513671875, + "learning_rate": 0.024472743521000893, + "loss": 0.7942, + "num_input_tokens_seen": 6352768, + "step": 10955 + }, + { + "epoch": 1.632409889782544, + "grad_norm": 0.043212890625, + "learning_rate": 0.02448391420911528, + "loss": 0.7913, + "num_input_tokens_seen": 6355776, + "step": 10960 + }, + { + "epoch": 1.6331546023235033, + "grad_norm": 0.045166015625, + "learning_rate": 0.02449508489722967, + "loss": 0.8149, + "num_input_tokens_seen": 6358496, + "step": 10965 + }, + { + "epoch": 1.6338993148644623, + "grad_norm": 0.015869140625, + "learning_rate": 0.024506255585344056, + "loss": 0.7909, + "num_input_tokens_seen": 6361408, + "step": 10970 + }, + { + "epoch": 1.6346440274054215, + "grad_norm": 0.0174560546875, + "learning_rate": 0.024517426273458445, + "loss": 0.8083, + "num_input_tokens_seen": 6364320, + "step": 10975 + }, + { + "epoch": 1.6353887399463807, + "grad_norm": 0.039306640625, + "learning_rate": 0.02452859696157283, + "loss": 0.7982, + "num_input_tokens_seen": 6366944, + "step": 10980 + }, + { + "epoch": 1.6361334524873399, + "grad_norm": 0.0245361328125, + "learning_rate": 0.02453976764968722, + "loss": 0.8241, + "num_input_tokens_seen": 6369568, + "step": 10985 + }, + { + "epoch": 1.636878165028299, + "grad_norm": 0.03271484375, + "learning_rate": 0.024550938337801607, + "loss": 0.8178, + "num_input_tokens_seen": 6372320, + "step": 10990 + }, + { + "epoch": 1.6376228775692583, + "grad_norm": 0.025634765625, + "learning_rate": 0.024562109025915997, + "loss": 0.7774, + "num_input_tokens_seen": 6375264, + "step": 10995 + }, + { + "epoch": 1.6383675901102175, + "grad_norm": 0.0174560546875, + "learning_rate": 0.024573279714030383, + "loss": 0.7954, + "num_input_tokens_seen": 6378016, + "step": 11000 + }, + { + "epoch": 1.6391123026511767, + "grad_norm": 0.0400390625, + "learning_rate": 0.02458445040214477, + "loss": 0.8012, + "num_input_tokens_seen": 6380576, + "step": 11005 + }, + { + "epoch": 1.6398570151921357, + "grad_norm": 0.035888671875, + "learning_rate": 0.02459562109025916, + "loss": 0.7844, + "num_input_tokens_seen": 6383200, + "step": 11010 + }, + { + "epoch": 1.6406017277330949, + "grad_norm": 0.058349609375, + "learning_rate": 0.024606791778373546, + "loss": 0.8131, + "num_input_tokens_seen": 6386016, + "step": 11015 + }, + { + "epoch": 1.641346440274054, + "grad_norm": 0.024658203125, + "learning_rate": 0.024617962466487935, + "loss": 0.8058, + "num_input_tokens_seen": 6388768, + "step": 11020 + }, + { + "epoch": 1.6420911528150133, + "grad_norm": 0.042236328125, + "learning_rate": 0.02462913315460232, + "loss": 0.8258, + "num_input_tokens_seen": 6391616, + "step": 11025 + }, + { + "epoch": 1.6428358653559725, + "grad_norm": 0.029052734375, + "learning_rate": 0.02464030384271671, + "loss": 0.8263, + "num_input_tokens_seen": 6394240, + "step": 11030 + }, + { + "epoch": 1.6435805778969317, + "grad_norm": 0.035888671875, + "learning_rate": 0.024651474530831097, + "loss": 0.8148, + "num_input_tokens_seen": 6397312, + "step": 11035 + }, + { + "epoch": 1.6443252904378909, + "grad_norm": 0.025390625, + "learning_rate": 0.024662645218945487, + "loss": 0.801, + "num_input_tokens_seen": 6399904, + "step": 11040 + }, + { + "epoch": 1.64507000297885, + "grad_norm": 0.01708984375, + "learning_rate": 0.024673815907059873, + "loss": 0.8215, + "num_input_tokens_seen": 6402816, + "step": 11045 + }, + { + "epoch": 1.6458147155198093, + "grad_norm": 0.0294189453125, + "learning_rate": 0.024684986595174263, + "loss": 0.8013, + "num_input_tokens_seen": 6405792, + "step": 11050 + }, + { + "epoch": 1.6465594280607685, + "grad_norm": 0.015625, + "learning_rate": 0.02469615728328865, + "loss": 0.824, + "num_input_tokens_seen": 6408512, + "step": 11055 + }, + { + "epoch": 1.6473041406017277, + "grad_norm": 0.036865234375, + "learning_rate": 0.02470732797140304, + "loss": 0.7942, + "num_input_tokens_seen": 6411744, + "step": 11060 + }, + { + "epoch": 1.648048853142687, + "grad_norm": 0.03271484375, + "learning_rate": 0.024718498659517425, + "loss": 0.808, + "num_input_tokens_seen": 6414720, + "step": 11065 + }, + { + "epoch": 1.648793565683646, + "grad_norm": 0.0296630859375, + "learning_rate": 0.024729669347631815, + "loss": 0.8054, + "num_input_tokens_seen": 6417536, + "step": 11070 + }, + { + "epoch": 1.6495382782246053, + "grad_norm": 0.0164794921875, + "learning_rate": 0.0247408400357462, + "loss": 0.7996, + "num_input_tokens_seen": 6420256, + "step": 11075 + }, + { + "epoch": 1.6502829907655645, + "grad_norm": 0.035400390625, + "learning_rate": 0.02475201072386059, + "loss": 0.805, + "num_input_tokens_seen": 6423296, + "step": 11080 + }, + { + "epoch": 1.6510277033065237, + "grad_norm": 0.01953125, + "learning_rate": 0.024763181411974974, + "loss": 0.789, + "num_input_tokens_seen": 6426080, + "step": 11085 + }, + { + "epoch": 1.651772415847483, + "grad_norm": 0.0162353515625, + "learning_rate": 0.024774352100089363, + "loss": 0.8072, + "num_input_tokens_seen": 6428864, + "step": 11090 + }, + { + "epoch": 1.6525171283884421, + "grad_norm": 0.0286865234375, + "learning_rate": 0.02478552278820375, + "loss": 0.8085, + "num_input_tokens_seen": 6431584, + "step": 11095 + }, + { + "epoch": 1.6532618409294013, + "grad_norm": 0.0169677734375, + "learning_rate": 0.02479669347631814, + "loss": 0.8017, + "num_input_tokens_seen": 6434496, + "step": 11100 + }, + { + "epoch": 1.6540065534703605, + "grad_norm": 0.01806640625, + "learning_rate": 0.024807864164432526, + "loss": 0.7838, + "num_input_tokens_seen": 6437280, + "step": 11105 + }, + { + "epoch": 1.6547512660113197, + "grad_norm": 0.03955078125, + "learning_rate": 0.024819034852546915, + "loss": 0.7912, + "num_input_tokens_seen": 6440160, + "step": 11110 + }, + { + "epoch": 1.655495978552279, + "grad_norm": 0.025634765625, + "learning_rate": 0.024830205540661305, + "loss": 0.808, + "num_input_tokens_seen": 6443328, + "step": 11115 + }, + { + "epoch": 1.6562406910932381, + "grad_norm": 0.035400390625, + "learning_rate": 0.02484137622877569, + "loss": 0.8103, + "num_input_tokens_seen": 6446112, + "step": 11120 + }, + { + "epoch": 1.6569854036341973, + "grad_norm": 0.0177001953125, + "learning_rate": 0.02485254691689008, + "loss": 0.8113, + "num_input_tokens_seen": 6449152, + "step": 11125 + }, + { + "epoch": 1.6577301161751565, + "grad_norm": 0.028564453125, + "learning_rate": 0.024863717605004467, + "loss": 0.7873, + "num_input_tokens_seen": 6451840, + "step": 11130 + }, + { + "epoch": 1.6584748287161157, + "grad_norm": 0.0302734375, + "learning_rate": 0.024874888293118857, + "loss": 0.8153, + "num_input_tokens_seen": 6454784, + "step": 11135 + }, + { + "epoch": 1.6592195412570747, + "grad_norm": 0.018798828125, + "learning_rate": 0.024886058981233243, + "loss": 0.7772, + "num_input_tokens_seen": 6457728, + "step": 11140 + }, + { + "epoch": 1.659964253798034, + "grad_norm": 0.043701171875, + "learning_rate": 0.024897229669347633, + "loss": 0.8129, + "num_input_tokens_seen": 6460416, + "step": 11145 + }, + { + "epoch": 1.6607089663389931, + "grad_norm": 0.0263671875, + "learning_rate": 0.02490840035746202, + "loss": 0.7906, + "num_input_tokens_seen": 6463360, + "step": 11150 + }, + { + "epoch": 1.6614536788799523, + "grad_norm": 0.044189453125, + "learning_rate": 0.02491957104557641, + "loss": 0.7948, + "num_input_tokens_seen": 6466144, + "step": 11155 + }, + { + "epoch": 1.6621983914209115, + "grad_norm": 0.032470703125, + "learning_rate": 0.024930741733690795, + "loss": 0.8011, + "num_input_tokens_seen": 6469120, + "step": 11160 + }, + { + "epoch": 1.6629431039618707, + "grad_norm": 0.03125, + "learning_rate": 0.024941912421805185, + "loss": 0.7995, + "num_input_tokens_seen": 6472320, + "step": 11165 + }, + { + "epoch": 1.66368781650283, + "grad_norm": 0.03271484375, + "learning_rate": 0.02495308310991957, + "loss": 0.7716, + "num_input_tokens_seen": 6475296, + "step": 11170 + }, + { + "epoch": 1.6644325290437891, + "grad_norm": 0.043701171875, + "learning_rate": 0.024964253798033957, + "loss": 0.8371, + "num_input_tokens_seen": 6478496, + "step": 11175 + }, + { + "epoch": 1.6651772415847483, + "grad_norm": 0.04443359375, + "learning_rate": 0.024975424486148343, + "loss": 0.8103, + "num_input_tokens_seen": 6481440, + "step": 11180 + }, + { + "epoch": 1.6659219541257073, + "grad_norm": 0.029541015625, + "learning_rate": 0.024986595174262733, + "loss": 0.7833, + "num_input_tokens_seen": 6484800, + "step": 11185 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.04150390625, + "learning_rate": 0.02499776586237712, + "loss": 0.8148, + "num_input_tokens_seen": 6487488, + "step": 11190 + }, + { + "epoch": 1.6674113792076257, + "grad_norm": 0.034912109375, + "learning_rate": 0.02500893655049151, + "loss": 0.8082, + "num_input_tokens_seen": 6490048, + "step": 11195 + }, + { + "epoch": 1.668156091748585, + "grad_norm": 0.027099609375, + "learning_rate": 0.025020107238605895, + "loss": 0.8219, + "num_input_tokens_seen": 6493088, + "step": 11200 + }, + { + "epoch": 1.6689008042895441, + "grad_norm": 0.0250244140625, + "learning_rate": 0.025031277926720285, + "loss": 0.7929, + "num_input_tokens_seen": 6496128, + "step": 11205 + }, + { + "epoch": 1.6696455168305033, + "grad_norm": 0.0277099609375, + "learning_rate": 0.02504244861483467, + "loss": 0.82, + "num_input_tokens_seen": 6498976, + "step": 11210 + }, + { + "epoch": 1.6703902293714625, + "grad_norm": 0.021728515625, + "learning_rate": 0.02505361930294906, + "loss": 0.8304, + "num_input_tokens_seen": 6502208, + "step": 11215 + }, + { + "epoch": 1.6711349419124217, + "grad_norm": 0.017333984375, + "learning_rate": 0.025064789991063447, + "loss": 0.7828, + "num_input_tokens_seen": 6505408, + "step": 11220 + }, + { + "epoch": 1.671879654453381, + "grad_norm": 0.0240478515625, + "learning_rate": 0.025075960679177837, + "loss": 0.7947, + "num_input_tokens_seen": 6508416, + "step": 11225 + }, + { + "epoch": 1.6726243669943401, + "grad_norm": 0.0380859375, + "learning_rate": 0.025087131367292227, + "loss": 0.8031, + "num_input_tokens_seen": 6511232, + "step": 11230 + }, + { + "epoch": 1.6733690795352993, + "grad_norm": 0.0257568359375, + "learning_rate": 0.025098302055406613, + "loss": 0.7869, + "num_input_tokens_seen": 6514112, + "step": 11235 + }, + { + "epoch": 1.6741137920762585, + "grad_norm": 0.02490234375, + "learning_rate": 0.025109472743521002, + "loss": 0.807, + "num_input_tokens_seen": 6516928, + "step": 11240 + }, + { + "epoch": 1.6748585046172177, + "grad_norm": 0.038818359375, + "learning_rate": 0.02512064343163539, + "loss": 0.8038, + "num_input_tokens_seen": 6519616, + "step": 11245 + }, + { + "epoch": 1.675603217158177, + "grad_norm": 0.03564453125, + "learning_rate": 0.02513181411974978, + "loss": 0.7903, + "num_input_tokens_seen": 6522624, + "step": 11250 + }, + { + "epoch": 1.6763479296991362, + "grad_norm": 0.0281982421875, + "learning_rate": 0.025142984807864165, + "loss": 0.7936, + "num_input_tokens_seen": 6525312, + "step": 11255 + }, + { + "epoch": 1.6770926422400954, + "grad_norm": 0.0286865234375, + "learning_rate": 0.02515415549597855, + "loss": 0.8044, + "num_input_tokens_seen": 6528320, + "step": 11260 + }, + { + "epoch": 1.6778373547810546, + "grad_norm": 0.03515625, + "learning_rate": 0.025165326184092937, + "loss": 0.8296, + "num_input_tokens_seen": 6531168, + "step": 11265 + }, + { + "epoch": 1.6785820673220138, + "grad_norm": 0.017578125, + "learning_rate": 0.025176496872207327, + "loss": 0.8041, + "num_input_tokens_seen": 6534208, + "step": 11270 + }, + { + "epoch": 1.679326779862973, + "grad_norm": 0.0306396484375, + "learning_rate": 0.025187667560321713, + "loss": 0.7864, + "num_input_tokens_seen": 6536896, + "step": 11275 + }, + { + "epoch": 1.6800714924039322, + "grad_norm": 0.030029296875, + "learning_rate": 0.025198838248436103, + "loss": 0.7896, + "num_input_tokens_seen": 6539744, + "step": 11280 + }, + { + "epoch": 1.6808162049448914, + "grad_norm": 0.025390625, + "learning_rate": 0.02521000893655049, + "loss": 0.8109, + "num_input_tokens_seen": 6542336, + "step": 11285 + }, + { + "epoch": 1.6815609174858506, + "grad_norm": 0.03369140625, + "learning_rate": 0.02522117962466488, + "loss": 0.8033, + "num_input_tokens_seen": 6545632, + "step": 11290 + }, + { + "epoch": 1.6823056300268098, + "grad_norm": 0.04248046875, + "learning_rate": 0.025232350312779265, + "loss": 0.8156, + "num_input_tokens_seen": 6548352, + "step": 11295 + }, + { + "epoch": 1.683050342567769, + "grad_norm": 0.0299072265625, + "learning_rate": 0.025243521000893655, + "loss": 0.8032, + "num_input_tokens_seen": 6551488, + "step": 11300 + }, + { + "epoch": 1.6837950551087282, + "grad_norm": 0.03369140625, + "learning_rate": 0.02525469168900804, + "loss": 0.8146, + "num_input_tokens_seen": 6554624, + "step": 11305 + }, + { + "epoch": 1.6845397676496874, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02526586237712243, + "loss": 0.8084, + "num_input_tokens_seen": 6557728, + "step": 11310 + }, + { + "epoch": 1.6852844801906464, + "grad_norm": 0.05517578125, + "learning_rate": 0.025277033065236817, + "loss": 0.8448, + "num_input_tokens_seen": 6560800, + "step": 11315 + }, + { + "epoch": 1.6860291927316056, + "grad_norm": 0.0390625, + "learning_rate": 0.025288203753351206, + "loss": 0.8077, + "num_input_tokens_seen": 6563776, + "step": 11320 + }, + { + "epoch": 1.6867739052725648, + "grad_norm": 0.033203125, + "learning_rate": 0.025299374441465593, + "loss": 0.8052, + "num_input_tokens_seen": 6566624, + "step": 11325 + }, + { + "epoch": 1.687518617813524, + "grad_norm": 0.055908203125, + "learning_rate": 0.025310545129579982, + "loss": 0.7815, + "num_input_tokens_seen": 6570048, + "step": 11330 + }, + { + "epoch": 1.6882633303544832, + "grad_norm": 0.0283203125, + "learning_rate": 0.02532171581769437, + "loss": 0.8, + "num_input_tokens_seen": 6572800, + "step": 11335 + }, + { + "epoch": 1.6890080428954424, + "grad_norm": 0.0322265625, + "learning_rate": 0.02533288650580876, + "loss": 0.8043, + "num_input_tokens_seen": 6575712, + "step": 11340 + }, + { + "epoch": 1.6897527554364016, + "grad_norm": 0.01904296875, + "learning_rate": 0.025344057193923145, + "loss": 0.8299, + "num_input_tokens_seen": 6578688, + "step": 11345 + }, + { + "epoch": 1.6904974679773608, + "grad_norm": 0.031494140625, + "learning_rate": 0.02535522788203753, + "loss": 0.8143, + "num_input_tokens_seen": 6581568, + "step": 11350 + }, + { + "epoch": 1.69124218051832, + "grad_norm": 0.034912109375, + "learning_rate": 0.02536639857015192, + "loss": 0.8034, + "num_input_tokens_seen": 6584672, + "step": 11355 + }, + { + "epoch": 1.691986893059279, + "grad_norm": 0.0299072265625, + "learning_rate": 0.025377569258266307, + "loss": 0.7935, + "num_input_tokens_seen": 6587552, + "step": 11360 + }, + { + "epoch": 1.6927316056002382, + "grad_norm": 0.033447265625, + "learning_rate": 0.025388739946380696, + "loss": 0.8024, + "num_input_tokens_seen": 6590624, + "step": 11365 + }, + { + "epoch": 1.6934763181411974, + "grad_norm": 0.038330078125, + "learning_rate": 0.025399910634495083, + "loss": 0.8132, + "num_input_tokens_seen": 6593728, + "step": 11370 + }, + { + "epoch": 1.6942210306821566, + "grad_norm": 0.015869140625, + "learning_rate": 0.025411081322609472, + "loss": 0.7905, + "num_input_tokens_seen": 6596480, + "step": 11375 + }, + { + "epoch": 1.6949657432231158, + "grad_norm": 0.02734375, + "learning_rate": 0.02542225201072386, + "loss": 0.7911, + "num_input_tokens_seen": 6599360, + "step": 11380 + }, + { + "epoch": 1.695710455764075, + "grad_norm": 0.029052734375, + "learning_rate": 0.02543342269883825, + "loss": 0.7887, + "num_input_tokens_seen": 6602176, + "step": 11385 + }, + { + "epoch": 1.6964551683050342, + "grad_norm": 0.0277099609375, + "learning_rate": 0.025444593386952635, + "loss": 0.7875, + "num_input_tokens_seen": 6605024, + "step": 11390 + }, + { + "epoch": 1.6971998808459934, + "grad_norm": 0.026611328125, + "learning_rate": 0.025455764075067024, + "loss": 0.8047, + "num_input_tokens_seen": 6607744, + "step": 11395 + }, + { + "epoch": 1.6979445933869526, + "grad_norm": 0.0308837890625, + "learning_rate": 0.02546693476318141, + "loss": 0.7841, + "num_input_tokens_seen": 6610848, + "step": 11400 + }, + { + "epoch": 1.6986893059279118, + "grad_norm": 0.041748046875, + "learning_rate": 0.0254781054512958, + "loss": 0.8048, + "num_input_tokens_seen": 6613760, + "step": 11405 + }, + { + "epoch": 1.699434018468871, + "grad_norm": 0.042724609375, + "learning_rate": 0.025489276139410186, + "loss": 0.8255, + "num_input_tokens_seen": 6617248, + "step": 11410 + }, + { + "epoch": 1.7001787310098302, + "grad_norm": 0.02587890625, + "learning_rate": 0.025500446827524576, + "loss": 0.8169, + "num_input_tokens_seen": 6619904, + "step": 11415 + }, + { + "epoch": 1.7009234435507894, + "grad_norm": 0.0303955078125, + "learning_rate": 0.025511617515638962, + "loss": 0.794, + "num_input_tokens_seen": 6622528, + "step": 11420 + }, + { + "epoch": 1.7016681560917486, + "grad_norm": 0.0269775390625, + "learning_rate": 0.025522788203753352, + "loss": 0.8136, + "num_input_tokens_seen": 6625504, + "step": 11425 + }, + { + "epoch": 1.7024128686327078, + "grad_norm": 0.0244140625, + "learning_rate": 0.02553395889186774, + "loss": 0.7905, + "num_input_tokens_seen": 6628320, + "step": 11430 + }, + { + "epoch": 1.703157581173667, + "grad_norm": 0.0164794921875, + "learning_rate": 0.025545129579982125, + "loss": 0.8037, + "num_input_tokens_seen": 6631328, + "step": 11435 + }, + { + "epoch": 1.7039022937146262, + "grad_norm": 0.0247802734375, + "learning_rate": 0.02555630026809651, + "loss": 0.8093, + "num_input_tokens_seen": 6634208, + "step": 11440 + }, + { + "epoch": 1.7046470062555854, + "grad_norm": 0.0169677734375, + "learning_rate": 0.0255674709562109, + "loss": 0.8003, + "num_input_tokens_seen": 6637216, + "step": 11445 + }, + { + "epoch": 1.7053917187965446, + "grad_norm": 0.0252685546875, + "learning_rate": 0.02557864164432529, + "loss": 0.8164, + "num_input_tokens_seen": 6639936, + "step": 11450 + }, + { + "epoch": 1.7061364313375038, + "grad_norm": 0.054443359375, + "learning_rate": 0.025589812332439676, + "loss": 0.8368, + "num_input_tokens_seen": 6643712, + "step": 11455 + }, + { + "epoch": 1.706881143878463, + "grad_norm": 0.0361328125, + "learning_rate": 0.025600983020554066, + "loss": 0.7764, + "num_input_tokens_seen": 6646688, + "step": 11460 + }, + { + "epoch": 1.7076258564194222, + "grad_norm": 0.028564453125, + "learning_rate": 0.025612153708668452, + "loss": 0.7898, + "num_input_tokens_seen": 6649728, + "step": 11465 + }, + { + "epoch": 1.7083705689603814, + "grad_norm": 0.025634765625, + "learning_rate": 0.025623324396782842, + "loss": 0.7987, + "num_input_tokens_seen": 6652640, + "step": 11470 + }, + { + "epoch": 1.7091152815013406, + "grad_norm": 0.0286865234375, + "learning_rate": 0.02563449508489723, + "loss": 0.7962, + "num_input_tokens_seen": 6655872, + "step": 11475 + }, + { + "epoch": 1.7098599940422998, + "grad_norm": 0.0267333984375, + "learning_rate": 0.025645665773011618, + "loss": 0.796, + "num_input_tokens_seen": 6658848, + "step": 11480 + }, + { + "epoch": 1.710604706583259, + "grad_norm": 0.015869140625, + "learning_rate": 0.025656836461126004, + "loss": 0.8027, + "num_input_tokens_seen": 6661760, + "step": 11485 + }, + { + "epoch": 1.711349419124218, + "grad_norm": 0.033203125, + "learning_rate": 0.025668007149240394, + "loss": 0.7857, + "num_input_tokens_seen": 6664608, + "step": 11490 + }, + { + "epoch": 1.7120941316651772, + "grad_norm": 0.0279541015625, + "learning_rate": 0.02567917783735478, + "loss": 0.8198, + "num_input_tokens_seen": 6667744, + "step": 11495 + }, + { + "epoch": 1.7128388442061364, + "grad_norm": 0.040771484375, + "learning_rate": 0.02569034852546917, + "loss": 0.8151, + "num_input_tokens_seen": 6670816, + "step": 11500 + }, + { + "epoch": 1.7135835567470956, + "grad_norm": 0.052490234375, + "learning_rate": 0.025701519213583556, + "loss": 0.7766, + "num_input_tokens_seen": 6673664, + "step": 11505 + }, + { + "epoch": 1.7143282692880548, + "grad_norm": 0.0137939453125, + "learning_rate": 0.025712689901697946, + "loss": 0.8152, + "num_input_tokens_seen": 6676640, + "step": 11510 + }, + { + "epoch": 1.715072981829014, + "grad_norm": 0.02294921875, + "learning_rate": 0.025723860589812332, + "loss": 0.7937, + "num_input_tokens_seen": 6679424, + "step": 11515 + }, + { + "epoch": 1.7158176943699732, + "grad_norm": 0.024169921875, + "learning_rate": 0.02573503127792672, + "loss": 0.8008, + "num_input_tokens_seen": 6682464, + "step": 11520 + }, + { + "epoch": 1.7165624069109324, + "grad_norm": 0.0244140625, + "learning_rate": 0.025746201966041105, + "loss": 0.8215, + "num_input_tokens_seen": 6685280, + "step": 11525 + }, + { + "epoch": 1.7173071194518914, + "grad_norm": 0.02587890625, + "learning_rate": 0.025757372654155494, + "loss": 0.8138, + "num_input_tokens_seen": 6688160, + "step": 11530 + }, + { + "epoch": 1.7180518319928506, + "grad_norm": 0.026123046875, + "learning_rate": 0.02576854334226988, + "loss": 0.8171, + "num_input_tokens_seen": 6691200, + "step": 11535 + }, + { + "epoch": 1.7187965445338098, + "grad_norm": 0.0169677734375, + "learning_rate": 0.02577971403038427, + "loss": 0.8028, + "num_input_tokens_seen": 6693888, + "step": 11540 + }, + { + "epoch": 1.719541257074769, + "grad_norm": 0.0245361328125, + "learning_rate": 0.025790884718498656, + "loss": 0.8075, + "num_input_tokens_seen": 6696768, + "step": 11545 + }, + { + "epoch": 1.7202859696157282, + "grad_norm": 0.0311279296875, + "learning_rate": 0.025802055406613046, + "loss": 0.812, + "num_input_tokens_seen": 6699808, + "step": 11550 + }, + { + "epoch": 1.7210306821566874, + "grad_norm": 0.0264892578125, + "learning_rate": 0.025813226094727432, + "loss": 0.8182, + "num_input_tokens_seen": 6702784, + "step": 11555 + }, + { + "epoch": 1.7217753946976466, + "grad_norm": 0.032958984375, + "learning_rate": 0.025824396782841822, + "loss": 0.8115, + "num_input_tokens_seen": 6705760, + "step": 11560 + }, + { + "epoch": 1.7225201072386058, + "grad_norm": 0.02734375, + "learning_rate": 0.025835567470956212, + "loss": 0.8103, + "num_input_tokens_seen": 6708736, + "step": 11565 + }, + { + "epoch": 1.723264819779565, + "grad_norm": 0.0167236328125, + "learning_rate": 0.025846738159070598, + "loss": 0.7983, + "num_input_tokens_seen": 6711392, + "step": 11570 + }, + { + "epoch": 1.7240095323205242, + "grad_norm": 0.044921875, + "learning_rate": 0.025857908847184988, + "loss": 0.8038, + "num_input_tokens_seen": 6714496, + "step": 11575 + }, + { + "epoch": 1.7247542448614834, + "grad_norm": 0.0186767578125, + "learning_rate": 0.025869079535299374, + "loss": 0.8061, + "num_input_tokens_seen": 6717280, + "step": 11580 + }, + { + "epoch": 1.7254989574024426, + "grad_norm": 0.0303955078125, + "learning_rate": 0.025880250223413764, + "loss": 0.7975, + "num_input_tokens_seen": 6719904, + "step": 11585 + }, + { + "epoch": 1.7262436699434018, + "grad_norm": 0.033935546875, + "learning_rate": 0.02589142091152815, + "loss": 0.8122, + "num_input_tokens_seen": 6722560, + "step": 11590 + }, + { + "epoch": 1.726988382484361, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02590259159964254, + "loss": 0.8012, + "num_input_tokens_seen": 6725344, + "step": 11595 + }, + { + "epoch": 1.7277330950253202, + "grad_norm": 0.03125, + "learning_rate": 0.025913762287756926, + "loss": 0.7941, + "num_input_tokens_seen": 6728384, + "step": 11600 + }, + { + "epoch": 1.7284778075662794, + "grad_norm": 0.031494140625, + "learning_rate": 0.025924932975871312, + "loss": 0.8019, + "num_input_tokens_seen": 6731360, + "step": 11605 + }, + { + "epoch": 1.7292225201072386, + "grad_norm": 0.0458984375, + "learning_rate": 0.0259361036639857, + "loss": 0.8117, + "num_input_tokens_seen": 6734016, + "step": 11610 + }, + { + "epoch": 1.7299672326481979, + "grad_norm": 0.029296875, + "learning_rate": 0.025947274352100088, + "loss": 0.7859, + "num_input_tokens_seen": 6736896, + "step": 11615 + }, + { + "epoch": 1.730711945189157, + "grad_norm": 0.032470703125, + "learning_rate": 0.025958445040214474, + "loss": 0.8271, + "num_input_tokens_seen": 6739744, + "step": 11620 + }, + { + "epoch": 1.7314566577301163, + "grad_norm": 0.04296875, + "learning_rate": 0.025969615728328864, + "loss": 0.8055, + "num_input_tokens_seen": 6742528, + "step": 11625 + }, + { + "epoch": 1.7322013702710755, + "grad_norm": 0.041015625, + "learning_rate": 0.02598078641644325, + "loss": 0.8102, + "num_input_tokens_seen": 6745472, + "step": 11630 + }, + { + "epoch": 1.7329460828120347, + "grad_norm": 0.02978515625, + "learning_rate": 0.02599195710455764, + "loss": 0.7941, + "num_input_tokens_seen": 6748736, + "step": 11635 + }, + { + "epoch": 1.7336907953529939, + "grad_norm": 0.04736328125, + "learning_rate": 0.026003127792672026, + "loss": 0.8012, + "num_input_tokens_seen": 6751424, + "step": 11640 + }, + { + "epoch": 1.734435507893953, + "grad_norm": 0.0311279296875, + "learning_rate": 0.026014298480786416, + "loss": 0.8055, + "num_input_tokens_seen": 6754176, + "step": 11645 + }, + { + "epoch": 1.7351802204349123, + "grad_norm": 0.0274658203125, + "learning_rate": 0.026025469168900802, + "loss": 0.8138, + "num_input_tokens_seen": 6757024, + "step": 11650 + }, + { + "epoch": 1.7359249329758715, + "grad_norm": 0.03173828125, + "learning_rate": 0.026036639857015192, + "loss": 0.8029, + "num_input_tokens_seen": 6759872, + "step": 11655 + }, + { + "epoch": 1.7366696455168305, + "grad_norm": 0.033935546875, + "learning_rate": 0.026047810545129578, + "loss": 0.8245, + "num_input_tokens_seen": 6763104, + "step": 11660 + }, + { + "epoch": 1.7374143580577897, + "grad_norm": 0.031005859375, + "learning_rate": 0.026058981233243968, + "loss": 0.7938, + "num_input_tokens_seen": 6765824, + "step": 11665 + }, + { + "epoch": 1.7381590705987489, + "grad_norm": 0.029052734375, + "learning_rate": 0.026070151921358354, + "loss": 0.8036, + "num_input_tokens_seen": 6768672, + "step": 11670 + }, + { + "epoch": 1.738903783139708, + "grad_norm": 0.029541015625, + "learning_rate": 0.026081322609472744, + "loss": 0.7864, + "num_input_tokens_seen": 6771616, + "step": 11675 + }, + { + "epoch": 1.7396484956806673, + "grad_norm": 0.033447265625, + "learning_rate": 0.026092493297587133, + "loss": 0.8252, + "num_input_tokens_seen": 6774272, + "step": 11680 + }, + { + "epoch": 1.7403932082216265, + "grad_norm": 0.035400390625, + "learning_rate": 0.02610366398570152, + "loss": 0.8147, + "num_input_tokens_seen": 6777280, + "step": 11685 + }, + { + "epoch": 1.7411379207625857, + "grad_norm": 0.0286865234375, + "learning_rate": 0.026114834673815906, + "loss": 0.8218, + "num_input_tokens_seen": 6780096, + "step": 11690 + }, + { + "epoch": 1.7418826333035449, + "grad_norm": 0.0177001953125, + "learning_rate": 0.026126005361930292, + "loss": 0.7954, + "num_input_tokens_seen": 6783072, + "step": 11695 + }, + { + "epoch": 1.742627345844504, + "grad_norm": 0.01708984375, + "learning_rate": 0.026137176050044682, + "loss": 0.7967, + "num_input_tokens_seen": 6785792, + "step": 11700 + }, + { + "epoch": 1.743372058385463, + "grad_norm": 0.017822265625, + "learning_rate": 0.026148346738159068, + "loss": 0.7927, + "num_input_tokens_seen": 6788544, + "step": 11705 + }, + { + "epoch": 1.7441167709264223, + "grad_norm": 0.01611328125, + "learning_rate": 0.026159517426273458, + "loss": 0.8031, + "num_input_tokens_seen": 6791680, + "step": 11710 + }, + { + "epoch": 1.7448614834673815, + "grad_norm": 0.017822265625, + "learning_rate": 0.026170688114387844, + "loss": 0.8142, + "num_input_tokens_seen": 6794496, + "step": 11715 + }, + { + "epoch": 1.7456061960083407, + "grad_norm": 0.0147705078125, + "learning_rate": 0.026181858802502234, + "loss": 0.8062, + "num_input_tokens_seen": 6797408, + "step": 11720 + }, + { + "epoch": 1.7463509085492999, + "grad_norm": 0.01531982421875, + "learning_rate": 0.02619302949061662, + "loss": 0.8231, + "num_input_tokens_seen": 6800320, + "step": 11725 + }, + { + "epoch": 1.747095621090259, + "grad_norm": 0.0301513671875, + "learning_rate": 0.02620420017873101, + "loss": 0.8017, + "num_input_tokens_seen": 6803360, + "step": 11730 + }, + { + "epoch": 1.7478403336312183, + "grad_norm": 0.01483154296875, + "learning_rate": 0.026215370866845396, + "loss": 0.7974, + "num_input_tokens_seen": 6806592, + "step": 11735 + }, + { + "epoch": 1.7485850461721775, + "grad_norm": 0.024658203125, + "learning_rate": 0.026226541554959786, + "loss": 0.787, + "num_input_tokens_seen": 6809440, + "step": 11740 + }, + { + "epoch": 1.7493297587131367, + "grad_norm": 0.03125, + "learning_rate": 0.026237712243074172, + "loss": 0.8171, + "num_input_tokens_seen": 6812320, + "step": 11745 + }, + { + "epoch": 1.7500744712540959, + "grad_norm": 0.023681640625, + "learning_rate": 0.02624888293118856, + "loss": 0.8086, + "num_input_tokens_seen": 6814976, + "step": 11750 + }, + { + "epoch": 1.750819183795055, + "grad_norm": 0.013916015625, + "learning_rate": 0.026260053619302948, + "loss": 0.8181, + "num_input_tokens_seen": 6817728, + "step": 11755 + }, + { + "epoch": 1.7515638963360143, + "grad_norm": 0.014404296875, + "learning_rate": 0.026271224307417337, + "loss": 0.8046, + "num_input_tokens_seen": 6820672, + "step": 11760 + }, + { + "epoch": 1.7523086088769735, + "grad_norm": 0.01434326171875, + "learning_rate": 0.026282394995531724, + "loss": 0.7965, + "num_input_tokens_seen": 6823552, + "step": 11765 + }, + { + "epoch": 1.7530533214179327, + "grad_norm": 0.0260009765625, + "learning_rate": 0.026293565683646113, + "loss": 0.8012, + "num_input_tokens_seen": 6826976, + "step": 11770 + }, + { + "epoch": 1.7537980339588919, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0263047363717605, + "loss": 0.7925, + "num_input_tokens_seen": 6830048, + "step": 11775 + }, + { + "epoch": 1.754542746499851, + "grad_norm": 0.0264892578125, + "learning_rate": 0.026315907059874886, + "loss": 0.8143, + "num_input_tokens_seen": 6832896, + "step": 11780 + }, + { + "epoch": 1.7552874590408103, + "grad_norm": 0.0255126953125, + "learning_rate": 0.026327077747989275, + "loss": 0.7961, + "num_input_tokens_seen": 6835712, + "step": 11785 + }, + { + "epoch": 1.7560321715817695, + "grad_norm": 0.035400390625, + "learning_rate": 0.02633824843610366, + "loss": 0.8003, + "num_input_tokens_seen": 6838368, + "step": 11790 + }, + { + "epoch": 1.7567768841227287, + "grad_norm": 0.0390625, + "learning_rate": 0.02634941912421805, + "loss": 0.8096, + "num_input_tokens_seen": 6841216, + "step": 11795 + }, + { + "epoch": 1.757521596663688, + "grad_norm": 0.03564453125, + "learning_rate": 0.026360589812332438, + "loss": 0.8052, + "num_input_tokens_seen": 6844512, + "step": 11800 + }, + { + "epoch": 1.758266309204647, + "grad_norm": 0.0242919921875, + "learning_rate": 0.026371760500446827, + "loss": 0.7939, + "num_input_tokens_seen": 6847584, + "step": 11805 + }, + { + "epoch": 1.7590110217456063, + "grad_norm": 0.0238037109375, + "learning_rate": 0.026382931188561214, + "loss": 0.8115, + "num_input_tokens_seen": 6850528, + "step": 11810 + }, + { + "epoch": 1.7597557342865655, + "grad_norm": 0.025390625, + "learning_rate": 0.026394101876675603, + "loss": 0.7719, + "num_input_tokens_seen": 6853504, + "step": 11815 + }, + { + "epoch": 1.7605004468275247, + "grad_norm": 0.0228271484375, + "learning_rate": 0.02640527256478999, + "loss": 0.8071, + "num_input_tokens_seen": 6856064, + "step": 11820 + }, + { + "epoch": 1.761245159368484, + "grad_norm": 0.041748046875, + "learning_rate": 0.02641644325290438, + "loss": 0.7973, + "num_input_tokens_seen": 6859040, + "step": 11825 + }, + { + "epoch": 1.7619898719094431, + "grad_norm": 0.0303955078125, + "learning_rate": 0.026427613941018765, + "loss": 0.8343, + "num_input_tokens_seen": 6861888, + "step": 11830 + }, + { + "epoch": 1.762734584450402, + "grad_norm": 0.035400390625, + "learning_rate": 0.026438784629133155, + "loss": 0.8018, + "num_input_tokens_seen": 6864640, + "step": 11835 + }, + { + "epoch": 1.7634792969913613, + "grad_norm": 0.031005859375, + "learning_rate": 0.02644995531724754, + "loss": 0.8152, + "num_input_tokens_seen": 6867680, + "step": 11840 + }, + { + "epoch": 1.7642240095323205, + "grad_norm": 0.016845703125, + "learning_rate": 0.02646112600536193, + "loss": 0.8019, + "num_input_tokens_seen": 6870592, + "step": 11845 + }, + { + "epoch": 1.7649687220732797, + "grad_norm": 0.0281982421875, + "learning_rate": 0.026472296693476317, + "loss": 0.8024, + "num_input_tokens_seen": 6873344, + "step": 11850 + }, + { + "epoch": 1.765713434614239, + "grad_norm": 0.038330078125, + "learning_rate": 0.026483467381590707, + "loss": 0.7941, + "num_input_tokens_seen": 6876576, + "step": 11855 + }, + { + "epoch": 1.766458147155198, + "grad_norm": 0.024658203125, + "learning_rate": 0.026494638069705093, + "loss": 0.84, + "num_input_tokens_seen": 6879232, + "step": 11860 + }, + { + "epoch": 1.7672028596961573, + "grad_norm": 0.0400390625, + "learning_rate": 0.02650580875781948, + "loss": 0.7777, + "num_input_tokens_seen": 6882048, + "step": 11865 + }, + { + "epoch": 1.7679475722371165, + "grad_norm": 0.031982421875, + "learning_rate": 0.026516979445933866, + "loss": 0.8098, + "num_input_tokens_seen": 6884832, + "step": 11870 + }, + { + "epoch": 1.7686922847780755, + "grad_norm": 0.038330078125, + "learning_rate": 0.026528150134048255, + "loss": 0.7858, + "num_input_tokens_seen": 6887904, + "step": 11875 + }, + { + "epoch": 1.7694369973190347, + "grad_norm": 0.036376953125, + "learning_rate": 0.02653932082216264, + "loss": 0.7987, + "num_input_tokens_seen": 6890720, + "step": 11880 + }, + { + "epoch": 1.770181709859994, + "grad_norm": 0.025390625, + "learning_rate": 0.02655049151027703, + "loss": 0.8109, + "num_input_tokens_seen": 6893440, + "step": 11885 + }, + { + "epoch": 1.770926422400953, + "grad_norm": 0.0284423828125, + "learning_rate": 0.026561662198391418, + "loss": 0.7947, + "num_input_tokens_seen": 6896352, + "step": 11890 + }, + { + "epoch": 1.7716711349419123, + "grad_norm": 0.0263671875, + "learning_rate": 0.026572832886505807, + "loss": 0.8006, + "num_input_tokens_seen": 6898816, + "step": 11895 + }, + { + "epoch": 1.7724158474828715, + "grad_norm": 0.027587890625, + "learning_rate": 0.026584003574620197, + "loss": 0.8038, + "num_input_tokens_seen": 6901888, + "step": 11900 + }, + { + "epoch": 1.7731605600238307, + "grad_norm": 0.0162353515625, + "learning_rate": 0.026595174262734583, + "loss": 0.804, + "num_input_tokens_seen": 6904704, + "step": 11905 + }, + { + "epoch": 1.77390527256479, + "grad_norm": 0.03564453125, + "learning_rate": 0.026606344950848973, + "loss": 0.7797, + "num_input_tokens_seen": 6907648, + "step": 11910 + }, + { + "epoch": 1.7746499851057491, + "grad_norm": 0.0159912109375, + "learning_rate": 0.02661751563896336, + "loss": 0.8183, + "num_input_tokens_seen": 6910272, + "step": 11915 + }, + { + "epoch": 1.7753946976467083, + "grad_norm": 0.0244140625, + "learning_rate": 0.02662868632707775, + "loss": 0.8004, + "num_input_tokens_seen": 6913344, + "step": 11920 + }, + { + "epoch": 1.7761394101876675, + "grad_norm": 0.0238037109375, + "learning_rate": 0.026639857015192135, + "loss": 0.7971, + "num_input_tokens_seen": 6916224, + "step": 11925 + }, + { + "epoch": 1.7768841227286267, + "grad_norm": 0.0257568359375, + "learning_rate": 0.026651027703306525, + "loss": 0.8066, + "num_input_tokens_seen": 6919456, + "step": 11930 + }, + { + "epoch": 1.777628835269586, + "grad_norm": 0.057861328125, + "learning_rate": 0.02666219839142091, + "loss": 0.829, + "num_input_tokens_seen": 6922144, + "step": 11935 + }, + { + "epoch": 1.7783735478105451, + "grad_norm": 0.038818359375, + "learning_rate": 0.0266733690795353, + "loss": 0.7981, + "num_input_tokens_seen": 6925344, + "step": 11940 + }, + { + "epoch": 1.7791182603515043, + "grad_norm": 0.0279541015625, + "learning_rate": 0.026684539767649687, + "loss": 0.7888, + "num_input_tokens_seen": 6928224, + "step": 11945 + }, + { + "epoch": 1.7798629728924635, + "grad_norm": 0.01470947265625, + "learning_rate": 0.026695710455764073, + "loss": 0.8157, + "num_input_tokens_seen": 6931328, + "step": 11950 + }, + { + "epoch": 1.7806076854334227, + "grad_norm": 0.031982421875, + "learning_rate": 0.02670688114387846, + "loss": 0.8177, + "num_input_tokens_seen": 6934432, + "step": 11955 + }, + { + "epoch": 1.781352397974382, + "grad_norm": 0.0272216796875, + "learning_rate": 0.02671805183199285, + "loss": 0.8016, + "num_input_tokens_seen": 6937216, + "step": 11960 + }, + { + "epoch": 1.7820971105153411, + "grad_norm": 0.0257568359375, + "learning_rate": 0.026729222520107235, + "loss": 0.7729, + "num_input_tokens_seen": 6940032, + "step": 11965 + }, + { + "epoch": 1.7828418230563003, + "grad_norm": 0.0341796875, + "learning_rate": 0.026740393208221625, + "loss": 0.8054, + "num_input_tokens_seen": 6942816, + "step": 11970 + }, + { + "epoch": 1.7835865355972595, + "grad_norm": 0.0294189453125, + "learning_rate": 0.02675156389633601, + "loss": 0.7881, + "num_input_tokens_seen": 6945760, + "step": 11975 + }, + { + "epoch": 1.7843312481382188, + "grad_norm": 0.036865234375, + "learning_rate": 0.0267627345844504, + "loss": 0.8104, + "num_input_tokens_seen": 6948896, + "step": 11980 + }, + { + "epoch": 1.785075960679178, + "grad_norm": 0.033935546875, + "learning_rate": 0.026773905272564787, + "loss": 0.7859, + "num_input_tokens_seen": 6951904, + "step": 11985 + }, + { + "epoch": 1.7858206732201372, + "grad_norm": 0.029052734375, + "learning_rate": 0.026785075960679177, + "loss": 0.7984, + "num_input_tokens_seen": 6955104, + "step": 11990 + }, + { + "epoch": 1.7865653857610964, + "grad_norm": 0.0203857421875, + "learning_rate": 0.026796246648793563, + "loss": 0.8074, + "num_input_tokens_seen": 6958048, + "step": 11995 + }, + { + "epoch": 1.7873100983020556, + "grad_norm": 0.034912109375, + "learning_rate": 0.026807417336907953, + "loss": 0.7724, + "num_input_tokens_seen": 6960864, + "step": 12000 + }, + { + "epoch": 1.7880548108430145, + "grad_norm": 0.04150390625, + "learning_rate": 0.02681858802502234, + "loss": 0.7916, + "num_input_tokens_seen": 6963648, + "step": 12005 + }, + { + "epoch": 1.7887995233839737, + "grad_norm": 0.040283203125, + "learning_rate": 0.02682975871313673, + "loss": 0.7977, + "num_input_tokens_seen": 6966528, + "step": 12010 + }, + { + "epoch": 1.789544235924933, + "grad_norm": 0.0162353515625, + "learning_rate": 0.02684092940125112, + "loss": 0.7746, + "num_input_tokens_seen": 6969280, + "step": 12015 + }, + { + "epoch": 1.7902889484658921, + "grad_norm": 0.0498046875, + "learning_rate": 0.026852100089365505, + "loss": 0.8211, + "num_input_tokens_seen": 6972320, + "step": 12020 + }, + { + "epoch": 1.7910336610068514, + "grad_norm": 0.02392578125, + "learning_rate": 0.026863270777479895, + "loss": 0.7893, + "num_input_tokens_seen": 6975552, + "step": 12025 + }, + { + "epoch": 1.7917783735478106, + "grad_norm": 0.041748046875, + "learning_rate": 0.02687444146559428, + "loss": 0.8298, + "num_input_tokens_seen": 6978560, + "step": 12030 + }, + { + "epoch": 1.7925230860887698, + "grad_norm": 0.0341796875, + "learning_rate": 0.02688561215370867, + "loss": 0.7829, + "num_input_tokens_seen": 6981408, + "step": 12035 + }, + { + "epoch": 1.793267798629729, + "grad_norm": 0.0228271484375, + "learning_rate": 0.026896782841823053, + "loss": 0.8483, + "num_input_tokens_seen": 6984384, + "step": 12040 + }, + { + "epoch": 1.7940125111706882, + "grad_norm": 0.01458740234375, + "learning_rate": 0.026907953529937443, + "loss": 0.8288, + "num_input_tokens_seen": 6987104, + "step": 12045 + }, + { + "epoch": 1.7947572237116471, + "grad_norm": 0.042724609375, + "learning_rate": 0.02691912421805183, + "loss": 0.816, + "num_input_tokens_seen": 6990112, + "step": 12050 + }, + { + "epoch": 1.7955019362526063, + "grad_norm": 0.024169921875, + "learning_rate": 0.02693029490616622, + "loss": 0.7952, + "num_input_tokens_seen": 6992928, + "step": 12055 + }, + { + "epoch": 1.7962466487935655, + "grad_norm": 0.0322265625, + "learning_rate": 0.026941465594280605, + "loss": 0.8235, + "num_input_tokens_seen": 6995872, + "step": 12060 + }, + { + "epoch": 1.7969913613345248, + "grad_norm": 0.0228271484375, + "learning_rate": 0.026952636282394995, + "loss": 0.8129, + "num_input_tokens_seen": 6998784, + "step": 12065 + }, + { + "epoch": 1.797736073875484, + "grad_norm": 0.01953125, + "learning_rate": 0.02696380697050938, + "loss": 0.8034, + "num_input_tokens_seen": 7001728, + "step": 12070 + }, + { + "epoch": 1.7984807864164432, + "grad_norm": 0.036865234375, + "learning_rate": 0.02697497765862377, + "loss": 0.7925, + "num_input_tokens_seen": 7004672, + "step": 12075 + }, + { + "epoch": 1.7992254989574024, + "grad_norm": 0.025390625, + "learning_rate": 0.026986148346738157, + "loss": 0.8014, + "num_input_tokens_seen": 7007712, + "step": 12080 + }, + { + "epoch": 1.7999702114983616, + "grad_norm": 0.03564453125, + "learning_rate": 0.026997319034852547, + "loss": 0.8019, + "num_input_tokens_seen": 7010656, + "step": 12085 + }, + { + "epoch": 1.8007149240393208, + "grad_norm": 0.0262451171875, + "learning_rate": 0.027008489722966933, + "loss": 0.7922, + "num_input_tokens_seen": 7013472, + "step": 12090 + }, + { + "epoch": 1.80145963658028, + "grad_norm": 0.032470703125, + "learning_rate": 0.027019660411081323, + "loss": 0.7822, + "num_input_tokens_seen": 7016352, + "step": 12095 + }, + { + "epoch": 1.8022043491212392, + "grad_norm": 0.039306640625, + "learning_rate": 0.02703083109919571, + "loss": 0.8175, + "num_input_tokens_seen": 7019168, + "step": 12100 + }, + { + "epoch": 1.8029490616621984, + "grad_norm": 0.0155029296875, + "learning_rate": 0.0270420017873101, + "loss": 0.7958, + "num_input_tokens_seen": 7022400, + "step": 12105 + }, + { + "epoch": 1.8036937742031576, + "grad_norm": 0.0150146484375, + "learning_rate": 0.027053172475424485, + "loss": 0.82, + "num_input_tokens_seen": 7025440, + "step": 12110 + }, + { + "epoch": 1.8044384867441168, + "grad_norm": 0.0146484375, + "learning_rate": 0.027064343163538875, + "loss": 0.8099, + "num_input_tokens_seen": 7028320, + "step": 12115 + }, + { + "epoch": 1.805183199285076, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02707551385165326, + "loss": 0.8102, + "num_input_tokens_seen": 7031168, + "step": 12120 + }, + { + "epoch": 1.8059279118260352, + "grad_norm": 0.0157470703125, + "learning_rate": 0.027086684539767647, + "loss": 0.8068, + "num_input_tokens_seen": 7033888, + "step": 12125 + }, + { + "epoch": 1.8066726243669944, + "grad_norm": 0.0228271484375, + "learning_rate": 0.027097855227882037, + "loss": 0.7879, + "num_input_tokens_seen": 7036544, + "step": 12130 + }, + { + "epoch": 1.8074173369079536, + "grad_norm": 0.022705078125, + "learning_rate": 0.027109025915996423, + "loss": 0.8094, + "num_input_tokens_seen": 7039264, + "step": 12135 + }, + { + "epoch": 1.8081620494489128, + "grad_norm": 0.02197265625, + "learning_rate": 0.027120196604110813, + "loss": 0.7859, + "num_input_tokens_seen": 7041920, + "step": 12140 + }, + { + "epoch": 1.808906761989872, + "grad_norm": 0.03076171875, + "learning_rate": 0.0271313672922252, + "loss": 0.8206, + "num_input_tokens_seen": 7044800, + "step": 12145 + }, + { + "epoch": 1.8096514745308312, + "grad_norm": 0.013671875, + "learning_rate": 0.02714253798033959, + "loss": 0.8075, + "num_input_tokens_seen": 7047968, + "step": 12150 + }, + { + "epoch": 1.8103961870717904, + "grad_norm": 0.0250244140625, + "learning_rate": 0.027153708668453975, + "loss": 0.8136, + "num_input_tokens_seen": 7050784, + "step": 12155 + }, + { + "epoch": 1.8111408996127496, + "grad_norm": 0.0262451171875, + "learning_rate": 0.027164879356568365, + "loss": 0.7887, + "num_input_tokens_seen": 7053632, + "step": 12160 + }, + { + "epoch": 1.8118856121537088, + "grad_norm": 0.0263671875, + "learning_rate": 0.02717605004468275, + "loss": 0.8071, + "num_input_tokens_seen": 7056736, + "step": 12165 + }, + { + "epoch": 1.812630324694668, + "grad_norm": 0.022705078125, + "learning_rate": 0.02718722073279714, + "loss": 0.8002, + "num_input_tokens_seen": 7059488, + "step": 12170 + }, + { + "epoch": 1.8133750372356272, + "grad_norm": 0.0189208984375, + "learning_rate": 0.027198391420911527, + "loss": 0.7941, + "num_input_tokens_seen": 7063616, + "step": 12175 + }, + { + "epoch": 1.8141197497765862, + "grad_norm": 0.01611328125, + "learning_rate": 0.027209562109025916, + "loss": 0.8149, + "num_input_tokens_seen": 7066720, + "step": 12180 + }, + { + "epoch": 1.8148644623175454, + "grad_norm": 0.01397705078125, + "learning_rate": 0.027220732797140303, + "loss": 0.8184, + "num_input_tokens_seen": 7069568, + "step": 12185 + }, + { + "epoch": 1.8156091748585046, + "grad_norm": 0.0218505859375, + "learning_rate": 0.027231903485254692, + "loss": 0.8065, + "num_input_tokens_seen": 7072320, + "step": 12190 + }, + { + "epoch": 1.8163538873994638, + "grad_norm": 0.01513671875, + "learning_rate": 0.02724307417336908, + "loss": 0.8163, + "num_input_tokens_seen": 7075264, + "step": 12195 + }, + { + "epoch": 1.817098599940423, + "grad_norm": 0.03125, + "learning_rate": 0.02725424486148347, + "loss": 0.8264, + "num_input_tokens_seen": 7078272, + "step": 12200 + }, + { + "epoch": 1.8178433124813822, + "grad_norm": 0.02783203125, + "learning_rate": 0.027265415549597855, + "loss": 0.7874, + "num_input_tokens_seen": 7081088, + "step": 12205 + }, + { + "epoch": 1.8185880250223414, + "grad_norm": 0.0240478515625, + "learning_rate": 0.02727658623771224, + "loss": 0.7959, + "num_input_tokens_seen": 7084032, + "step": 12210 + }, + { + "epoch": 1.8193327375633006, + "grad_norm": 0.013916015625, + "learning_rate": 0.027287756925826627, + "loss": 0.8006, + "num_input_tokens_seen": 7087104, + "step": 12215 + }, + { + "epoch": 1.8200774501042598, + "grad_norm": 0.0255126953125, + "learning_rate": 0.027298927613941017, + "loss": 0.7907, + "num_input_tokens_seen": 7089920, + "step": 12220 + }, + { + "epoch": 1.8208221626452188, + "grad_norm": 0.0255126953125, + "learning_rate": 0.027310098302055403, + "loss": 0.7835, + "num_input_tokens_seen": 7093056, + "step": 12225 + }, + { + "epoch": 1.821566875186178, + "grad_norm": 0.029052734375, + "learning_rate": 0.027321268990169793, + "loss": 0.8025, + "num_input_tokens_seen": 7095584, + "step": 12230 + }, + { + "epoch": 1.8223115877271372, + "grad_norm": 0.0294189453125, + "learning_rate": 0.027332439678284182, + "loss": 0.8007, + "num_input_tokens_seen": 7098464, + "step": 12235 + }, + { + "epoch": 1.8230563002680964, + "grad_norm": 0.025390625, + "learning_rate": 0.02734361036639857, + "loss": 0.7971, + "num_input_tokens_seen": 7101440, + "step": 12240 + }, + { + "epoch": 1.8238010128090556, + "grad_norm": 0.021240234375, + "learning_rate": 0.02735478105451296, + "loss": 0.8042, + "num_input_tokens_seen": 7104224, + "step": 12245 + }, + { + "epoch": 1.8245457253500148, + "grad_norm": 0.0250244140625, + "learning_rate": 0.027365951742627345, + "loss": 0.7772, + "num_input_tokens_seen": 7107488, + "step": 12250 + }, + { + "epoch": 1.825290437890974, + "grad_norm": 0.032958984375, + "learning_rate": 0.027377122430741734, + "loss": 0.8136, + "num_input_tokens_seen": 7110176, + "step": 12255 + }, + { + "epoch": 1.8260351504319332, + "grad_norm": 0.0322265625, + "learning_rate": 0.02738829311885612, + "loss": 0.8029, + "num_input_tokens_seen": 7113056, + "step": 12260 + }, + { + "epoch": 1.8267798629728924, + "grad_norm": 0.02783203125, + "learning_rate": 0.02739946380697051, + "loss": 0.8374, + "num_input_tokens_seen": 7115840, + "step": 12265 + }, + { + "epoch": 1.8275245755138516, + "grad_norm": 0.036865234375, + "learning_rate": 0.027410634495084896, + "loss": 0.7896, + "num_input_tokens_seen": 7118880, + "step": 12270 + }, + { + "epoch": 1.8282692880548108, + "grad_norm": 0.036865234375, + "learning_rate": 0.027421805183199286, + "loss": 0.7931, + "num_input_tokens_seen": 7121664, + "step": 12275 + }, + { + "epoch": 1.82901400059577, + "grad_norm": 0.016845703125, + "learning_rate": 0.027432975871313672, + "loss": 0.8006, + "num_input_tokens_seen": 7124480, + "step": 12280 + }, + { + "epoch": 1.8297587131367292, + "grad_norm": 0.022216796875, + "learning_rate": 0.027444146559428062, + "loss": 0.7857, + "num_input_tokens_seen": 7127424, + "step": 12285 + }, + { + "epoch": 1.8305034256776884, + "grad_norm": 0.0303955078125, + "learning_rate": 0.027455317247542448, + "loss": 0.8023, + "num_input_tokens_seen": 7130208, + "step": 12290 + }, + { + "epoch": 1.8312481382186476, + "grad_norm": 0.034912109375, + "learning_rate": 0.027466487935656838, + "loss": 0.782, + "num_input_tokens_seen": 7133344, + "step": 12295 + }, + { + "epoch": 1.8319928507596068, + "grad_norm": 0.0272216796875, + "learning_rate": 0.02747765862377122, + "loss": 0.8197, + "num_input_tokens_seen": 7136416, + "step": 12300 + }, + { + "epoch": 1.832737563300566, + "grad_norm": 0.03369140625, + "learning_rate": 0.02748882931188561, + "loss": 0.8038, + "num_input_tokens_seen": 7139168, + "step": 12305 + }, + { + "epoch": 1.8334822758415252, + "grad_norm": 0.0279541015625, + "learning_rate": 0.027499999999999997, + "loss": 0.8115, + "num_input_tokens_seen": 7142080, + "step": 12310 + }, + { + "epoch": 1.8342269883824844, + "grad_norm": 0.041259765625, + "learning_rate": 0.027511170688114386, + "loss": 0.8199, + "num_input_tokens_seen": 7144832, + "step": 12315 + }, + { + "epoch": 1.8349717009234436, + "grad_norm": 0.034912109375, + "learning_rate": 0.027522341376228773, + "loss": 0.8291, + "num_input_tokens_seen": 7147840, + "step": 12320 + }, + { + "epoch": 1.8357164134644028, + "grad_norm": 0.0322265625, + "learning_rate": 0.027533512064343162, + "loss": 0.8064, + "num_input_tokens_seen": 7150912, + "step": 12325 + }, + { + "epoch": 1.836461126005362, + "grad_norm": 0.0147705078125, + "learning_rate": 0.02754468275245755, + "loss": 0.8011, + "num_input_tokens_seen": 7153952, + "step": 12330 + }, + { + "epoch": 1.8372058385463212, + "grad_norm": 0.0184326171875, + "learning_rate": 0.027555853440571938, + "loss": 0.8165, + "num_input_tokens_seen": 7156864, + "step": 12335 + }, + { + "epoch": 1.8379505510872804, + "grad_norm": 0.0281982421875, + "learning_rate": 0.027567024128686324, + "loss": 0.81, + "num_input_tokens_seen": 7159872, + "step": 12340 + }, + { + "epoch": 1.8386952636282397, + "grad_norm": 0.039794921875, + "learning_rate": 0.027578194816800714, + "loss": 0.8096, + "num_input_tokens_seen": 7162944, + "step": 12345 + }, + { + "epoch": 1.8394399761691989, + "grad_norm": 0.01470947265625, + "learning_rate": 0.027589365504915104, + "loss": 0.7984, + "num_input_tokens_seen": 7165952, + "step": 12350 + }, + { + "epoch": 1.8401846887101578, + "grad_norm": 0.035888671875, + "learning_rate": 0.02760053619302949, + "loss": 0.8148, + "num_input_tokens_seen": 7168768, + "step": 12355 + }, + { + "epoch": 1.840929401251117, + "grad_norm": 0.03564453125, + "learning_rate": 0.02761170688114388, + "loss": 0.8088, + "num_input_tokens_seen": 7171744, + "step": 12360 + }, + { + "epoch": 1.8416741137920762, + "grad_norm": 0.037353515625, + "learning_rate": 0.027622877569258266, + "loss": 0.8026, + "num_input_tokens_seen": 7174816, + "step": 12365 + }, + { + "epoch": 1.8424188263330354, + "grad_norm": 0.0284423828125, + "learning_rate": 0.027634048257372656, + "loss": 0.803, + "num_input_tokens_seen": 7177792, + "step": 12370 + }, + { + "epoch": 1.8431635388739946, + "grad_norm": 0.026611328125, + "learning_rate": 0.027645218945487042, + "loss": 0.808, + "num_input_tokens_seen": 7180704, + "step": 12375 + }, + { + "epoch": 1.8439082514149538, + "grad_norm": 0.027099609375, + "learning_rate": 0.02765638963360143, + "loss": 0.792, + "num_input_tokens_seen": 7183648, + "step": 12380 + }, + { + "epoch": 1.844652963955913, + "grad_norm": 0.025146484375, + "learning_rate": 0.027667560321715814, + "loss": 0.8017, + "num_input_tokens_seen": 7186592, + "step": 12385 + }, + { + "epoch": 1.8453976764968723, + "grad_norm": 0.03173828125, + "learning_rate": 0.027678731009830204, + "loss": 0.794, + "num_input_tokens_seen": 7189280, + "step": 12390 + }, + { + "epoch": 1.8461423890378312, + "grad_norm": 0.032958984375, + "learning_rate": 0.02768990169794459, + "loss": 0.7928, + "num_input_tokens_seen": 7192160, + "step": 12395 + }, + { + "epoch": 1.8468871015787904, + "grad_norm": 0.0380859375, + "learning_rate": 0.02770107238605898, + "loss": 0.814, + "num_input_tokens_seen": 7195232, + "step": 12400 + }, + { + "epoch": 1.8476318141197496, + "grad_norm": 0.033203125, + "learning_rate": 0.027712243074173366, + "loss": 0.7996, + "num_input_tokens_seen": 7198112, + "step": 12405 + }, + { + "epoch": 1.8483765266607088, + "grad_norm": 0.03515625, + "learning_rate": 0.027723413762287756, + "loss": 0.8092, + "num_input_tokens_seen": 7201056, + "step": 12410 + }, + { + "epoch": 1.849121239201668, + "grad_norm": 0.0267333984375, + "learning_rate": 0.027734584450402142, + "loss": 0.764, + "num_input_tokens_seen": 7203968, + "step": 12415 + }, + { + "epoch": 1.8498659517426272, + "grad_norm": 0.033203125, + "learning_rate": 0.027745755138516532, + "loss": 0.8104, + "num_input_tokens_seen": 7206976, + "step": 12420 + }, + { + "epoch": 1.8506106642835864, + "grad_norm": 0.0306396484375, + "learning_rate": 0.027756925826630918, + "loss": 0.8061, + "num_input_tokens_seen": 7209920, + "step": 12425 + }, + { + "epoch": 1.8513553768245457, + "grad_norm": 0.07470703125, + "learning_rate": 0.027768096514745308, + "loss": 0.8215, + "num_input_tokens_seen": 7212640, + "step": 12430 + }, + { + "epoch": 1.8521000893655049, + "grad_norm": 0.031982421875, + "learning_rate": 0.027779267202859694, + "loss": 0.7963, + "num_input_tokens_seen": 7215328, + "step": 12435 + }, + { + "epoch": 1.852844801906464, + "grad_norm": 0.016357421875, + "learning_rate": 0.027790437890974084, + "loss": 0.8068, + "num_input_tokens_seen": 7218080, + "step": 12440 + }, + { + "epoch": 1.8535895144474233, + "grad_norm": 0.033447265625, + "learning_rate": 0.02780160857908847, + "loss": 0.8234, + "num_input_tokens_seen": 7220832, + "step": 12445 + }, + { + "epoch": 1.8543342269883825, + "grad_norm": 0.034423828125, + "learning_rate": 0.02781277926720286, + "loss": 0.8175, + "num_input_tokens_seen": 7223680, + "step": 12450 + }, + { + "epoch": 1.8550789395293417, + "grad_norm": 0.0181884765625, + "learning_rate": 0.027823949955317246, + "loss": 0.8169, + "num_input_tokens_seen": 7226368, + "step": 12455 + }, + { + "epoch": 1.8558236520703009, + "grad_norm": 0.041015625, + "learning_rate": 0.027835120643431636, + "loss": 0.8053, + "num_input_tokens_seen": 7229120, + "step": 12460 + }, + { + "epoch": 1.85656836461126, + "grad_norm": 0.0244140625, + "learning_rate": 0.027846291331546025, + "loss": 0.8068, + "num_input_tokens_seen": 7232000, + "step": 12465 + }, + { + "epoch": 1.8573130771522193, + "grad_norm": 0.032958984375, + "learning_rate": 0.027857462019660408, + "loss": 0.7882, + "num_input_tokens_seen": 7234784, + "step": 12470 + }, + { + "epoch": 1.8580577896931785, + "grad_norm": 0.024169921875, + "learning_rate": 0.027868632707774798, + "loss": 0.8048, + "num_input_tokens_seen": 7237632, + "step": 12475 + }, + { + "epoch": 1.8588025022341377, + "grad_norm": 0.0272216796875, + "learning_rate": 0.027879803395889184, + "loss": 0.796, + "num_input_tokens_seen": 7240608, + "step": 12480 + }, + { + "epoch": 1.8595472147750969, + "grad_norm": 0.03271484375, + "learning_rate": 0.027890974084003574, + "loss": 0.78, + "num_input_tokens_seen": 7243392, + "step": 12485 + }, + { + "epoch": 1.860291927316056, + "grad_norm": 0.035888671875, + "learning_rate": 0.02790214477211796, + "loss": 0.8077, + "num_input_tokens_seen": 7246432, + "step": 12490 + }, + { + "epoch": 1.8610366398570153, + "grad_norm": 0.031494140625, + "learning_rate": 0.02791331546023235, + "loss": 0.7922, + "num_input_tokens_seen": 7249120, + "step": 12495 + }, + { + "epoch": 1.8617813523979745, + "grad_norm": 0.015380859375, + "learning_rate": 0.027924486148346736, + "loss": 0.8111, + "num_input_tokens_seen": 7251872, + "step": 12500 + }, + { + "epoch": 1.8625260649389337, + "grad_norm": 0.031494140625, + "learning_rate": 0.027935656836461126, + "loss": 0.7889, + "num_input_tokens_seen": 7254752, + "step": 12505 + }, + { + "epoch": 1.863270777479893, + "grad_norm": 0.0262451171875, + "learning_rate": 0.027946827524575512, + "loss": 0.8318, + "num_input_tokens_seen": 7257536, + "step": 12510 + }, + { + "epoch": 1.864015490020852, + "grad_norm": 0.04931640625, + "learning_rate": 0.0279579982126899, + "loss": 0.8336, + "num_input_tokens_seen": 7260576, + "step": 12515 + }, + { + "epoch": 1.8647602025618113, + "grad_norm": 0.031005859375, + "learning_rate": 0.027969168900804288, + "loss": 0.8098, + "num_input_tokens_seen": 7263424, + "step": 12520 + }, + { + "epoch": 1.8655049151027703, + "grad_norm": 0.03564453125, + "learning_rate": 0.027980339588918678, + "loss": 0.8102, + "num_input_tokens_seen": 7266208, + "step": 12525 + }, + { + "epoch": 1.8662496276437295, + "grad_norm": 0.0361328125, + "learning_rate": 0.027991510277033064, + "loss": 0.7881, + "num_input_tokens_seen": 7269248, + "step": 12530 + }, + { + "epoch": 1.8669943401846887, + "grad_norm": 0.0322265625, + "learning_rate": 0.028002680965147454, + "loss": 0.7939, + "num_input_tokens_seen": 7272320, + "step": 12535 + }, + { + "epoch": 1.8677390527256479, + "grad_norm": 0.0303955078125, + "learning_rate": 0.02801385165326184, + "loss": 0.8045, + "num_input_tokens_seen": 7275552, + "step": 12540 + }, + { + "epoch": 1.868483765266607, + "grad_norm": 0.0274658203125, + "learning_rate": 0.02802502234137623, + "loss": 0.8254, + "num_input_tokens_seen": 7278432, + "step": 12545 + }, + { + "epoch": 1.8692284778075663, + "grad_norm": 0.02734375, + "learning_rate": 0.028036193029490616, + "loss": 0.8117, + "num_input_tokens_seen": 7281024, + "step": 12550 + }, + { + "epoch": 1.8699731903485255, + "grad_norm": 0.0260009765625, + "learning_rate": 0.028047363717605005, + "loss": 0.8063, + "num_input_tokens_seen": 7284032, + "step": 12555 + }, + { + "epoch": 1.8707179028894847, + "grad_norm": 0.032958984375, + "learning_rate": 0.028058534405719388, + "loss": 0.8176, + "num_input_tokens_seen": 7287424, + "step": 12560 + }, + { + "epoch": 1.871462615430444, + "grad_norm": 0.033447265625, + "learning_rate": 0.028069705093833778, + "loss": 0.8114, + "num_input_tokens_seen": 7290272, + "step": 12565 + }, + { + "epoch": 1.8722073279714029, + "grad_norm": 0.01519775390625, + "learning_rate": 0.028080875781948168, + "loss": 0.8237, + "num_input_tokens_seen": 7292864, + "step": 12570 + }, + { + "epoch": 1.872952040512362, + "grad_norm": 0.0299072265625, + "learning_rate": 0.028092046470062554, + "loss": 0.8039, + "num_input_tokens_seen": 7295680, + "step": 12575 + }, + { + "epoch": 1.8736967530533213, + "grad_norm": 0.014404296875, + "learning_rate": 0.028103217158176944, + "loss": 0.8056, + "num_input_tokens_seen": 7298656, + "step": 12580 + }, + { + "epoch": 1.8744414655942805, + "grad_norm": 0.01470947265625, + "learning_rate": 0.02811438784629133, + "loss": 0.8064, + "num_input_tokens_seen": 7301408, + "step": 12585 + }, + { + "epoch": 1.8751861781352397, + "grad_norm": 0.03857421875, + "learning_rate": 0.02812555853440572, + "loss": 0.8046, + "num_input_tokens_seen": 7304128, + "step": 12590 + }, + { + "epoch": 1.875930890676199, + "grad_norm": 0.0216064453125, + "learning_rate": 0.028136729222520106, + "loss": 0.7961, + "num_input_tokens_seen": 7306976, + "step": 12595 + }, + { + "epoch": 1.876675603217158, + "grad_norm": 0.0322265625, + "learning_rate": 0.028147899910634495, + "loss": 0.8078, + "num_input_tokens_seen": 7309504, + "step": 12600 + }, + { + "epoch": 1.8774203157581173, + "grad_norm": 0.01275634765625, + "learning_rate": 0.02815907059874888, + "loss": 0.787, + "num_input_tokens_seen": 7312256, + "step": 12605 + }, + { + "epoch": 1.8781650282990765, + "grad_norm": 0.038330078125, + "learning_rate": 0.02817024128686327, + "loss": 0.7962, + "num_input_tokens_seen": 7315040, + "step": 12610 + }, + { + "epoch": 1.8789097408400357, + "grad_norm": 0.0247802734375, + "learning_rate": 0.028181411974977658, + "loss": 0.7964, + "num_input_tokens_seen": 7318016, + "step": 12615 + }, + { + "epoch": 1.879654453380995, + "grad_norm": 0.024169921875, + "learning_rate": 0.028192582663092047, + "loss": 0.7955, + "num_input_tokens_seen": 7320608, + "step": 12620 + }, + { + "epoch": 1.880399165921954, + "grad_norm": 0.0234375, + "learning_rate": 0.028203753351206434, + "loss": 0.8276, + "num_input_tokens_seen": 7323520, + "step": 12625 + }, + { + "epoch": 1.8811438784629133, + "grad_norm": 0.02392578125, + "learning_rate": 0.028214924039320823, + "loss": 0.7942, + "num_input_tokens_seen": 7326816, + "step": 12630 + }, + { + "epoch": 1.8818885910038725, + "grad_norm": 0.031494140625, + "learning_rate": 0.02822609472743521, + "loss": 0.8039, + "num_input_tokens_seen": 7329824, + "step": 12635 + }, + { + "epoch": 1.8826333035448317, + "grad_norm": 0.01324462890625, + "learning_rate": 0.0282372654155496, + "loss": 0.7946, + "num_input_tokens_seen": 7332736, + "step": 12640 + }, + { + "epoch": 1.883378016085791, + "grad_norm": 0.0223388671875, + "learning_rate": 0.028248436103663982, + "loss": 0.7996, + "num_input_tokens_seen": 7335584, + "step": 12645 + }, + { + "epoch": 1.8841227286267501, + "grad_norm": 0.0157470703125, + "learning_rate": 0.02825960679177837, + "loss": 0.8362, + "num_input_tokens_seen": 7338432, + "step": 12650 + }, + { + "epoch": 1.8848674411677093, + "grad_norm": 0.0255126953125, + "learning_rate": 0.028270777479892758, + "loss": 0.7965, + "num_input_tokens_seen": 7341344, + "step": 12655 + }, + { + "epoch": 1.8856121537086685, + "grad_norm": 0.029296875, + "learning_rate": 0.028281948168007148, + "loss": 0.7904, + "num_input_tokens_seen": 7344512, + "step": 12660 + }, + { + "epoch": 1.8863568662496277, + "grad_norm": 0.03564453125, + "learning_rate": 0.028293118856121534, + "loss": 0.799, + "num_input_tokens_seen": 7347264, + "step": 12665 + }, + { + "epoch": 1.887101578790587, + "grad_norm": 0.0250244140625, + "learning_rate": 0.028304289544235924, + "loss": 0.8062, + "num_input_tokens_seen": 7350336, + "step": 12670 + }, + { + "epoch": 1.8878462913315461, + "grad_norm": 0.026611328125, + "learning_rate": 0.02831546023235031, + "loss": 0.8049, + "num_input_tokens_seen": 7353216, + "step": 12675 + }, + { + "epoch": 1.8885910038725053, + "grad_norm": 0.0244140625, + "learning_rate": 0.0283266309204647, + "loss": 0.8082, + "num_input_tokens_seen": 7355968, + "step": 12680 + }, + { + "epoch": 1.8893357164134645, + "grad_norm": 0.030029296875, + "learning_rate": 0.028337801608579086, + "loss": 0.7899, + "num_input_tokens_seen": 7358912, + "step": 12685 + }, + { + "epoch": 1.8900804289544237, + "grad_norm": 0.0255126953125, + "learning_rate": 0.028348972296693475, + "loss": 0.7958, + "num_input_tokens_seen": 7361664, + "step": 12690 + }, + { + "epoch": 1.890825141495383, + "grad_norm": 0.022216796875, + "learning_rate": 0.028360142984807865, + "loss": 0.7841, + "num_input_tokens_seen": 7364704, + "step": 12695 + }, + { + "epoch": 1.891569854036342, + "grad_norm": 0.0235595703125, + "learning_rate": 0.02837131367292225, + "loss": 0.8098, + "num_input_tokens_seen": 7367424, + "step": 12700 + }, + { + "epoch": 1.8923145665773011, + "grad_norm": 0.0281982421875, + "learning_rate": 0.02838248436103664, + "loss": 0.8033, + "num_input_tokens_seen": 7370432, + "step": 12705 + }, + { + "epoch": 1.8930592791182603, + "grad_norm": 0.0142822265625, + "learning_rate": 0.028393655049151027, + "loss": 0.8082, + "num_input_tokens_seen": 7373152, + "step": 12710 + }, + { + "epoch": 1.8938039916592195, + "grad_norm": 0.027587890625, + "learning_rate": 0.028404825737265417, + "loss": 0.8114, + "num_input_tokens_seen": 7375872, + "step": 12715 + }, + { + "epoch": 1.8945487042001787, + "grad_norm": 0.037353515625, + "learning_rate": 0.028415996425379803, + "loss": 0.7903, + "num_input_tokens_seen": 7378624, + "step": 12720 + }, + { + "epoch": 1.895293416741138, + "grad_norm": 0.025634765625, + "learning_rate": 0.028427167113494193, + "loss": 0.8099, + "num_input_tokens_seen": 7381632, + "step": 12725 + }, + { + "epoch": 1.8960381292820971, + "grad_norm": 0.0322265625, + "learning_rate": 0.028438337801608576, + "loss": 0.7975, + "num_input_tokens_seen": 7384832, + "step": 12730 + }, + { + "epoch": 1.8967828418230563, + "grad_norm": 0.0205078125, + "learning_rate": 0.028449508489722965, + "loss": 0.7992, + "num_input_tokens_seen": 7387808, + "step": 12735 + }, + { + "epoch": 1.8975275543640155, + "grad_norm": 0.0286865234375, + "learning_rate": 0.02846067917783735, + "loss": 0.7869, + "num_input_tokens_seen": 7390752, + "step": 12740 + }, + { + "epoch": 1.8982722669049745, + "grad_norm": 0.0164794921875, + "learning_rate": 0.02847184986595174, + "loss": 0.781, + "num_input_tokens_seen": 7393408, + "step": 12745 + }, + { + "epoch": 1.8990169794459337, + "grad_norm": 0.0245361328125, + "learning_rate": 0.028483020554066128, + "loss": 0.7914, + "num_input_tokens_seen": 7396448, + "step": 12750 + }, + { + "epoch": 1.899761691986893, + "grad_norm": 0.0257568359375, + "learning_rate": 0.028494191242180517, + "loss": 0.79, + "num_input_tokens_seen": 7399584, + "step": 12755 + }, + { + "epoch": 1.9005064045278521, + "grad_norm": 0.037109375, + "learning_rate": 0.028505361930294904, + "loss": 0.835, + "num_input_tokens_seen": 7402400, + "step": 12760 + }, + { + "epoch": 1.9012511170688113, + "grad_norm": 0.0181884765625, + "learning_rate": 0.028516532618409293, + "loss": 0.8232, + "num_input_tokens_seen": 7405280, + "step": 12765 + }, + { + "epoch": 1.9019958296097705, + "grad_norm": 0.046630859375, + "learning_rate": 0.02852770330652368, + "loss": 0.8009, + "num_input_tokens_seen": 7408160, + "step": 12770 + }, + { + "epoch": 1.9027405421507297, + "grad_norm": 0.025390625, + "learning_rate": 0.02853887399463807, + "loss": 0.8381, + "num_input_tokens_seen": 7411136, + "step": 12775 + }, + { + "epoch": 1.903485254691689, + "grad_norm": 0.035400390625, + "learning_rate": 0.028550044682752455, + "loss": 0.7699, + "num_input_tokens_seen": 7413984, + "step": 12780 + }, + { + "epoch": 1.9042299672326481, + "grad_norm": 0.023681640625, + "learning_rate": 0.028561215370866845, + "loss": 0.8001, + "num_input_tokens_seen": 7416960, + "step": 12785 + }, + { + "epoch": 1.9049746797736073, + "grad_norm": 0.0225830078125, + "learning_rate": 0.02857238605898123, + "loss": 0.7944, + "num_input_tokens_seen": 7419776, + "step": 12790 + }, + { + "epoch": 1.9057193923145666, + "grad_norm": 0.031005859375, + "learning_rate": 0.02858355674709562, + "loss": 0.7767, + "num_input_tokens_seen": 7422752, + "step": 12795 + }, + { + "epoch": 1.9064641048555258, + "grad_norm": 0.020751953125, + "learning_rate": 0.028594727435210007, + "loss": 0.792, + "num_input_tokens_seen": 7425664, + "step": 12800 + }, + { + "epoch": 1.907208817396485, + "grad_norm": 0.02197265625, + "learning_rate": 0.028605898123324397, + "loss": 0.806, + "num_input_tokens_seen": 7428448, + "step": 12805 + }, + { + "epoch": 1.9079535299374442, + "grad_norm": 0.01470947265625, + "learning_rate": 0.028617068811438787, + "loss": 0.8235, + "num_input_tokens_seen": 7431200, + "step": 12810 + }, + { + "epoch": 1.9086982424784034, + "grad_norm": 0.0301513671875, + "learning_rate": 0.02862823949955317, + "loss": 0.7969, + "num_input_tokens_seen": 7434144, + "step": 12815 + }, + { + "epoch": 1.9094429550193626, + "grad_norm": 0.025634765625, + "learning_rate": 0.02863941018766756, + "loss": 0.8067, + "num_input_tokens_seen": 7437056, + "step": 12820 + }, + { + "epoch": 1.9101876675603218, + "grad_norm": 0.037841796875, + "learning_rate": 0.028650580875781945, + "loss": 0.7801, + "num_input_tokens_seen": 7439648, + "step": 12825 + }, + { + "epoch": 1.910932380101281, + "grad_norm": 0.0247802734375, + "learning_rate": 0.028661751563896335, + "loss": 0.7886, + "num_input_tokens_seen": 7442272, + "step": 12830 + }, + { + "epoch": 1.9116770926422402, + "grad_norm": 0.034423828125, + "learning_rate": 0.02867292225201072, + "loss": 0.8017, + "num_input_tokens_seen": 7445152, + "step": 12835 + }, + { + "epoch": 1.9124218051831994, + "grad_norm": 0.015625, + "learning_rate": 0.02868409294012511, + "loss": 0.7989, + "num_input_tokens_seen": 7448192, + "step": 12840 + }, + { + "epoch": 1.9131665177241586, + "grad_norm": 0.0439453125, + "learning_rate": 0.028695263628239497, + "loss": 0.8532, + "num_input_tokens_seen": 7450976, + "step": 12845 + }, + { + "epoch": 1.9139112302651178, + "grad_norm": 0.01226806640625, + "learning_rate": 0.028706434316353887, + "loss": 0.8053, + "num_input_tokens_seen": 7454048, + "step": 12850 + }, + { + "epoch": 1.914655942806077, + "grad_norm": 0.015625, + "learning_rate": 0.028717605004468273, + "loss": 0.7914, + "num_input_tokens_seen": 7457248, + "step": 12855 + }, + { + "epoch": 1.9154006553470362, + "grad_norm": 0.0211181640625, + "learning_rate": 0.028728775692582663, + "loss": 0.7771, + "num_input_tokens_seen": 7460192, + "step": 12860 + }, + { + "epoch": 1.9161453678879954, + "grad_norm": 0.034912109375, + "learning_rate": 0.02873994638069705, + "loss": 0.8124, + "num_input_tokens_seen": 7463008, + "step": 12865 + }, + { + "epoch": 1.9168900804289544, + "grad_norm": 0.0230712890625, + "learning_rate": 0.02875111706881144, + "loss": 0.8264, + "num_input_tokens_seen": 7465952, + "step": 12870 + }, + { + "epoch": 1.9176347929699136, + "grad_norm": 0.0322265625, + "learning_rate": 0.028762287756925825, + "loss": 0.7962, + "num_input_tokens_seen": 7468864, + "step": 12875 + }, + { + "epoch": 1.9183795055108728, + "grad_norm": 0.036376953125, + "learning_rate": 0.028773458445040215, + "loss": 0.8089, + "num_input_tokens_seen": 7472032, + "step": 12880 + }, + { + "epoch": 1.919124218051832, + "grad_norm": 0.03271484375, + "learning_rate": 0.0287846291331546, + "loss": 0.8057, + "num_input_tokens_seen": 7474912, + "step": 12885 + }, + { + "epoch": 1.9198689305927912, + "grad_norm": 0.01397705078125, + "learning_rate": 0.02879579982126899, + "loss": 0.81, + "num_input_tokens_seen": 7477568, + "step": 12890 + }, + { + "epoch": 1.9206136431337504, + "grad_norm": 0.0224609375, + "learning_rate": 0.028806970509383377, + "loss": 0.8438, + "num_input_tokens_seen": 7480256, + "step": 12895 + }, + { + "epoch": 1.9213583556747096, + "grad_norm": 0.0145263671875, + "learning_rate": 0.028818141197497767, + "loss": 0.8238, + "num_input_tokens_seen": 7483072, + "step": 12900 + }, + { + "epoch": 1.9221030682156688, + "grad_norm": 0.015625, + "learning_rate": 0.02882931188561215, + "loss": 0.7848, + "num_input_tokens_seen": 7486016, + "step": 12905 + }, + { + "epoch": 1.922847780756628, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02884048257372654, + "loss": 0.8141, + "num_input_tokens_seen": 7488864, + "step": 12910 + }, + { + "epoch": 1.923592493297587, + "grad_norm": 0.023193359375, + "learning_rate": 0.02885165326184093, + "loss": 0.7981, + "num_input_tokens_seen": 7491808, + "step": 12915 + }, + { + "epoch": 1.9243372058385462, + "grad_norm": 0.021728515625, + "learning_rate": 0.028862823949955315, + "loss": 0.8048, + "num_input_tokens_seen": 7495072, + "step": 12920 + }, + { + "epoch": 1.9250819183795054, + "grad_norm": 0.0208740234375, + "learning_rate": 0.028873994638069705, + "loss": 0.7991, + "num_input_tokens_seen": 7497984, + "step": 12925 + }, + { + "epoch": 1.9258266309204646, + "grad_norm": 0.0228271484375, + "learning_rate": 0.02888516532618409, + "loss": 0.8037, + "num_input_tokens_seen": 7500736, + "step": 12930 + }, + { + "epoch": 1.9265713434614238, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02889633601429848, + "loss": 0.8217, + "num_input_tokens_seen": 7503520, + "step": 12935 + }, + { + "epoch": 1.927316056002383, + "grad_norm": 0.0211181640625, + "learning_rate": 0.028907506702412867, + "loss": 0.7863, + "num_input_tokens_seen": 7506304, + "step": 12940 + }, + { + "epoch": 1.9280607685433422, + "grad_norm": 0.0234375, + "learning_rate": 0.028918677390527257, + "loss": 0.8002, + "num_input_tokens_seen": 7509248, + "step": 12945 + }, + { + "epoch": 1.9288054810843014, + "grad_norm": 0.0191650390625, + "learning_rate": 0.028929848078641643, + "loss": 0.8146, + "num_input_tokens_seen": 7512224, + "step": 12950 + }, + { + "epoch": 1.9295501936252606, + "grad_norm": 0.029052734375, + "learning_rate": 0.028941018766756033, + "loss": 0.7993, + "num_input_tokens_seen": 7515328, + "step": 12955 + }, + { + "epoch": 1.9302949061662198, + "grad_norm": 0.0286865234375, + "learning_rate": 0.02895218945487042, + "loss": 0.7948, + "num_input_tokens_seen": 7517984, + "step": 12960 + }, + { + "epoch": 1.931039618707179, + "grad_norm": 0.0262451171875, + "learning_rate": 0.02896336014298481, + "loss": 0.8247, + "num_input_tokens_seen": 7520864, + "step": 12965 + }, + { + "epoch": 1.9317843312481382, + "grad_norm": 0.0230712890625, + "learning_rate": 0.028974530831099195, + "loss": 0.8083, + "num_input_tokens_seen": 7524000, + "step": 12970 + }, + { + "epoch": 1.9325290437890974, + "grad_norm": 0.0284423828125, + "learning_rate": 0.028985701519213584, + "loss": 0.8174, + "num_input_tokens_seen": 7526688, + "step": 12975 + }, + { + "epoch": 1.9332737563300566, + "grad_norm": 0.022705078125, + "learning_rate": 0.02899687220732797, + "loss": 0.804, + "num_input_tokens_seen": 7529440, + "step": 12980 + }, + { + "epoch": 1.9340184688710158, + "grad_norm": 0.0240478515625, + "learning_rate": 0.02900804289544236, + "loss": 0.7953, + "num_input_tokens_seen": 7532448, + "step": 12985 + }, + { + "epoch": 1.934763181411975, + "grad_norm": 0.01287841796875, + "learning_rate": 0.029019213583556743, + "loss": 0.813, + "num_input_tokens_seen": 7535328, + "step": 12990 + }, + { + "epoch": 1.9355078939529342, + "grad_norm": 0.02197265625, + "learning_rate": 0.029030384271671133, + "loss": 0.8121, + "num_input_tokens_seen": 7537984, + "step": 12995 + }, + { + "epoch": 1.9362526064938934, + "grad_norm": 0.020751953125, + "learning_rate": 0.02904155495978552, + "loss": 0.8, + "num_input_tokens_seen": 7540928, + "step": 13000 + }, + { + "epoch": 1.9369973190348526, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02905272564789991, + "loss": 0.808, + "num_input_tokens_seen": 7543680, + "step": 13005 + }, + { + "epoch": 1.9377420315758118, + "grad_norm": 0.0272216796875, + "learning_rate": 0.029063896336014295, + "loss": 0.812, + "num_input_tokens_seen": 7546720, + "step": 13010 + }, + { + "epoch": 1.938486744116771, + "grad_norm": 0.013671875, + "learning_rate": 0.029075067024128685, + "loss": 0.8153, + "num_input_tokens_seen": 7549920, + "step": 13015 + }, + { + "epoch": 1.9392314566577302, + "grad_norm": 0.0274658203125, + "learning_rate": 0.02908623771224307, + "loss": 0.817, + "num_input_tokens_seen": 7552960, + "step": 13020 + }, + { + "epoch": 1.9399761691986894, + "grad_norm": 0.02099609375, + "learning_rate": 0.02909740840035746, + "loss": 0.8004, + "num_input_tokens_seen": 7555584, + "step": 13025 + }, + { + "epoch": 1.9407208817396486, + "grad_norm": 0.034423828125, + "learning_rate": 0.02910857908847185, + "loss": 0.8, + "num_input_tokens_seen": 7558688, + "step": 13030 + }, + { + "epoch": 1.9414655942806078, + "grad_norm": 0.0242919921875, + "learning_rate": 0.029119749776586237, + "loss": 0.8025, + "num_input_tokens_seen": 7561344, + "step": 13035 + }, + { + "epoch": 1.942210306821567, + "grad_norm": 0.0213623046875, + "learning_rate": 0.029130920464700626, + "loss": 0.8105, + "num_input_tokens_seen": 7564544, + "step": 13040 + }, + { + "epoch": 1.942955019362526, + "grad_norm": 0.0263671875, + "learning_rate": 0.029142091152815013, + "loss": 0.8021, + "num_input_tokens_seen": 7567488, + "step": 13045 + }, + { + "epoch": 1.9436997319034852, + "grad_norm": 0.0220947265625, + "learning_rate": 0.029153261840929402, + "loss": 0.8008, + "num_input_tokens_seen": 7570048, + "step": 13050 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 0.0303955078125, + "learning_rate": 0.02916443252904379, + "loss": 0.8, + "num_input_tokens_seen": 7572704, + "step": 13055 + }, + { + "epoch": 1.9451891569854036, + "grad_norm": 0.0140380859375, + "learning_rate": 0.029175603217158178, + "loss": 0.7983, + "num_input_tokens_seen": 7575712, + "step": 13060 + }, + { + "epoch": 1.9459338695263628, + "grad_norm": 0.0260009765625, + "learning_rate": 0.029186773905272564, + "loss": 0.7983, + "num_input_tokens_seen": 7578496, + "step": 13065 + }, + { + "epoch": 1.946678582067322, + "grad_norm": 0.022216796875, + "learning_rate": 0.029197944593386954, + "loss": 0.7881, + "num_input_tokens_seen": 7581248, + "step": 13070 + }, + { + "epoch": 1.9474232946082812, + "grad_norm": 0.022216796875, + "learning_rate": 0.029209115281501337, + "loss": 0.7989, + "num_input_tokens_seen": 7583712, + "step": 13075 + }, + { + "epoch": 1.9481680071492404, + "grad_norm": 0.023193359375, + "learning_rate": 0.029220285969615727, + "loss": 0.796, + "num_input_tokens_seen": 7586336, + "step": 13080 + }, + { + "epoch": 1.9489127196901996, + "grad_norm": 0.05419921875, + "learning_rate": 0.029231456657730113, + "loss": 0.804, + "num_input_tokens_seen": 7589216, + "step": 13085 + }, + { + "epoch": 1.9496574322311586, + "grad_norm": 0.02197265625, + "learning_rate": 0.029242627345844503, + "loss": 0.7961, + "num_input_tokens_seen": 7592128, + "step": 13090 + }, + { + "epoch": 1.9504021447721178, + "grad_norm": 0.01361083984375, + "learning_rate": 0.02925379803395889, + "loss": 0.7939, + "num_input_tokens_seen": 7594880, + "step": 13095 + }, + { + "epoch": 1.951146857313077, + "grad_norm": 0.01434326171875, + "learning_rate": 0.02926496872207328, + "loss": 0.7988, + "num_input_tokens_seen": 7597632, + "step": 13100 + }, + { + "epoch": 1.9518915698540362, + "grad_norm": 0.0218505859375, + "learning_rate": 0.029276139410187665, + "loss": 0.8022, + "num_input_tokens_seen": 7600672, + "step": 13105 + }, + { + "epoch": 1.9526362823949954, + "grad_norm": 0.035400390625, + "learning_rate": 0.029287310098302054, + "loss": 0.8199, + "num_input_tokens_seen": 7603680, + "step": 13110 + }, + { + "epoch": 1.9533809949359546, + "grad_norm": 0.020751953125, + "learning_rate": 0.02929848078641644, + "loss": 0.7994, + "num_input_tokens_seen": 7606464, + "step": 13115 + }, + { + "epoch": 1.9541257074769138, + "grad_norm": 0.01708984375, + "learning_rate": 0.02930965147453083, + "loss": 0.8039, + "num_input_tokens_seen": 7609408, + "step": 13120 + }, + { + "epoch": 1.954870420017873, + "grad_norm": 0.0283203125, + "learning_rate": 0.029320822162645217, + "loss": 0.8302, + "num_input_tokens_seen": 7612960, + "step": 13125 + }, + { + "epoch": 1.9556151325588322, + "grad_norm": 0.0225830078125, + "learning_rate": 0.029331992850759606, + "loss": 0.7856, + "num_input_tokens_seen": 7615680, + "step": 13130 + }, + { + "epoch": 1.9563598450997914, + "grad_norm": 0.031494140625, + "learning_rate": 0.029343163538873993, + "loss": 0.7912, + "num_input_tokens_seen": 7618752, + "step": 13135 + }, + { + "epoch": 1.9571045576407506, + "grad_norm": 0.0252685546875, + "learning_rate": 0.029354334226988382, + "loss": 0.7992, + "num_input_tokens_seen": 7621472, + "step": 13140 + }, + { + "epoch": 1.9578492701817098, + "grad_norm": 0.0263671875, + "learning_rate": 0.029365504915102772, + "loss": 0.7951, + "num_input_tokens_seen": 7624480, + "step": 13145 + }, + { + "epoch": 1.958593982722669, + "grad_norm": 0.0162353515625, + "learning_rate": 0.029376675603217158, + "loss": 0.8131, + "num_input_tokens_seen": 7627360, + "step": 13150 + }, + { + "epoch": 1.9593386952636282, + "grad_norm": 0.0142822265625, + "learning_rate": 0.029387846291331548, + "loss": 0.8131, + "num_input_tokens_seen": 7630336, + "step": 13155 + }, + { + "epoch": 1.9600834078045875, + "grad_norm": 0.0211181640625, + "learning_rate": 0.029399016979445934, + "loss": 0.8123, + "num_input_tokens_seen": 7632992, + "step": 13160 + }, + { + "epoch": 1.9608281203455467, + "grad_norm": 0.013916015625, + "learning_rate": 0.02941018766756032, + "loss": 0.8161, + "num_input_tokens_seen": 7635648, + "step": 13165 + }, + { + "epoch": 1.9615728328865059, + "grad_norm": 0.037109375, + "learning_rate": 0.029421358355674707, + "loss": 0.7919, + "num_input_tokens_seen": 7638656, + "step": 13170 + }, + { + "epoch": 1.962317545427465, + "grad_norm": 0.034423828125, + "learning_rate": 0.029432529043789096, + "loss": 0.7925, + "num_input_tokens_seen": 7641760, + "step": 13175 + }, + { + "epoch": 1.9630622579684243, + "grad_norm": 0.0272216796875, + "learning_rate": 0.029443699731903483, + "loss": 0.7941, + "num_input_tokens_seen": 7644448, + "step": 13180 + }, + { + "epoch": 1.9638069705093835, + "grad_norm": 0.0235595703125, + "learning_rate": 0.029454870420017872, + "loss": 0.793, + "num_input_tokens_seen": 7647392, + "step": 13185 + }, + { + "epoch": 1.9645516830503427, + "grad_norm": 0.0286865234375, + "learning_rate": 0.02946604110813226, + "loss": 0.7878, + "num_input_tokens_seen": 7650304, + "step": 13190 + }, + { + "epoch": 1.9652963955913019, + "grad_norm": 0.0361328125, + "learning_rate": 0.029477211796246648, + "loss": 0.7764, + "num_input_tokens_seen": 7652960, + "step": 13195 + }, + { + "epoch": 1.966041108132261, + "grad_norm": 0.02685546875, + "learning_rate": 0.029488382484361034, + "loss": 0.7838, + "num_input_tokens_seen": 7655808, + "step": 13200 + }, + { + "epoch": 1.9667858206732203, + "grad_norm": 0.0252685546875, + "learning_rate": 0.029499553172475424, + "loss": 0.7829, + "num_input_tokens_seen": 7658560, + "step": 13205 + }, + { + "epoch": 1.9675305332141795, + "grad_norm": 0.03759765625, + "learning_rate": 0.02951072386058981, + "loss": 0.7372, + "num_input_tokens_seen": 7661568, + "step": 13210 + }, + { + "epoch": 1.9682752457551387, + "grad_norm": 0.029296875, + "learning_rate": 0.0295218945487042, + "loss": 0.7826, + "num_input_tokens_seen": 7664352, + "step": 13215 + }, + { + "epoch": 1.9690199582960977, + "grad_norm": 0.01531982421875, + "learning_rate": 0.029533065236818586, + "loss": 0.8313, + "num_input_tokens_seen": 7667392, + "step": 13220 + }, + { + "epoch": 1.9697646708370569, + "grad_norm": 0.0213623046875, + "learning_rate": 0.029544235924932976, + "loss": 0.8264, + "num_input_tokens_seen": 7670240, + "step": 13225 + }, + { + "epoch": 1.970509383378016, + "grad_norm": 0.02294921875, + "learning_rate": 0.029555406613047362, + "loss": 0.779, + "num_input_tokens_seen": 7673312, + "step": 13230 + }, + { + "epoch": 1.9712540959189753, + "grad_norm": 0.03857421875, + "learning_rate": 0.029566577301161752, + "loss": 0.7962, + "num_input_tokens_seen": 7676448, + "step": 13235 + }, + { + "epoch": 1.9719988084599345, + "grad_norm": 0.038818359375, + "learning_rate": 0.029577747989276138, + "loss": 0.8006, + "num_input_tokens_seen": 7679360, + "step": 13240 + }, + { + "epoch": 1.9727435210008937, + "grad_norm": 0.025146484375, + "learning_rate": 0.029588918677390528, + "loss": 0.7831, + "num_input_tokens_seen": 7682272, + "step": 13245 + }, + { + "epoch": 1.9734882335418529, + "grad_norm": 0.02294921875, + "learning_rate": 0.029600089365504914, + "loss": 0.8102, + "num_input_tokens_seen": 7685280, + "step": 13250 + }, + { + "epoch": 1.974232946082812, + "grad_norm": 0.040283203125, + "learning_rate": 0.0296112600536193, + "loss": 0.8037, + "num_input_tokens_seen": 7687968, + "step": 13255 + }, + { + "epoch": 1.974977658623771, + "grad_norm": 0.037353515625, + "learning_rate": 0.02962243074173369, + "loss": 0.7791, + "num_input_tokens_seen": 7690976, + "step": 13260 + }, + { + "epoch": 1.9757223711647303, + "grad_norm": 0.05126953125, + "learning_rate": 0.029633601429848076, + "loss": 0.8314, + "num_input_tokens_seen": 7693856, + "step": 13265 + }, + { + "epoch": 1.9764670837056895, + "grad_norm": 0.0220947265625, + "learning_rate": 0.029644772117962466, + "loss": 0.7831, + "num_input_tokens_seen": 7696672, + "step": 13270 + }, + { + "epoch": 1.9772117962466487, + "grad_norm": 0.0380859375, + "learning_rate": 0.029655942806076852, + "loss": 0.828, + "num_input_tokens_seen": 7699808, + "step": 13275 + }, + { + "epoch": 1.9779565087876079, + "grad_norm": 0.038818359375, + "learning_rate": 0.029667113494191242, + "loss": 0.7942, + "num_input_tokens_seen": 7702720, + "step": 13280 + }, + { + "epoch": 1.978701221328567, + "grad_norm": 0.01031494140625, + "learning_rate": 0.029678284182305628, + "loss": 0.8058, + "num_input_tokens_seen": 7705504, + "step": 13285 + }, + { + "epoch": 1.9794459338695263, + "grad_norm": 0.03076171875, + "learning_rate": 0.029689454870420018, + "loss": 0.7945, + "num_input_tokens_seen": 7708672, + "step": 13290 + }, + { + "epoch": 1.9801906464104855, + "grad_norm": 0.02587890625, + "learning_rate": 0.029700625558534404, + "loss": 0.8177, + "num_input_tokens_seen": 7711744, + "step": 13295 + }, + { + "epoch": 1.9809353589514447, + "grad_norm": 0.0308837890625, + "learning_rate": 0.029711796246648794, + "loss": 0.79, + "num_input_tokens_seen": 7714656, + "step": 13300 + }, + { + "epoch": 1.9816800714924039, + "grad_norm": 0.0255126953125, + "learning_rate": 0.02972296693476318, + "loss": 0.8201, + "num_input_tokens_seen": 7717344, + "step": 13305 + }, + { + "epoch": 1.982424784033363, + "grad_norm": 0.030517578125, + "learning_rate": 0.02973413762287757, + "loss": 0.7921, + "num_input_tokens_seen": 7720384, + "step": 13310 + }, + { + "epoch": 1.9831694965743223, + "grad_norm": 0.0128173828125, + "learning_rate": 0.029745308310991956, + "loss": 0.7989, + "num_input_tokens_seen": 7723040, + "step": 13315 + }, + { + "epoch": 1.9839142091152815, + "grad_norm": 0.0196533203125, + "learning_rate": 0.029756478999106346, + "loss": 0.8046, + "num_input_tokens_seen": 7725920, + "step": 13320 + }, + { + "epoch": 1.9846589216562407, + "grad_norm": 0.033203125, + "learning_rate": 0.029767649687220732, + "loss": 0.7941, + "num_input_tokens_seen": 7728960, + "step": 13325 + }, + { + "epoch": 1.9854036341972, + "grad_norm": 0.02001953125, + "learning_rate": 0.02977882037533512, + "loss": 0.7976, + "num_input_tokens_seen": 7731616, + "step": 13330 + }, + { + "epoch": 1.986148346738159, + "grad_norm": 0.0225830078125, + "learning_rate": 0.029789991063449504, + "loss": 0.805, + "num_input_tokens_seen": 7734688, + "step": 13335 + }, + { + "epoch": 1.9868930592791183, + "grad_norm": 0.0341796875, + "learning_rate": 0.029801161751563894, + "loss": 0.791, + "num_input_tokens_seen": 7737376, + "step": 13340 + }, + { + "epoch": 1.9876377718200775, + "grad_norm": 0.024169921875, + "learning_rate": 0.02981233243967828, + "loss": 0.8079, + "num_input_tokens_seen": 7740352, + "step": 13345 + }, + { + "epoch": 1.9883824843610367, + "grad_norm": 0.033203125, + "learning_rate": 0.02982350312779267, + "loss": 0.7846, + "num_input_tokens_seen": 7743200, + "step": 13350 + }, + { + "epoch": 1.989127196901996, + "grad_norm": 0.032470703125, + "learning_rate": 0.029834673815907056, + "loss": 0.7621, + "num_input_tokens_seen": 7746144, + "step": 13355 + }, + { + "epoch": 1.9898719094429551, + "grad_norm": 0.040283203125, + "learning_rate": 0.029845844504021446, + "loss": 0.8097, + "num_input_tokens_seen": 7749024, + "step": 13360 + }, + { + "epoch": 1.9906166219839143, + "grad_norm": 0.01458740234375, + "learning_rate": 0.029857015192135836, + "loss": 0.8146, + "num_input_tokens_seen": 7752224, + "step": 13365 + }, + { + "epoch": 1.9913613345248735, + "grad_norm": 0.03125, + "learning_rate": 0.029868185880250222, + "loss": 0.7938, + "num_input_tokens_seen": 7755008, + "step": 13370 + }, + { + "epoch": 1.9921060470658327, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02987935656836461, + "loss": 0.8186, + "num_input_tokens_seen": 7757728, + "step": 13375 + }, + { + "epoch": 1.992850759606792, + "grad_norm": 0.0218505859375, + "learning_rate": 0.029890527256478998, + "loss": 0.7959, + "num_input_tokens_seen": 7760640, + "step": 13380 + }, + { + "epoch": 1.9935954721477511, + "grad_norm": 0.01019287109375, + "learning_rate": 0.029901697944593388, + "loss": 0.7815, + "num_input_tokens_seen": 7763552, + "step": 13385 + }, + { + "epoch": 1.99434018468871, + "grad_norm": 0.0234375, + "learning_rate": 0.029912868632707774, + "loss": 0.794, + "num_input_tokens_seen": 7766368, + "step": 13390 + }, + { + "epoch": 1.9950848972296693, + "grad_norm": 0.029052734375, + "learning_rate": 0.029924039320822163, + "loss": 0.8233, + "num_input_tokens_seen": 7769664, + "step": 13395 + }, + { + "epoch": 1.9958296097706285, + "grad_norm": 0.0169677734375, + "learning_rate": 0.02993521000893655, + "loss": 0.7885, + "num_input_tokens_seen": 7772704, + "step": 13400 + }, + { + "epoch": 1.9965743223115877, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02994638069705094, + "loss": 0.8068, + "num_input_tokens_seen": 7776032, + "step": 13405 + }, + { + "epoch": 1.997319034852547, + "grad_norm": 0.032958984375, + "learning_rate": 0.029957551385165326, + "loss": 0.7839, + "num_input_tokens_seen": 7779008, + "step": 13410 + }, + { + "epoch": 1.9980637473935061, + "grad_norm": 0.01953125, + "learning_rate": 0.029968722073279715, + "loss": 0.8018, + "num_input_tokens_seen": 7781888, + "step": 13415 + }, + { + "epoch": 1.9988084599344653, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0299798927613941, + "loss": 0.8059, + "num_input_tokens_seen": 7785120, + "step": 13420 + }, + { + "epoch": 1.9995531724754245, + "grad_norm": 0.0262451171875, + "learning_rate": 0.029991063449508488, + "loss": 0.8422, + "num_input_tokens_seen": 7788096, + "step": 13425 + }, + { + "epoch": 2.0, + "eval_loss": 0.8020058870315552, + "eval_runtime": 70.6448, + "eval_samples_per_second": 42.24, + "eval_steps_per_second": 10.56, + "num_input_tokens_seen": 7789256, + "step": 13428 + }, + { + "epoch": 2.0002978850163835, + "grad_norm": 0.0216064453125, + "learning_rate": 0.029999999994931803, + "loss": 0.8008, + "num_input_tokens_seen": 7790536, + "step": 13430 + }, + { + "epoch": 2.0010425975573427, + "grad_norm": 0.030517578125, + "learning_rate": 0.029999999817544976, + "loss": 0.8042, + "num_input_tokens_seen": 7793416, + "step": 13435 + }, + { + "epoch": 2.001787310098302, + "grad_norm": 0.03125, + "learning_rate": 0.029999999386748397, + "loss": 0.8011, + "num_input_tokens_seen": 7796296, + "step": 13440 + }, + { + "epoch": 2.002532022639261, + "grad_norm": 0.019775390625, + "learning_rate": 0.02999999870254207, + "loss": 0.7854, + "num_input_tokens_seen": 7799144, + "step": 13445 + }, + { + "epoch": 2.0032767351802203, + "grad_norm": 0.020751953125, + "learning_rate": 0.029999997764926012, + "loss": 0.8005, + "num_input_tokens_seen": 7802088, + "step": 13450 + }, + { + "epoch": 2.0040214477211795, + "grad_norm": 0.02197265625, + "learning_rate": 0.02999999657390024, + "loss": 0.7948, + "num_input_tokens_seen": 7805128, + "step": 13455 + }, + { + "epoch": 2.0047661602621387, + "grad_norm": 0.01055908203125, + "learning_rate": 0.02999999512946477, + "loss": 0.8036, + "num_input_tokens_seen": 7807880, + "step": 13460 + }, + { + "epoch": 2.005510872803098, + "grad_norm": 0.01507568359375, + "learning_rate": 0.029999993431619625, + "loss": 0.8612, + "num_input_tokens_seen": 7811080, + "step": 13465 + }, + { + "epoch": 2.006255585344057, + "grad_norm": 0.0228271484375, + "learning_rate": 0.029999991480364842, + "loss": 0.7916, + "num_input_tokens_seen": 7814120, + "step": 13470 + }, + { + "epoch": 2.0070002978850163, + "grad_norm": 0.033447265625, + "learning_rate": 0.029999989275700443, + "loss": 0.7857, + "num_input_tokens_seen": 7816872, + "step": 13475 + }, + { + "epoch": 2.0077450104259755, + "grad_norm": 0.030517578125, + "learning_rate": 0.029999986817626473, + "loss": 0.8227, + "num_input_tokens_seen": 7819464, + "step": 13480 + }, + { + "epoch": 2.0084897229669347, + "grad_norm": 0.023193359375, + "learning_rate": 0.02999998410614297, + "loss": 0.7951, + "num_input_tokens_seen": 7822216, + "step": 13485 + }, + { + "epoch": 2.009234435507894, + "grad_norm": 0.01708984375, + "learning_rate": 0.02999998114124998, + "loss": 0.787, + "num_input_tokens_seen": 7825128, + "step": 13490 + }, + { + "epoch": 2.009979148048853, + "grad_norm": 0.02490234375, + "learning_rate": 0.029999977922947556, + "loss": 0.8191, + "num_input_tokens_seen": 7828264, + "step": 13495 + }, + { + "epoch": 2.0107238605898123, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02999997445123575, + "loss": 0.7929, + "num_input_tokens_seen": 7831016, + "step": 13500 + }, + { + "epoch": 2.0114685731307715, + "grad_norm": 0.0341796875, + "learning_rate": 0.029999970726114617, + "loss": 0.8079, + "num_input_tokens_seen": 7834056, + "step": 13505 + }, + { + "epoch": 2.0122132856717307, + "grad_norm": 0.02099609375, + "learning_rate": 0.029999966747584227, + "loss": 0.7975, + "num_input_tokens_seen": 7836648, + "step": 13510 + }, + { + "epoch": 2.01295799821269, + "grad_norm": 0.01336669921875, + "learning_rate": 0.029999962515644647, + "loss": 0.8147, + "num_input_tokens_seen": 7839720, + "step": 13515 + }, + { + "epoch": 2.013702710753649, + "grad_norm": 0.03271484375, + "learning_rate": 0.02999995803029594, + "loss": 0.8096, + "num_input_tokens_seen": 7842696, + "step": 13520 + }, + { + "epoch": 2.0144474232946084, + "grad_norm": 0.0147705078125, + "learning_rate": 0.029999953291538192, + "loss": 0.7975, + "num_input_tokens_seen": 7845576, + "step": 13525 + }, + { + "epoch": 2.0151921358355676, + "grad_norm": 0.02294921875, + "learning_rate": 0.029999948299371474, + "loss": 0.8253, + "num_input_tokens_seen": 7848200, + "step": 13530 + }, + { + "epoch": 2.0159368483765268, + "grad_norm": 0.02099609375, + "learning_rate": 0.02999994305379588, + "loss": 0.7833, + "num_input_tokens_seen": 7850856, + "step": 13535 + }, + { + "epoch": 2.016681560917486, + "grad_norm": 0.0234375, + "learning_rate": 0.02999993755481149, + "loss": 0.7802, + "num_input_tokens_seen": 7853608, + "step": 13540 + }, + { + "epoch": 2.017426273458445, + "grad_norm": 0.0263671875, + "learning_rate": 0.0299999318024184, + "loss": 0.7778, + "num_input_tokens_seen": 7856552, + "step": 13545 + }, + { + "epoch": 2.0181709859994044, + "grad_norm": 0.021484375, + "learning_rate": 0.029999925796616707, + "loss": 0.8001, + "num_input_tokens_seen": 7859496, + "step": 13550 + }, + { + "epoch": 2.0189156985403636, + "grad_norm": 0.0294189453125, + "learning_rate": 0.02999991953740652, + "loss": 0.8089, + "num_input_tokens_seen": 7862632, + "step": 13555 + }, + { + "epoch": 2.0196604110813228, + "grad_norm": 0.0233154296875, + "learning_rate": 0.029999913024787925, + "loss": 0.7918, + "num_input_tokens_seen": 7865480, + "step": 13560 + }, + { + "epoch": 2.020405123622282, + "grad_norm": 0.021484375, + "learning_rate": 0.029999906258761054, + "loss": 0.8245, + "num_input_tokens_seen": 7868392, + "step": 13565 + }, + { + "epoch": 2.021149836163241, + "grad_norm": 0.0211181640625, + "learning_rate": 0.029999899239326007, + "loss": 0.7723, + "num_input_tokens_seen": 7871208, + "step": 13570 + }, + { + "epoch": 2.0218945487042004, + "grad_norm": 0.0196533203125, + "learning_rate": 0.029999891966482912, + "loss": 0.8241, + "num_input_tokens_seen": 7873896, + "step": 13575 + }, + { + "epoch": 2.0226392612451596, + "grad_norm": 0.01953125, + "learning_rate": 0.029999884440231883, + "loss": 0.7913, + "num_input_tokens_seen": 7876616, + "step": 13580 + }, + { + "epoch": 2.0233839737861183, + "grad_norm": 0.02685546875, + "learning_rate": 0.029999876660573056, + "loss": 0.8459, + "num_input_tokens_seen": 7879592, + "step": 13585 + }, + { + "epoch": 2.0241286863270775, + "grad_norm": 0.0167236328125, + "learning_rate": 0.029999868627506556, + "loss": 0.7886, + "num_input_tokens_seen": 7882120, + "step": 13590 + }, + { + "epoch": 2.0248733988680367, + "grad_norm": 0.0225830078125, + "learning_rate": 0.02999986034103252, + "loss": 0.7896, + "num_input_tokens_seen": 7884808, + "step": 13595 + }, + { + "epoch": 2.025618111408996, + "grad_norm": 0.0242919921875, + "learning_rate": 0.029999851801151094, + "loss": 0.8087, + "num_input_tokens_seen": 7887560, + "step": 13600 + }, + { + "epoch": 2.026362823949955, + "grad_norm": 0.0311279296875, + "learning_rate": 0.02999984300786241, + "loss": 0.8023, + "num_input_tokens_seen": 7890248, + "step": 13605 + }, + { + "epoch": 2.0271075364909144, + "grad_norm": 0.0140380859375, + "learning_rate": 0.029999833961166622, + "loss": 0.7887, + "num_input_tokens_seen": 7893352, + "step": 13610 + }, + { + "epoch": 2.0278522490318736, + "grad_norm": 0.0228271484375, + "learning_rate": 0.029999824661063892, + "loss": 0.7812, + "num_input_tokens_seen": 7896136, + "step": 13615 + }, + { + "epoch": 2.0285969615728328, + "grad_norm": 0.0263671875, + "learning_rate": 0.029999815107554364, + "loss": 0.8429, + "num_input_tokens_seen": 7899112, + "step": 13620 + }, + { + "epoch": 2.029341674113792, + "grad_norm": 0.0294189453125, + "learning_rate": 0.029999805300638206, + "loss": 0.8085, + "num_input_tokens_seen": 7902120, + "step": 13625 + }, + { + "epoch": 2.030086386654751, + "grad_norm": 0.0361328125, + "learning_rate": 0.029999795240315583, + "loss": 0.8142, + "num_input_tokens_seen": 7905384, + "step": 13630 + }, + { + "epoch": 2.0308310991957104, + "grad_norm": 0.02392578125, + "learning_rate": 0.029999784926586667, + "loss": 0.7991, + "num_input_tokens_seen": 7908616, + "step": 13635 + }, + { + "epoch": 2.0315758117366696, + "grad_norm": 0.033203125, + "learning_rate": 0.029999774359451628, + "loss": 0.806, + "num_input_tokens_seen": 7911688, + "step": 13640 + }, + { + "epoch": 2.0323205242776288, + "grad_norm": 0.01446533203125, + "learning_rate": 0.029999763538910644, + "loss": 0.8023, + "num_input_tokens_seen": 7914664, + "step": 13645 + }, + { + "epoch": 2.033065236818588, + "grad_norm": 0.022705078125, + "learning_rate": 0.029999752464963905, + "loss": 0.8108, + "num_input_tokens_seen": 7917800, + "step": 13650 + }, + { + "epoch": 2.033809949359547, + "grad_norm": 0.0281982421875, + "learning_rate": 0.02999974113761159, + "loss": 0.8, + "num_input_tokens_seen": 7920776, + "step": 13655 + }, + { + "epoch": 2.0345546619005064, + "grad_norm": 0.0277099609375, + "learning_rate": 0.029999729556853896, + "loss": 0.8178, + "num_input_tokens_seen": 7923592, + "step": 13660 + }, + { + "epoch": 2.0352993744414656, + "grad_norm": 0.0240478515625, + "learning_rate": 0.029999717722691013, + "loss": 0.7963, + "num_input_tokens_seen": 7926632, + "step": 13665 + }, + { + "epoch": 2.036044086982425, + "grad_norm": 0.033203125, + "learning_rate": 0.029999705635123143, + "loss": 0.7933, + "num_input_tokens_seen": 7929384, + "step": 13670 + }, + { + "epoch": 2.036788799523384, + "grad_norm": 0.014892578125, + "learning_rate": 0.029999693294150493, + "loss": 0.7814, + "num_input_tokens_seen": 7932200, + "step": 13675 + }, + { + "epoch": 2.037533512064343, + "grad_norm": 0.03759765625, + "learning_rate": 0.02999968069977327, + "loss": 0.8255, + "num_input_tokens_seen": 7935080, + "step": 13680 + }, + { + "epoch": 2.0382782246053024, + "grad_norm": 0.03857421875, + "learning_rate": 0.02999966785199169, + "loss": 0.8191, + "num_input_tokens_seen": 7937960, + "step": 13685 + }, + { + "epoch": 2.0390229371462616, + "grad_norm": 0.01263427734375, + "learning_rate": 0.02999965475080596, + "loss": 0.815, + "num_input_tokens_seen": 7941032, + "step": 13690 + }, + { + "epoch": 2.039767649687221, + "grad_norm": 0.021240234375, + "learning_rate": 0.02999964139621631, + "loss": 0.8103, + "num_input_tokens_seen": 7943976, + "step": 13695 + }, + { + "epoch": 2.04051236222818, + "grad_norm": 0.0296630859375, + "learning_rate": 0.029999627788222967, + "loss": 0.801, + "num_input_tokens_seen": 7946952, + "step": 13700 + }, + { + "epoch": 2.041257074769139, + "grad_norm": 0.02880859375, + "learning_rate": 0.029999613926826155, + "loss": 0.8152, + "num_input_tokens_seen": 7949736, + "step": 13705 + }, + { + "epoch": 2.0420017873100984, + "grad_norm": 0.013427734375, + "learning_rate": 0.02999959981202611, + "loss": 0.801, + "num_input_tokens_seen": 7952648, + "step": 13710 + }, + { + "epoch": 2.0427464998510576, + "grad_norm": 0.0255126953125, + "learning_rate": 0.02999958544382307, + "loss": 0.7953, + "num_input_tokens_seen": 7955912, + "step": 13715 + }, + { + "epoch": 2.043491212392017, + "grad_norm": 0.0128173828125, + "learning_rate": 0.029999570822217284, + "loss": 0.8124, + "num_input_tokens_seen": 7958856, + "step": 13720 + }, + { + "epoch": 2.044235924932976, + "grad_norm": 0.01324462890625, + "learning_rate": 0.02999955594720899, + "loss": 0.806, + "num_input_tokens_seen": 7961992, + "step": 13725 + }, + { + "epoch": 2.044980637473935, + "grad_norm": 0.0286865234375, + "learning_rate": 0.029999540818798444, + "loss": 0.8116, + "num_input_tokens_seen": 7964712, + "step": 13730 + }, + { + "epoch": 2.0457253500148944, + "grad_norm": 0.0146484375, + "learning_rate": 0.0299995254369859, + "loss": 0.8014, + "num_input_tokens_seen": 7967656, + "step": 13735 + }, + { + "epoch": 2.0464700625558536, + "grad_norm": 0.01483154296875, + "learning_rate": 0.029999509801771615, + "loss": 0.7826, + "num_input_tokens_seen": 7970696, + "step": 13740 + }, + { + "epoch": 2.047214775096813, + "grad_norm": 0.0257568359375, + "learning_rate": 0.029999493913155865, + "loss": 0.793, + "num_input_tokens_seen": 7973192, + "step": 13745 + }, + { + "epoch": 2.047959487637772, + "grad_norm": 0.0220947265625, + "learning_rate": 0.029999477771138903, + "loss": 0.8078, + "num_input_tokens_seen": 7976168, + "step": 13750 + }, + { + "epoch": 2.0487042001787312, + "grad_norm": 0.01416015625, + "learning_rate": 0.029999461375721015, + "loss": 0.8064, + "num_input_tokens_seen": 7979240, + "step": 13755 + }, + { + "epoch": 2.04944891271969, + "grad_norm": 0.0263671875, + "learning_rate": 0.02999944472690247, + "loss": 0.8208, + "num_input_tokens_seen": 7982088, + "step": 13760 + }, + { + "epoch": 2.050193625260649, + "grad_norm": 0.032470703125, + "learning_rate": 0.029999427824683546, + "loss": 0.8046, + "num_input_tokens_seen": 7984904, + "step": 13765 + }, + { + "epoch": 2.0509383378016084, + "grad_norm": 0.033203125, + "learning_rate": 0.02999941066906454, + "loss": 0.803, + "num_input_tokens_seen": 7987976, + "step": 13770 + }, + { + "epoch": 2.0516830503425676, + "grad_norm": 0.021728515625, + "learning_rate": 0.029999393260045734, + "loss": 0.8049, + "num_input_tokens_seen": 7990664, + "step": 13775 + }, + { + "epoch": 2.052427762883527, + "grad_norm": 0.02587890625, + "learning_rate": 0.029999375597627418, + "loss": 0.8068, + "num_input_tokens_seen": 7994856, + "step": 13780 + }, + { + "epoch": 2.053172475424486, + "grad_norm": 0.0205078125, + "learning_rate": 0.029999357681809904, + "loss": 0.8054, + "num_input_tokens_seen": 7997608, + "step": 13785 + }, + { + "epoch": 2.053917187965445, + "grad_norm": 0.032958984375, + "learning_rate": 0.02999933951259348, + "loss": 0.8118, + "num_input_tokens_seen": 8000488, + "step": 13790 + }, + { + "epoch": 2.0546619005064044, + "grad_norm": 0.01318359375, + "learning_rate": 0.029999321089978465, + "loss": 0.8059, + "num_input_tokens_seen": 8003368, + "step": 13795 + }, + { + "epoch": 2.0554066130473636, + "grad_norm": 0.02099609375, + "learning_rate": 0.029999302413965163, + "loss": 0.7983, + "num_input_tokens_seen": 8006280, + "step": 13800 + }, + { + "epoch": 2.056151325588323, + "grad_norm": 0.02099609375, + "learning_rate": 0.02999928348455389, + "loss": 0.7901, + "num_input_tokens_seen": 8009160, + "step": 13805 + }, + { + "epoch": 2.056896038129282, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02999926430174497, + "loss": 0.7801, + "num_input_tokens_seen": 8012168, + "step": 13810 + }, + { + "epoch": 2.057640750670241, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02999924486553872, + "loss": 0.8073, + "num_input_tokens_seen": 8015016, + "step": 13815 + }, + { + "epoch": 2.0583854632112004, + "grad_norm": 0.0272216796875, + "learning_rate": 0.029999225175935472, + "loss": 0.8287, + "num_input_tokens_seen": 8017864, + "step": 13820 + }, + { + "epoch": 2.0591301757521596, + "grad_norm": 0.0198974609375, + "learning_rate": 0.029999205232935564, + "loss": 0.8077, + "num_input_tokens_seen": 8020936, + "step": 13825 + }, + { + "epoch": 2.059874888293119, + "grad_norm": 0.023193359375, + "learning_rate": 0.029999185036539325, + "loss": 0.7836, + "num_input_tokens_seen": 8023592, + "step": 13830 + }, + { + "epoch": 2.060619600834078, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0299991645867471, + "loss": 0.8135, + "num_input_tokens_seen": 8026792, + "step": 13835 + }, + { + "epoch": 2.0613643133750372, + "grad_norm": 0.0225830078125, + "learning_rate": 0.02999914388355923, + "loss": 0.8265, + "num_input_tokens_seen": 8029448, + "step": 13840 + }, + { + "epoch": 2.0621090259159964, + "grad_norm": 0.0302734375, + "learning_rate": 0.02999912292697608, + "loss": 0.8245, + "num_input_tokens_seen": 8032424, + "step": 13845 + }, + { + "epoch": 2.0628537384569556, + "grad_norm": 0.030517578125, + "learning_rate": 0.029999101716997982, + "loss": 0.8041, + "num_input_tokens_seen": 8035080, + "step": 13850 + }, + { + "epoch": 2.063598450997915, + "grad_norm": 0.023193359375, + "learning_rate": 0.02999908025362531, + "loss": 0.8018, + "num_input_tokens_seen": 8038056, + "step": 13855 + }, + { + "epoch": 2.064343163538874, + "grad_norm": 0.033447265625, + "learning_rate": 0.02999905853685842, + "loss": 0.7937, + "num_input_tokens_seen": 8040840, + "step": 13860 + }, + { + "epoch": 2.0650878760798332, + "grad_norm": 0.0220947265625, + "learning_rate": 0.029999036566697675, + "loss": 0.7913, + "num_input_tokens_seen": 8043912, + "step": 13865 + }, + { + "epoch": 2.0658325886207924, + "grad_norm": 0.033203125, + "learning_rate": 0.02999901434314346, + "loss": 0.8102, + "num_input_tokens_seen": 8046824, + "step": 13870 + }, + { + "epoch": 2.0665773011617516, + "grad_norm": 0.036376953125, + "learning_rate": 0.029998991866196135, + "loss": 0.7817, + "num_input_tokens_seen": 8049704, + "step": 13875 + }, + { + "epoch": 2.067322013702711, + "grad_norm": 0.024169921875, + "learning_rate": 0.02999896913585609, + "loss": 0.7774, + "num_input_tokens_seen": 8052680, + "step": 13880 + }, + { + "epoch": 2.06806672624367, + "grad_norm": 0.020751953125, + "learning_rate": 0.029998946152123708, + "loss": 0.7859, + "num_input_tokens_seen": 8055400, + "step": 13885 + }, + { + "epoch": 2.0688114387846293, + "grad_norm": 0.03076171875, + "learning_rate": 0.029998922914999372, + "loss": 0.8237, + "num_input_tokens_seen": 8058312, + "step": 13890 + }, + { + "epoch": 2.0695561513255885, + "grad_norm": 0.01239013671875, + "learning_rate": 0.02999889942448348, + "loss": 0.7853, + "num_input_tokens_seen": 8061160, + "step": 13895 + }, + { + "epoch": 2.0703008638665477, + "grad_norm": 0.0235595703125, + "learning_rate": 0.029998875680576423, + "loss": 0.763, + "num_input_tokens_seen": 8064424, + "step": 13900 + }, + { + "epoch": 2.071045576407507, + "grad_norm": 0.033203125, + "learning_rate": 0.029998851683278607, + "loss": 0.7773, + "num_input_tokens_seen": 8067336, + "step": 13905 + }, + { + "epoch": 2.071790288948466, + "grad_norm": 0.0225830078125, + "learning_rate": 0.029998827432590438, + "loss": 0.811, + "num_input_tokens_seen": 8070600, + "step": 13910 + }, + { + "epoch": 2.0725350014894253, + "grad_norm": 0.039794921875, + "learning_rate": 0.029998802928512324, + "loss": 0.7897, + "num_input_tokens_seen": 8073352, + "step": 13915 + }, + { + "epoch": 2.0732797140303845, + "grad_norm": 0.037841796875, + "learning_rate": 0.029998778171044675, + "loss": 0.8355, + "num_input_tokens_seen": 8076456, + "step": 13920 + }, + { + "epoch": 2.0740244265713437, + "grad_norm": 0.01953125, + "learning_rate": 0.029998753160187915, + "loss": 0.7819, + "num_input_tokens_seen": 8079208, + "step": 13925 + }, + { + "epoch": 2.074769139112303, + "grad_norm": 0.01220703125, + "learning_rate": 0.029998727895942462, + "loss": 0.7878, + "num_input_tokens_seen": 8082280, + "step": 13930 + }, + { + "epoch": 2.0755138516532616, + "grad_norm": 0.015869140625, + "learning_rate": 0.029998702378308748, + "loss": 0.8167, + "num_input_tokens_seen": 8084968, + "step": 13935 + }, + { + "epoch": 2.076258564194221, + "grad_norm": 0.02734375, + "learning_rate": 0.0299986766072872, + "loss": 0.8235, + "num_input_tokens_seen": 8088008, + "step": 13940 + }, + { + "epoch": 2.07700327673518, + "grad_norm": 0.0264892578125, + "learning_rate": 0.029998650582878254, + "loss": 0.801, + "num_input_tokens_seen": 8091048, + "step": 13945 + }, + { + "epoch": 2.0777479892761392, + "grad_norm": 0.0206298828125, + "learning_rate": 0.029998624305082353, + "loss": 0.8475, + "num_input_tokens_seen": 8093992, + "step": 13950 + }, + { + "epoch": 2.0784927018170984, + "grad_norm": 0.0247802734375, + "learning_rate": 0.02999859777389994, + "loss": 0.8027, + "num_input_tokens_seen": 8096648, + "step": 13955 + }, + { + "epoch": 2.0792374143580576, + "grad_norm": 0.01434326171875, + "learning_rate": 0.029998570989331456, + "loss": 0.7849, + "num_input_tokens_seen": 8099624, + "step": 13960 + }, + { + "epoch": 2.079982126899017, + "grad_norm": 0.0240478515625, + "learning_rate": 0.02999854395137736, + "loss": 0.7933, + "num_input_tokens_seen": 8102504, + "step": 13965 + }, + { + "epoch": 2.080726839439976, + "grad_norm": 0.04736328125, + "learning_rate": 0.029998516660038107, + "loss": 0.7819, + "num_input_tokens_seen": 8105224, + "step": 13970 + }, + { + "epoch": 2.0814715519809353, + "grad_norm": 0.021728515625, + "learning_rate": 0.029998489115314164, + "loss": 0.8155, + "num_input_tokens_seen": 8108040, + "step": 13975 + }, + { + "epoch": 2.0822162645218945, + "grad_norm": 0.01123046875, + "learning_rate": 0.029998461317205988, + "loss": 0.7998, + "num_input_tokens_seen": 8111144, + "step": 13980 + }, + { + "epoch": 2.0829609770628537, + "grad_norm": 0.02197265625, + "learning_rate": 0.029998433265714053, + "loss": 0.8274, + "num_input_tokens_seen": 8114056, + "step": 13985 + }, + { + "epoch": 2.083705689603813, + "grad_norm": 0.0257568359375, + "learning_rate": 0.029998404960838828, + "loss": 0.8033, + "num_input_tokens_seen": 8116744, + "step": 13990 + }, + { + "epoch": 2.084450402144772, + "grad_norm": 0.037841796875, + "learning_rate": 0.029998376402580795, + "loss": 0.7938, + "num_input_tokens_seen": 8120008, + "step": 13995 + }, + { + "epoch": 2.0851951146857313, + "grad_norm": 0.0146484375, + "learning_rate": 0.029998347590940444, + "loss": 0.7749, + "num_input_tokens_seen": 8122664, + "step": 14000 + }, + { + "epoch": 2.0859398272266905, + "grad_norm": 0.0247802734375, + "learning_rate": 0.029998318525918246, + "loss": 0.7979, + "num_input_tokens_seen": 8125832, + "step": 14005 + }, + { + "epoch": 2.0866845397676497, + "grad_norm": 0.01373291015625, + "learning_rate": 0.0299982892075147, + "loss": 0.7916, + "num_input_tokens_seen": 8129032, + "step": 14010 + }, + { + "epoch": 2.087429252308609, + "grad_norm": 0.0223388671875, + "learning_rate": 0.029998259635730308, + "loss": 0.816, + "num_input_tokens_seen": 8131880, + "step": 14015 + }, + { + "epoch": 2.088173964849568, + "grad_norm": 0.033935546875, + "learning_rate": 0.02999822981056556, + "loss": 0.8454, + "num_input_tokens_seen": 8134632, + "step": 14020 + }, + { + "epoch": 2.0889186773905273, + "grad_norm": 0.0186767578125, + "learning_rate": 0.02999819973202096, + "loss": 0.8102, + "num_input_tokens_seen": 8137512, + "step": 14025 + }, + { + "epoch": 2.0896633899314865, + "grad_norm": 0.02099609375, + "learning_rate": 0.029998169400097023, + "loss": 0.8103, + "num_input_tokens_seen": 8140424, + "step": 14030 + }, + { + "epoch": 2.0904081024724457, + "grad_norm": 0.0224609375, + "learning_rate": 0.029998138814794253, + "loss": 0.8051, + "num_input_tokens_seen": 8143272, + "step": 14035 + }, + { + "epoch": 2.091152815013405, + "grad_norm": 0.0196533203125, + "learning_rate": 0.029998107976113176, + "loss": 0.8001, + "num_input_tokens_seen": 8146376, + "step": 14040 + }, + { + "epoch": 2.091897527554364, + "grad_norm": 0.02197265625, + "learning_rate": 0.029998076884054307, + "loss": 0.8042, + "num_input_tokens_seen": 8149224, + "step": 14045 + }, + { + "epoch": 2.0926422400953233, + "grad_norm": 0.0247802734375, + "learning_rate": 0.02999804553861817, + "loss": 0.8002, + "num_input_tokens_seen": 8152040, + "step": 14050 + }, + { + "epoch": 2.0933869526362825, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0299980139398053, + "loss": 0.8133, + "num_input_tokens_seen": 8154856, + "step": 14055 + }, + { + "epoch": 2.0941316651772417, + "grad_norm": 0.020751953125, + "learning_rate": 0.029997982087616226, + "loss": 0.8041, + "num_input_tokens_seen": 8157512, + "step": 14060 + }, + { + "epoch": 2.094876377718201, + "grad_norm": 0.01953125, + "learning_rate": 0.029997949982051488, + "loss": 0.8132, + "num_input_tokens_seen": 8160584, + "step": 14065 + }, + { + "epoch": 2.09562109025916, + "grad_norm": 0.01458740234375, + "learning_rate": 0.029997917623111627, + "loss": 0.8009, + "num_input_tokens_seen": 8163624, + "step": 14070 + }, + { + "epoch": 2.0963658028001193, + "grad_norm": 0.022705078125, + "learning_rate": 0.02999788501079719, + "loss": 0.8188, + "num_input_tokens_seen": 8166280, + "step": 14075 + }, + { + "epoch": 2.0971105153410785, + "grad_norm": 0.0216064453125, + "learning_rate": 0.029997852145108736, + "loss": 0.799, + "num_input_tokens_seen": 8169096, + "step": 14080 + }, + { + "epoch": 2.0978552278820377, + "grad_norm": 0.03515625, + "learning_rate": 0.02999781902604681, + "loss": 0.8106, + "num_input_tokens_seen": 8171944, + "step": 14085 + }, + { + "epoch": 2.098599940422997, + "grad_norm": 0.0260009765625, + "learning_rate": 0.02999778565361197, + "loss": 0.8047, + "num_input_tokens_seen": 8175240, + "step": 14090 + }, + { + "epoch": 2.099344652963956, + "grad_norm": 0.0208740234375, + "learning_rate": 0.02999775202780479, + "loss": 0.8035, + "num_input_tokens_seen": 8177896, + "step": 14095 + }, + { + "epoch": 2.1000893655049153, + "grad_norm": 0.01275634765625, + "learning_rate": 0.029997718148625827, + "loss": 0.8014, + "num_input_tokens_seen": 8180456, + "step": 14100 + }, + { + "epoch": 2.1008340780458745, + "grad_norm": 0.01300048828125, + "learning_rate": 0.029997684016075662, + "loss": 0.7978, + "num_input_tokens_seen": 8183272, + "step": 14105 + }, + { + "epoch": 2.1015787905868333, + "grad_norm": 0.0233154296875, + "learning_rate": 0.02999764963015487, + "loss": 0.8039, + "num_input_tokens_seen": 8186280, + "step": 14110 + }, + { + "epoch": 2.1023235031277925, + "grad_norm": 0.0238037109375, + "learning_rate": 0.029997614990864025, + "loss": 0.8012, + "num_input_tokens_seen": 8189288, + "step": 14115 + }, + { + "epoch": 2.1030682156687517, + "grad_norm": 0.0294189453125, + "learning_rate": 0.029997580098203726, + "loss": 0.796, + "num_input_tokens_seen": 8192136, + "step": 14120 + }, + { + "epoch": 2.103812928209711, + "grad_norm": 0.032958984375, + "learning_rate": 0.02999754495217455, + "loss": 0.7822, + "num_input_tokens_seen": 8194952, + "step": 14125 + }, + { + "epoch": 2.10455764075067, + "grad_norm": 0.04541015625, + "learning_rate": 0.029997509552777094, + "loss": 0.8283, + "num_input_tokens_seen": 8197736, + "step": 14130 + }, + { + "epoch": 2.1053023532916293, + "grad_norm": 0.0211181640625, + "learning_rate": 0.029997473900011958, + "loss": 0.8176, + "num_input_tokens_seen": 8200584, + "step": 14135 + }, + { + "epoch": 2.1060470658325885, + "grad_norm": 0.0235595703125, + "learning_rate": 0.02999743799387974, + "loss": 0.8226, + "num_input_tokens_seen": 8203560, + "step": 14140 + }, + { + "epoch": 2.1067917783735477, + "grad_norm": 0.0247802734375, + "learning_rate": 0.029997401834381054, + "loss": 0.8106, + "num_input_tokens_seen": 8206344, + "step": 14145 + }, + { + "epoch": 2.107536490914507, + "grad_norm": 0.0296630859375, + "learning_rate": 0.029997365421516504, + "loss": 0.7908, + "num_input_tokens_seen": 8209128, + "step": 14150 + }, + { + "epoch": 2.108281203455466, + "grad_norm": 0.03466796875, + "learning_rate": 0.02999732875528671, + "loss": 0.8042, + "num_input_tokens_seen": 8211976, + "step": 14155 + }, + { + "epoch": 2.1090259159964253, + "grad_norm": 0.01513671875, + "learning_rate": 0.029997291835692285, + "loss": 0.8012, + "num_input_tokens_seen": 8214920, + "step": 14160 + }, + { + "epoch": 2.1097706285373845, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02999725466273386, + "loss": 0.7968, + "num_input_tokens_seen": 8217992, + "step": 14165 + }, + { + "epoch": 2.1105153410783437, + "grad_norm": 0.0294189453125, + "learning_rate": 0.029997217236412063, + "loss": 0.7859, + "num_input_tokens_seen": 8221000, + "step": 14170 + }, + { + "epoch": 2.111260053619303, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02999717955672752, + "loss": 0.7779, + "num_input_tokens_seen": 8224008, + "step": 14175 + }, + { + "epoch": 2.112004766160262, + "grad_norm": 0.0185546875, + "learning_rate": 0.029997141623680872, + "loss": 0.7939, + "num_input_tokens_seen": 8226760, + "step": 14180 + }, + { + "epoch": 2.1127494787012213, + "grad_norm": 0.0245361328125, + "learning_rate": 0.02999710343727276, + "loss": 0.8169, + "num_input_tokens_seen": 8229768, + "step": 14185 + }, + { + "epoch": 2.1134941912421805, + "grad_norm": 0.028076171875, + "learning_rate": 0.029997064997503826, + "loss": 0.8045, + "num_input_tokens_seen": 8232744, + "step": 14190 + }, + { + "epoch": 2.1142389037831397, + "grad_norm": 0.0260009765625, + "learning_rate": 0.029997026304374716, + "loss": 0.8103, + "num_input_tokens_seen": 8235880, + "step": 14195 + }, + { + "epoch": 2.114983616324099, + "grad_norm": 0.024658203125, + "learning_rate": 0.0299969873578861, + "loss": 0.8067, + "num_input_tokens_seen": 8238920, + "step": 14200 + }, + { + "epoch": 2.115728328865058, + "grad_norm": 0.021240234375, + "learning_rate": 0.029996948158038616, + "loss": 0.8039, + "num_input_tokens_seen": 8241896, + "step": 14205 + }, + { + "epoch": 2.1164730414060173, + "grad_norm": 0.0263671875, + "learning_rate": 0.029996908704832938, + "loss": 0.8155, + "num_input_tokens_seen": 8244744, + "step": 14210 + }, + { + "epoch": 2.1172177539469765, + "grad_norm": 0.01275634765625, + "learning_rate": 0.02999686899826973, + "loss": 0.7796, + "num_input_tokens_seen": 8247752, + "step": 14215 + }, + { + "epoch": 2.1179624664879357, + "grad_norm": 0.042724609375, + "learning_rate": 0.02999682903834966, + "loss": 0.8232, + "num_input_tokens_seen": 8250536, + "step": 14220 + }, + { + "epoch": 2.118707179028895, + "grad_norm": 0.032958984375, + "learning_rate": 0.02999678882507341, + "loss": 0.81, + "num_input_tokens_seen": 8253480, + "step": 14225 + }, + { + "epoch": 2.119451891569854, + "grad_norm": 0.0233154296875, + "learning_rate": 0.029996748358441648, + "loss": 0.7979, + "num_input_tokens_seen": 8256328, + "step": 14230 + }, + { + "epoch": 2.1201966041108133, + "grad_norm": 0.021240234375, + "learning_rate": 0.029996707638455068, + "loss": 0.7942, + "num_input_tokens_seen": 8259176, + "step": 14235 + }, + { + "epoch": 2.1209413166517725, + "grad_norm": 0.0140380859375, + "learning_rate": 0.029996666665114358, + "loss": 0.8091, + "num_input_tokens_seen": 8261992, + "step": 14240 + }, + { + "epoch": 2.1216860291927317, + "grad_norm": 0.023193359375, + "learning_rate": 0.029996625438420202, + "loss": 0.7916, + "num_input_tokens_seen": 8265032, + "step": 14245 + }, + { + "epoch": 2.122430741733691, + "grad_norm": 0.021240234375, + "learning_rate": 0.029996583958373305, + "loss": 0.819, + "num_input_tokens_seen": 8268168, + "step": 14250 + }, + { + "epoch": 2.12317545427465, + "grad_norm": 0.023681640625, + "learning_rate": 0.02999654222497436, + "loss": 0.8019, + "num_input_tokens_seen": 8271208, + "step": 14255 + }, + { + "epoch": 2.1239201668156094, + "grad_norm": 0.02587890625, + "learning_rate": 0.029996500238224075, + "loss": 0.8086, + "num_input_tokens_seen": 8274184, + "step": 14260 + }, + { + "epoch": 2.1246648793565686, + "grad_norm": 0.0263671875, + "learning_rate": 0.029996457998123165, + "loss": 0.8015, + "num_input_tokens_seen": 8277064, + "step": 14265 + }, + { + "epoch": 2.1254095918975278, + "grad_norm": 0.029052734375, + "learning_rate": 0.029996415504672337, + "loss": 0.8082, + "num_input_tokens_seen": 8280200, + "step": 14270 + }, + { + "epoch": 2.1261543044384865, + "grad_norm": 0.0196533203125, + "learning_rate": 0.029996372757872307, + "loss": 0.7967, + "num_input_tokens_seen": 8282888, + "step": 14275 + }, + { + "epoch": 2.126899016979446, + "grad_norm": 0.034912109375, + "learning_rate": 0.029996329757723806, + "loss": 0.8034, + "num_input_tokens_seen": 8285704, + "step": 14280 + }, + { + "epoch": 2.127643729520405, + "grad_norm": 0.0201416015625, + "learning_rate": 0.029996286504227556, + "loss": 0.8002, + "num_input_tokens_seen": 8288520, + "step": 14285 + }, + { + "epoch": 2.128388442061364, + "grad_norm": 0.0272216796875, + "learning_rate": 0.029996242997384282, + "loss": 0.8086, + "num_input_tokens_seen": 8291304, + "step": 14290 + }, + { + "epoch": 2.1291331546023233, + "grad_norm": 0.0205078125, + "learning_rate": 0.029996199237194727, + "loss": 0.8054, + "num_input_tokens_seen": 8294120, + "step": 14295 + }, + { + "epoch": 2.1298778671432825, + "grad_norm": 0.0252685546875, + "learning_rate": 0.02999615522365963, + "loss": 0.8054, + "num_input_tokens_seen": 8297160, + "step": 14300 + }, + { + "epoch": 2.1306225796842417, + "grad_norm": 0.0230712890625, + "learning_rate": 0.02999611095677973, + "loss": 0.8041, + "num_input_tokens_seen": 8299944, + "step": 14305 + }, + { + "epoch": 2.131367292225201, + "grad_norm": 0.0283203125, + "learning_rate": 0.029996066436555776, + "loss": 0.8047, + "num_input_tokens_seen": 8302920, + "step": 14310 + }, + { + "epoch": 2.13211200476616, + "grad_norm": 0.026611328125, + "learning_rate": 0.02999602166298852, + "loss": 0.8015, + "num_input_tokens_seen": 8306024, + "step": 14315 + }, + { + "epoch": 2.1328567173071193, + "grad_norm": 0.03466796875, + "learning_rate": 0.029995976636078718, + "loss": 0.8144, + "num_input_tokens_seen": 8308840, + "step": 14320 + }, + { + "epoch": 2.1336014298480785, + "grad_norm": 0.0279541015625, + "learning_rate": 0.029995931355827135, + "loss": 0.7978, + "num_input_tokens_seen": 8312008, + "step": 14325 + }, + { + "epoch": 2.1343461423890377, + "grad_norm": 0.01123046875, + "learning_rate": 0.02999588582223453, + "loss": 0.7956, + "num_input_tokens_seen": 8314856, + "step": 14330 + }, + { + "epoch": 2.135090854929997, + "grad_norm": 0.0322265625, + "learning_rate": 0.029995840035301678, + "loss": 0.8089, + "num_input_tokens_seen": 8317608, + "step": 14335 + }, + { + "epoch": 2.135835567470956, + "grad_norm": 0.01544189453125, + "learning_rate": 0.02999579399502935, + "loss": 0.8102, + "num_input_tokens_seen": 8320520, + "step": 14340 + }, + { + "epoch": 2.1365802800119154, + "grad_norm": 0.01458740234375, + "learning_rate": 0.02999574770141832, + "loss": 0.8273, + "num_input_tokens_seen": 8323272, + "step": 14345 + }, + { + "epoch": 2.1373249925528746, + "grad_norm": 0.033447265625, + "learning_rate": 0.029995701154469378, + "loss": 0.8137, + "num_input_tokens_seen": 8326184, + "step": 14350 + }, + { + "epoch": 2.1380697050938338, + "grad_norm": 0.0257568359375, + "learning_rate": 0.029995654354183304, + "loss": 0.8006, + "num_input_tokens_seen": 8329192, + "step": 14355 + }, + { + "epoch": 2.138814417634793, + "grad_norm": 0.01507568359375, + "learning_rate": 0.02999560730056089, + "loss": 0.8051, + "num_input_tokens_seen": 8332392, + "step": 14360 + }, + { + "epoch": 2.139559130175752, + "grad_norm": 0.026123046875, + "learning_rate": 0.029995559993602927, + "loss": 0.8021, + "num_input_tokens_seen": 8335432, + "step": 14365 + }, + { + "epoch": 2.1403038427167114, + "grad_norm": 0.0234375, + "learning_rate": 0.029995512433310226, + "loss": 0.814, + "num_input_tokens_seen": 8338216, + "step": 14370 + }, + { + "epoch": 2.1410485552576706, + "grad_norm": 0.02392578125, + "learning_rate": 0.029995464619683578, + "loss": 0.8026, + "num_input_tokens_seen": 8341224, + "step": 14375 + }, + { + "epoch": 2.1417932677986298, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0299954165527238, + "loss": 0.8044, + "num_input_tokens_seen": 8344104, + "step": 14380 + }, + { + "epoch": 2.142537980339589, + "grad_norm": 0.022216796875, + "learning_rate": 0.0299953682324317, + "loss": 0.8069, + "num_input_tokens_seen": 8346792, + "step": 14385 + }, + { + "epoch": 2.143282692880548, + "grad_norm": 0.01611328125, + "learning_rate": 0.029995319658808098, + "loss": 0.7966, + "num_input_tokens_seen": 8349864, + "step": 14390 + }, + { + "epoch": 2.1440274054215074, + "grad_norm": 0.0262451171875, + "learning_rate": 0.029995270831853802, + "loss": 0.8168, + "num_input_tokens_seen": 8352904, + "step": 14395 + }, + { + "epoch": 2.1447721179624666, + "grad_norm": 0.0240478515625, + "learning_rate": 0.02999522175156965, + "loss": 0.813, + "num_input_tokens_seen": 8355848, + "step": 14400 + }, + { + "epoch": 2.145516830503426, + "grad_norm": 0.04638671875, + "learning_rate": 0.029995172417956465, + "loss": 0.7967, + "num_input_tokens_seen": 8358792, + "step": 14405 + }, + { + "epoch": 2.146261543044385, + "grad_norm": 0.02392578125, + "learning_rate": 0.029995122831015086, + "loss": 0.7977, + "num_input_tokens_seen": 8361224, + "step": 14410 + }, + { + "epoch": 2.147006255585344, + "grad_norm": 0.014892578125, + "learning_rate": 0.029995072990746347, + "loss": 0.8086, + "num_input_tokens_seen": 8364360, + "step": 14415 + }, + { + "epoch": 2.1477509681263034, + "grad_norm": 0.027587890625, + "learning_rate": 0.02999502289715109, + "loss": 0.7939, + "num_input_tokens_seen": 8367016, + "step": 14420 + }, + { + "epoch": 2.1484956806672626, + "grad_norm": 0.01556396484375, + "learning_rate": 0.029994972550230155, + "loss": 0.8085, + "num_input_tokens_seen": 8369928, + "step": 14425 + }, + { + "epoch": 2.149240393208222, + "grad_norm": 0.02978515625, + "learning_rate": 0.0299949219499844, + "loss": 0.8051, + "num_input_tokens_seen": 8372712, + "step": 14430 + }, + { + "epoch": 2.149985105749181, + "grad_norm": 0.01336669921875, + "learning_rate": 0.029994871096414688, + "loss": 0.8002, + "num_input_tokens_seen": 8375560, + "step": 14435 + }, + { + "epoch": 2.15072981829014, + "grad_norm": 0.0218505859375, + "learning_rate": 0.029994819989521863, + "loss": 0.8003, + "num_input_tokens_seen": 8378536, + "step": 14440 + }, + { + "epoch": 2.1514745308310994, + "grad_norm": 0.0208740234375, + "learning_rate": 0.029994768629306796, + "loss": 0.7941, + "num_input_tokens_seen": 8381288, + "step": 14445 + }, + { + "epoch": 2.152219243372058, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02999471701577035, + "loss": 0.8137, + "num_input_tokens_seen": 8384200, + "step": 14450 + }, + { + "epoch": 2.1529639559130174, + "grad_norm": 0.021728515625, + "learning_rate": 0.0299946651489134, + "loss": 0.8017, + "num_input_tokens_seen": 8387208, + "step": 14455 + }, + { + "epoch": 2.1537086684539766, + "grad_norm": 0.027099609375, + "learning_rate": 0.029994613028736825, + "loss": 0.8036, + "num_input_tokens_seen": 8390280, + "step": 14460 + }, + { + "epoch": 2.1544533809949358, + "grad_norm": 0.01495361328125, + "learning_rate": 0.029994560655241503, + "loss": 0.8059, + "num_input_tokens_seen": 8393128, + "step": 14465 + }, + { + "epoch": 2.155198093535895, + "grad_norm": 0.0263671875, + "learning_rate": 0.02999450802842832, + "loss": 0.8132, + "num_input_tokens_seen": 8396136, + "step": 14470 + }, + { + "epoch": 2.155942806076854, + "grad_norm": 0.01239013671875, + "learning_rate": 0.029994455148298158, + "loss": 0.7956, + "num_input_tokens_seen": 8399208, + "step": 14475 + }, + { + "epoch": 2.1566875186178134, + "grad_norm": 0.03564453125, + "learning_rate": 0.02999440201485192, + "loss": 0.7949, + "num_input_tokens_seen": 8402152, + "step": 14480 + }, + { + "epoch": 2.1574322311587726, + "grad_norm": 0.01458740234375, + "learning_rate": 0.029994348628090497, + "loss": 0.7961, + "num_input_tokens_seen": 8405064, + "step": 14485 + }, + { + "epoch": 2.158176943699732, + "grad_norm": 0.02099609375, + "learning_rate": 0.029994294988014796, + "loss": 0.7812, + "num_input_tokens_seen": 8408808, + "step": 14490 + }, + { + "epoch": 2.158921656240691, + "grad_norm": 0.0322265625, + "learning_rate": 0.029994241094625715, + "loss": 0.8055, + "num_input_tokens_seen": 8411880, + "step": 14495 + }, + { + "epoch": 2.15966636878165, + "grad_norm": 0.02197265625, + "learning_rate": 0.02999418694792418, + "loss": 0.8014, + "num_input_tokens_seen": 8414568, + "step": 14500 + }, + { + "epoch": 2.1604110813226094, + "grad_norm": 0.02197265625, + "learning_rate": 0.029994132547911088, + "loss": 0.7824, + "num_input_tokens_seen": 8417192, + "step": 14505 + }, + { + "epoch": 2.1611557938635686, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02999407789458737, + "loss": 0.7915, + "num_input_tokens_seen": 8420200, + "step": 14510 + }, + { + "epoch": 2.161900506404528, + "grad_norm": 0.03369140625, + "learning_rate": 0.029994022987953943, + "loss": 0.78, + "num_input_tokens_seen": 8422952, + "step": 14515 + }, + { + "epoch": 2.162645218945487, + "grad_norm": 0.031494140625, + "learning_rate": 0.02999396782801174, + "loss": 0.8278, + "num_input_tokens_seen": 8425544, + "step": 14520 + }, + { + "epoch": 2.163389931486446, + "grad_norm": 0.031494140625, + "learning_rate": 0.029993912414761684, + "loss": 0.8038, + "num_input_tokens_seen": 8428616, + "step": 14525 + }, + { + "epoch": 2.1641346440274054, + "grad_norm": 0.038818359375, + "learning_rate": 0.02999385674820472, + "loss": 0.8085, + "num_input_tokens_seen": 8431592, + "step": 14530 + }, + { + "epoch": 2.1648793565683646, + "grad_norm": 0.033935546875, + "learning_rate": 0.029993800828341785, + "loss": 0.7958, + "num_input_tokens_seen": 8434568, + "step": 14535 + }, + { + "epoch": 2.165624069109324, + "grad_norm": 0.0244140625, + "learning_rate": 0.02999374465517383, + "loss": 0.7964, + "num_input_tokens_seen": 8437288, + "step": 14540 + }, + { + "epoch": 2.166368781650283, + "grad_norm": 0.0220947265625, + "learning_rate": 0.029993688228701788, + "loss": 0.82, + "num_input_tokens_seen": 8439976, + "step": 14545 + }, + { + "epoch": 2.167113494191242, + "grad_norm": 0.034912109375, + "learning_rate": 0.02999363154892663, + "loss": 0.822, + "num_input_tokens_seen": 8442728, + "step": 14550 + }, + { + "epoch": 2.1678582067322014, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0299935746158493, + "loss": 0.8073, + "num_input_tokens_seen": 8445608, + "step": 14555 + }, + { + "epoch": 2.1686029192731606, + "grad_norm": 0.01904296875, + "learning_rate": 0.02999351742947077, + "loss": 0.7936, + "num_input_tokens_seen": 8448360, + "step": 14560 + }, + { + "epoch": 2.16934763181412, + "grad_norm": 0.0247802734375, + "learning_rate": 0.029993459989792002, + "loss": 0.8128, + "num_input_tokens_seen": 8451368, + "step": 14565 + }, + { + "epoch": 2.170092344355079, + "grad_norm": 0.0233154296875, + "learning_rate": 0.02999340229681396, + "loss": 0.8041, + "num_input_tokens_seen": 8453960, + "step": 14570 + }, + { + "epoch": 2.1708370568960382, + "grad_norm": 0.01544189453125, + "learning_rate": 0.029993344350537632, + "loss": 0.7979, + "num_input_tokens_seen": 8456872, + "step": 14575 + }, + { + "epoch": 2.1715817694369974, + "grad_norm": 0.0255126953125, + "learning_rate": 0.029993286150963985, + "loss": 0.8056, + "num_input_tokens_seen": 8459656, + "step": 14580 + }, + { + "epoch": 2.1723264819779566, + "grad_norm": 0.0244140625, + "learning_rate": 0.029993227698094014, + "loss": 0.7876, + "num_input_tokens_seen": 8462536, + "step": 14585 + }, + { + "epoch": 2.173071194518916, + "grad_norm": 0.034912109375, + "learning_rate": 0.02999316899192869, + "loss": 0.7832, + "num_input_tokens_seen": 8465544, + "step": 14590 + }, + { + "epoch": 2.173815907059875, + "grad_norm": 0.0255126953125, + "learning_rate": 0.029993110032469024, + "loss": 0.792, + "num_input_tokens_seen": 8468424, + "step": 14595 + }, + { + "epoch": 2.1745606196008342, + "grad_norm": 0.03662109375, + "learning_rate": 0.029993050819715997, + "loss": 0.8075, + "num_input_tokens_seen": 8471496, + "step": 14600 + }, + { + "epoch": 2.1753053321417934, + "grad_norm": 0.0216064453125, + "learning_rate": 0.029992991353670614, + "loss": 0.8226, + "num_input_tokens_seen": 8474728, + "step": 14605 + }, + { + "epoch": 2.1760500446827526, + "grad_norm": 0.035400390625, + "learning_rate": 0.029992931634333878, + "loss": 0.8403, + "num_input_tokens_seen": 8477320, + "step": 14610 + }, + { + "epoch": 2.176794757223712, + "grad_norm": 0.0223388671875, + "learning_rate": 0.029992871661706805, + "loss": 0.7986, + "num_input_tokens_seen": 8480136, + "step": 14615 + }, + { + "epoch": 2.177539469764671, + "grad_norm": 0.0238037109375, + "learning_rate": 0.029992811435790398, + "loss": 0.8088, + "num_input_tokens_seen": 8482952, + "step": 14620 + }, + { + "epoch": 2.17828418230563, + "grad_norm": 0.0220947265625, + "learning_rate": 0.029992750956585684, + "loss": 0.8334, + "num_input_tokens_seen": 8485736, + "step": 14625 + }, + { + "epoch": 2.179028894846589, + "grad_norm": 0.0262451171875, + "learning_rate": 0.02999269022409368, + "loss": 0.8134, + "num_input_tokens_seen": 8488360, + "step": 14630 + }, + { + "epoch": 2.179773607387548, + "grad_norm": 0.0301513671875, + "learning_rate": 0.029992629238315412, + "loss": 0.7946, + "num_input_tokens_seen": 8491368, + "step": 14635 + }, + { + "epoch": 2.1805183199285074, + "grad_norm": 0.0286865234375, + "learning_rate": 0.02999256799925191, + "loss": 0.7886, + "num_input_tokens_seen": 8494120, + "step": 14640 + }, + { + "epoch": 2.1812630324694666, + "grad_norm": 0.0341796875, + "learning_rate": 0.02999250650690421, + "loss": 0.7989, + "num_input_tokens_seen": 8497000, + "step": 14645 + }, + { + "epoch": 2.182007745010426, + "grad_norm": 0.025146484375, + "learning_rate": 0.029992444761273353, + "loss": 0.7921, + "num_input_tokens_seen": 8499784, + "step": 14650 + }, + { + "epoch": 2.182752457551385, + "grad_norm": 0.0308837890625, + "learning_rate": 0.02999238276236038, + "loss": 0.7884, + "num_input_tokens_seen": 8502632, + "step": 14655 + }, + { + "epoch": 2.1834971700923442, + "grad_norm": 0.0233154296875, + "learning_rate": 0.029992320510166332, + "loss": 0.7844, + "num_input_tokens_seen": 8505352, + "step": 14660 + }, + { + "epoch": 2.1842418826333034, + "grad_norm": 0.0245361328125, + "learning_rate": 0.02999225800469227, + "loss": 0.8152, + "num_input_tokens_seen": 8508264, + "step": 14665 + }, + { + "epoch": 2.1849865951742626, + "grad_norm": 0.0235595703125, + "learning_rate": 0.02999219524593925, + "loss": 0.7646, + "num_input_tokens_seen": 8510952, + "step": 14670 + }, + { + "epoch": 2.185731307715222, + "grad_norm": 0.022216796875, + "learning_rate": 0.029992132233908324, + "loss": 0.7855, + "num_input_tokens_seen": 8514312, + "step": 14675 + }, + { + "epoch": 2.186476020256181, + "grad_norm": 0.059814453125, + "learning_rate": 0.02999206896860056, + "loss": 0.8245, + "num_input_tokens_seen": 8517224, + "step": 14680 + }, + { + "epoch": 2.1872207327971402, + "grad_norm": 0.02734375, + "learning_rate": 0.02999200545001703, + "loss": 0.8052, + "num_input_tokens_seen": 8520008, + "step": 14685 + }, + { + "epoch": 2.1879654453380994, + "grad_norm": 0.044189453125, + "learning_rate": 0.029991941678158805, + "loss": 0.8445, + "num_input_tokens_seen": 8522952, + "step": 14690 + }, + { + "epoch": 2.1887101578790586, + "grad_norm": 0.0230712890625, + "learning_rate": 0.02999187765302696, + "loss": 0.7854, + "num_input_tokens_seen": 8526088, + "step": 14695 + }, + { + "epoch": 2.189454870420018, + "grad_norm": 0.0234375, + "learning_rate": 0.029991813374622586, + "loss": 0.8308, + "num_input_tokens_seen": 8529128, + "step": 14700 + }, + { + "epoch": 2.190199582960977, + "grad_norm": 0.0301513671875, + "learning_rate": 0.02999174884294676, + "loss": 0.7855, + "num_input_tokens_seen": 8531880, + "step": 14705 + }, + { + "epoch": 2.1909442955019363, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02999168405800057, + "loss": 0.7891, + "num_input_tokens_seen": 8534888, + "step": 14710 + }, + { + "epoch": 2.1916890080428955, + "grad_norm": 0.0135498046875, + "learning_rate": 0.02999161901978512, + "loss": 0.7993, + "num_input_tokens_seen": 8537992, + "step": 14715 + }, + { + "epoch": 2.1924337205838547, + "grad_norm": 0.0281982421875, + "learning_rate": 0.029991553728301496, + "loss": 0.8222, + "num_input_tokens_seen": 8541032, + "step": 14720 + }, + { + "epoch": 2.193178433124814, + "grad_norm": 0.017822265625, + "learning_rate": 0.02999148818355081, + "loss": 0.7913, + "num_input_tokens_seen": 8543944, + "step": 14725 + }, + { + "epoch": 2.193923145665773, + "grad_norm": 0.03076171875, + "learning_rate": 0.02999142238553417, + "loss": 0.8071, + "num_input_tokens_seen": 8546824, + "step": 14730 + }, + { + "epoch": 2.1946678582067323, + "grad_norm": 0.027587890625, + "learning_rate": 0.02999135633425269, + "loss": 0.8433, + "num_input_tokens_seen": 8549704, + "step": 14735 + }, + { + "epoch": 2.1954125707476915, + "grad_norm": 0.0247802734375, + "learning_rate": 0.029991290029707476, + "loss": 0.7842, + "num_input_tokens_seen": 8552488, + "step": 14740 + }, + { + "epoch": 2.1961572832886507, + "grad_norm": 0.0262451171875, + "learning_rate": 0.029991223471899652, + "loss": 0.8182, + "num_input_tokens_seen": 8555560, + "step": 14745 + }, + { + "epoch": 2.19690199582961, + "grad_norm": 0.01483154296875, + "learning_rate": 0.029991156660830347, + "loss": 0.8071, + "num_input_tokens_seen": 8558760, + "step": 14750 + }, + { + "epoch": 2.197646708370569, + "grad_norm": 0.01251220703125, + "learning_rate": 0.029991089596500684, + "loss": 0.7868, + "num_input_tokens_seen": 8561544, + "step": 14755 + }, + { + "epoch": 2.1983914209115283, + "grad_norm": 0.022216796875, + "learning_rate": 0.029991022278911803, + "loss": 0.7933, + "num_input_tokens_seen": 8564456, + "step": 14760 + }, + { + "epoch": 2.1991361334524875, + "grad_norm": 0.026611328125, + "learning_rate": 0.029990954708064833, + "loss": 0.8026, + "num_input_tokens_seen": 8567016, + "step": 14765 + }, + { + "epoch": 2.1998808459934467, + "grad_norm": 0.02099609375, + "learning_rate": 0.029990886883960923, + "loss": 0.8145, + "num_input_tokens_seen": 8569736, + "step": 14770 + }, + { + "epoch": 2.200625558534406, + "grad_norm": 0.0284423828125, + "learning_rate": 0.029990818806601212, + "loss": 0.8002, + "num_input_tokens_seen": 8572680, + "step": 14775 + }, + { + "epoch": 2.201370271075365, + "grad_norm": 0.0303955078125, + "learning_rate": 0.029990750475986854, + "loss": 0.8248, + "num_input_tokens_seen": 8575656, + "step": 14780 + }, + { + "epoch": 2.2021149836163243, + "grad_norm": 0.01275634765625, + "learning_rate": 0.029990681892119002, + "loss": 0.8048, + "num_input_tokens_seen": 8578632, + "step": 14785 + }, + { + "epoch": 2.202859696157283, + "grad_norm": 0.0228271484375, + "learning_rate": 0.02999061305499882, + "loss": 0.815, + "num_input_tokens_seen": 8581512, + "step": 14790 + }, + { + "epoch": 2.2036044086982427, + "grad_norm": 0.02734375, + "learning_rate": 0.02999054396462746, + "loss": 0.8262, + "num_input_tokens_seen": 8584584, + "step": 14795 + }, + { + "epoch": 2.2043491212392015, + "grad_norm": 0.03564453125, + "learning_rate": 0.029990474621006097, + "loss": 0.8018, + "num_input_tokens_seen": 8587688, + "step": 14800 + }, + { + "epoch": 2.2050938337801607, + "grad_norm": 0.03369140625, + "learning_rate": 0.029990405024135903, + "loss": 0.8002, + "num_input_tokens_seen": 8590664, + "step": 14805 + }, + { + "epoch": 2.20583854632112, + "grad_norm": 0.01373291015625, + "learning_rate": 0.029990335174018046, + "loss": 0.8064, + "num_input_tokens_seen": 8593480, + "step": 14810 + }, + { + "epoch": 2.206583258862079, + "grad_norm": 0.0289306640625, + "learning_rate": 0.02999026507065372, + "loss": 0.8034, + "num_input_tokens_seen": 8596168, + "step": 14815 + }, + { + "epoch": 2.2073279714030383, + "grad_norm": 0.024169921875, + "learning_rate": 0.029990194714044096, + "loss": 0.7989, + "num_input_tokens_seen": 8599048, + "step": 14820 + }, + { + "epoch": 2.2080726839439975, + "grad_norm": 0.03369140625, + "learning_rate": 0.029990124104190368, + "loss": 0.8045, + "num_input_tokens_seen": 8601864, + "step": 14825 + }, + { + "epoch": 2.2088173964849567, + "grad_norm": 0.021484375, + "learning_rate": 0.029990053241093727, + "loss": 0.8079, + "num_input_tokens_seen": 8604776, + "step": 14830 + }, + { + "epoch": 2.209562109025916, + "grad_norm": 0.020751953125, + "learning_rate": 0.029989982124755375, + "loss": 0.808, + "num_input_tokens_seen": 8607816, + "step": 14835 + }, + { + "epoch": 2.210306821566875, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02998991075517651, + "loss": 0.8121, + "num_input_tokens_seen": 8610536, + "step": 14840 + }, + { + "epoch": 2.2110515341078343, + "grad_norm": 0.0341796875, + "learning_rate": 0.029989839132358337, + "loss": 0.7963, + "num_input_tokens_seen": 8613480, + "step": 14845 + }, + { + "epoch": 2.2117962466487935, + "grad_norm": 0.021728515625, + "learning_rate": 0.029989767256302067, + "loss": 0.804, + "num_input_tokens_seen": 8616296, + "step": 14850 + }, + { + "epoch": 2.2125409591897527, + "grad_norm": 0.013916015625, + "learning_rate": 0.029989695127008914, + "loss": 0.8021, + "num_input_tokens_seen": 8619464, + "step": 14855 + }, + { + "epoch": 2.213285671730712, + "grad_norm": 0.033447265625, + "learning_rate": 0.029989622744480096, + "loss": 0.8188, + "num_input_tokens_seen": 8622312, + "step": 14860 + }, + { + "epoch": 2.214030384271671, + "grad_norm": 0.0206298828125, + "learning_rate": 0.029989550108716834, + "loss": 0.8116, + "num_input_tokens_seen": 8625000, + "step": 14865 + }, + { + "epoch": 2.2147750968126303, + "grad_norm": 0.0284423828125, + "learning_rate": 0.02998947721972036, + "loss": 0.8235, + "num_input_tokens_seen": 8628040, + "step": 14870 + }, + { + "epoch": 2.2155198093535895, + "grad_norm": 0.0189208984375, + "learning_rate": 0.029989404077491906, + "loss": 0.8082, + "num_input_tokens_seen": 8630792, + "step": 14875 + }, + { + "epoch": 2.2162645218945487, + "grad_norm": 0.0269775390625, + "learning_rate": 0.029989330682032703, + "loss": 0.8029, + "num_input_tokens_seen": 8633736, + "step": 14880 + }, + { + "epoch": 2.217009234435508, + "grad_norm": 0.0147705078125, + "learning_rate": 0.029989257033343993, + "loss": 0.809, + "num_input_tokens_seen": 8636840, + "step": 14885 + }, + { + "epoch": 2.217753946976467, + "grad_norm": 0.035888671875, + "learning_rate": 0.029989183131427018, + "loss": 0.8032, + "num_input_tokens_seen": 8639816, + "step": 14890 + }, + { + "epoch": 2.2184986595174263, + "grad_norm": 0.0250244140625, + "learning_rate": 0.029989108976283027, + "loss": 0.8014, + "num_input_tokens_seen": 8642536, + "step": 14895 + }, + { + "epoch": 2.2192433720583855, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02998903456791328, + "loss": 0.8014, + "num_input_tokens_seen": 8645576, + "step": 14900 + }, + { + "epoch": 2.2199880845993447, + "grad_norm": 0.02734375, + "learning_rate": 0.029988959906319022, + "loss": 0.8062, + "num_input_tokens_seen": 8648712, + "step": 14905 + }, + { + "epoch": 2.220732797140304, + "grad_norm": 0.0159912109375, + "learning_rate": 0.029988884991501524, + "loss": 0.7973, + "num_input_tokens_seen": 8651976, + "step": 14910 + }, + { + "epoch": 2.221477509681263, + "grad_norm": 0.0279541015625, + "learning_rate": 0.029988809823462043, + "loss": 0.8041, + "num_input_tokens_seen": 8654952, + "step": 14915 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.0194091796875, + "learning_rate": 0.029988734402201858, + "loss": 0.8047, + "num_input_tokens_seen": 8658056, + "step": 14920 + }, + { + "epoch": 2.2229669347631815, + "grad_norm": 0.0322265625, + "learning_rate": 0.02998865872772224, + "loss": 0.8123, + "num_input_tokens_seen": 8661160, + "step": 14925 + }, + { + "epoch": 2.2237116473041407, + "grad_norm": 0.0233154296875, + "learning_rate": 0.029988582800024467, + "loss": 0.806, + "num_input_tokens_seen": 8664040, + "step": 14930 + }, + { + "epoch": 2.2244563598451, + "grad_norm": 0.025634765625, + "learning_rate": 0.029988506619109817, + "loss": 0.7965, + "num_input_tokens_seen": 8667016, + "step": 14935 + }, + { + "epoch": 2.225201072386059, + "grad_norm": 0.0213623046875, + "learning_rate": 0.029988430184979584, + "loss": 0.8079, + "num_input_tokens_seen": 8669928, + "step": 14940 + }, + { + "epoch": 2.2259457849270183, + "grad_norm": 0.0206298828125, + "learning_rate": 0.029988353497635057, + "loss": 0.8034, + "num_input_tokens_seen": 8673032, + "step": 14945 + }, + { + "epoch": 2.2266904974679775, + "grad_norm": 0.023193359375, + "learning_rate": 0.02998827655707753, + "loss": 0.8037, + "num_input_tokens_seen": 8675976, + "step": 14950 + }, + { + "epoch": 2.2274352100089367, + "grad_norm": 0.022216796875, + "learning_rate": 0.029988199363308307, + "loss": 0.7947, + "num_input_tokens_seen": 8678856, + "step": 14955 + }, + { + "epoch": 2.228179922549896, + "grad_norm": 0.0262451171875, + "learning_rate": 0.029988121916328686, + "loss": 0.8147, + "num_input_tokens_seen": 8681896, + "step": 14960 + }, + { + "epoch": 2.2289246350908547, + "grad_norm": 0.0294189453125, + "learning_rate": 0.029988044216139983, + "loss": 0.7926, + "num_input_tokens_seen": 8684968, + "step": 14965 + }, + { + "epoch": 2.2296693476318143, + "grad_norm": 0.022216796875, + "learning_rate": 0.0299879662627435, + "loss": 0.8117, + "num_input_tokens_seen": 8687912, + "step": 14970 + }, + { + "epoch": 2.230414060172773, + "grad_norm": 0.013916015625, + "learning_rate": 0.029987888056140565, + "loss": 0.793, + "num_input_tokens_seen": 8691176, + "step": 14975 + }, + { + "epoch": 2.2311587727137323, + "grad_norm": 0.01312255859375, + "learning_rate": 0.02998780959633249, + "loss": 0.8088, + "num_input_tokens_seen": 8694248, + "step": 14980 + }, + { + "epoch": 2.2319034852546915, + "grad_norm": 0.0140380859375, + "learning_rate": 0.029987730883320605, + "loss": 0.7969, + "num_input_tokens_seen": 8697096, + "step": 14985 + }, + { + "epoch": 2.2326481977956507, + "grad_norm": 0.03369140625, + "learning_rate": 0.029987651917106246, + "loss": 0.8031, + "num_input_tokens_seen": 8699880, + "step": 14990 + }, + { + "epoch": 2.23339291033661, + "grad_norm": 0.0322265625, + "learning_rate": 0.029987572697690732, + "loss": 0.8099, + "num_input_tokens_seen": 8702728, + "step": 14995 + }, + { + "epoch": 2.234137622877569, + "grad_norm": 0.01251220703125, + "learning_rate": 0.029987493225075417, + "loss": 0.7809, + "num_input_tokens_seen": 8705608, + "step": 15000 + }, + { + "epoch": 2.2348823354185283, + "grad_norm": 0.0203857421875, + "learning_rate": 0.029987413499261633, + "loss": 0.7979, + "num_input_tokens_seen": 8708296, + "step": 15005 + }, + { + "epoch": 2.2356270479594875, + "grad_norm": 0.02099609375, + "learning_rate": 0.02998733352025073, + "loss": 0.7893, + "num_input_tokens_seen": 8711080, + "step": 15010 + }, + { + "epoch": 2.2363717605004467, + "grad_norm": 0.021484375, + "learning_rate": 0.02998725328804406, + "loss": 0.7744, + "num_input_tokens_seen": 8713896, + "step": 15015 + }, + { + "epoch": 2.237116473041406, + "grad_norm": 0.041259765625, + "learning_rate": 0.02998717280264298, + "loss": 0.8147, + "num_input_tokens_seen": 8717000, + "step": 15020 + }, + { + "epoch": 2.237861185582365, + "grad_norm": 0.034423828125, + "learning_rate": 0.029987092064048846, + "loss": 0.8107, + "num_input_tokens_seen": 8720264, + "step": 15025 + }, + { + "epoch": 2.2386058981233243, + "grad_norm": 0.021484375, + "learning_rate": 0.029987011072263023, + "loss": 0.8262, + "num_input_tokens_seen": 8723272, + "step": 15030 + }, + { + "epoch": 2.2393506106642835, + "grad_norm": 0.02734375, + "learning_rate": 0.02998692982728688, + "loss": 0.7972, + "num_input_tokens_seen": 8725960, + "step": 15035 + }, + { + "epoch": 2.2400953232052427, + "grad_norm": 0.020751953125, + "learning_rate": 0.02998684832912179, + "loss": 0.8108, + "num_input_tokens_seen": 8729064, + "step": 15040 + }, + { + "epoch": 2.240840035746202, + "grad_norm": 0.0223388671875, + "learning_rate": 0.029986766577769133, + "loss": 0.7994, + "num_input_tokens_seen": 8732040, + "step": 15045 + }, + { + "epoch": 2.241584748287161, + "grad_norm": 0.0291748046875, + "learning_rate": 0.029986684573230282, + "loss": 0.8237, + "num_input_tokens_seen": 8735080, + "step": 15050 + }, + { + "epoch": 2.2423294608281203, + "grad_norm": 0.03515625, + "learning_rate": 0.029986602315506632, + "loss": 0.8013, + "num_input_tokens_seen": 8737800, + "step": 15055 + }, + { + "epoch": 2.2430741733690795, + "grad_norm": 0.0201416015625, + "learning_rate": 0.029986519804599564, + "loss": 0.7889, + "num_input_tokens_seen": 8740648, + "step": 15060 + }, + { + "epoch": 2.2438188859100388, + "grad_norm": 0.012939453125, + "learning_rate": 0.029986437040510475, + "loss": 0.8082, + "num_input_tokens_seen": 8743560, + "step": 15065 + }, + { + "epoch": 2.244563598450998, + "grad_norm": 0.0205078125, + "learning_rate": 0.029986354023240763, + "loss": 0.8038, + "num_input_tokens_seen": 8746824, + "step": 15070 + }, + { + "epoch": 2.245308310991957, + "grad_norm": 0.021240234375, + "learning_rate": 0.029986270752791835, + "loss": 0.8213, + "num_input_tokens_seen": 8749672, + "step": 15075 + }, + { + "epoch": 2.2460530235329164, + "grad_norm": 0.02392578125, + "learning_rate": 0.029986187229165087, + "loss": 0.8019, + "num_input_tokens_seen": 8752392, + "step": 15080 + }, + { + "epoch": 2.2467977360738756, + "grad_norm": 0.021240234375, + "learning_rate": 0.029986103452361943, + "loss": 0.784, + "num_input_tokens_seen": 8755080, + "step": 15085 + }, + { + "epoch": 2.2475424486148348, + "grad_norm": 0.0284423828125, + "learning_rate": 0.02998601942238381, + "loss": 0.8162, + "num_input_tokens_seen": 8758152, + "step": 15090 + }, + { + "epoch": 2.248287161155794, + "grad_norm": 0.021484375, + "learning_rate": 0.029985935139232114, + "loss": 0.7955, + "num_input_tokens_seen": 8761160, + "step": 15095 + }, + { + "epoch": 2.249031873696753, + "grad_norm": 0.022216796875, + "learning_rate": 0.02998585060290827, + "loss": 0.7869, + "num_input_tokens_seen": 8764104, + "step": 15100 + }, + { + "epoch": 2.2497765862377124, + "grad_norm": 0.029541015625, + "learning_rate": 0.029985765813413712, + "loss": 0.8175, + "num_input_tokens_seen": 8766952, + "step": 15105 + }, + { + "epoch": 2.2505212987786716, + "grad_norm": 0.0233154296875, + "learning_rate": 0.029985680770749874, + "loss": 0.8162, + "num_input_tokens_seen": 8769800, + "step": 15110 + }, + { + "epoch": 2.2512660113196308, + "grad_norm": 0.032958984375, + "learning_rate": 0.02998559547491819, + "loss": 0.7814, + "num_input_tokens_seen": 8773224, + "step": 15115 + }, + { + "epoch": 2.25201072386059, + "grad_norm": 0.02197265625, + "learning_rate": 0.029985509925920098, + "loss": 0.813, + "num_input_tokens_seen": 8775848, + "step": 15120 + }, + { + "epoch": 2.252755436401549, + "grad_norm": 0.02783203125, + "learning_rate": 0.02998542412375705, + "loss": 0.8005, + "num_input_tokens_seen": 8778952, + "step": 15125 + }, + { + "epoch": 2.2535001489425084, + "grad_norm": 0.0279541015625, + "learning_rate": 0.02998533806843049, + "loss": 0.8161, + "num_input_tokens_seen": 8782280, + "step": 15130 + }, + { + "epoch": 2.2542448614834676, + "grad_norm": 0.02587890625, + "learning_rate": 0.029985251759941874, + "loss": 0.7925, + "num_input_tokens_seen": 8785416, + "step": 15135 + }, + { + "epoch": 2.2549895740244263, + "grad_norm": 0.024658203125, + "learning_rate": 0.029985165198292658, + "loss": 0.7955, + "num_input_tokens_seen": 8788328, + "step": 15140 + }, + { + "epoch": 2.255734286565386, + "grad_norm": 0.029052734375, + "learning_rate": 0.029985078383484312, + "loss": 0.8046, + "num_input_tokens_seen": 8791240, + "step": 15145 + }, + { + "epoch": 2.2564789991063448, + "grad_norm": 0.034423828125, + "learning_rate": 0.029984991315518288, + "loss": 0.8018, + "num_input_tokens_seen": 8794344, + "step": 15150 + }, + { + "epoch": 2.257223711647304, + "grad_norm": 0.0277099609375, + "learning_rate": 0.029984903994396073, + "loss": 0.8017, + "num_input_tokens_seen": 8797544, + "step": 15155 + }, + { + "epoch": 2.257968424188263, + "grad_norm": 0.033203125, + "learning_rate": 0.02998481642011913, + "loss": 0.8123, + "num_input_tokens_seen": 8800680, + "step": 15160 + }, + { + "epoch": 2.2587131367292224, + "grad_norm": 0.032958984375, + "learning_rate": 0.02998472859268895, + "loss": 0.793, + "num_input_tokens_seen": 8803592, + "step": 15165 + }, + { + "epoch": 2.2594578492701816, + "grad_norm": 0.016845703125, + "learning_rate": 0.029984640512107003, + "loss": 0.7852, + "num_input_tokens_seen": 8806472, + "step": 15170 + }, + { + "epoch": 2.2602025618111408, + "grad_norm": 0.03955078125, + "learning_rate": 0.02998455217837479, + "loss": 0.8221, + "num_input_tokens_seen": 8809224, + "step": 15175 + }, + { + "epoch": 2.2609472743521, + "grad_norm": 0.024169921875, + "learning_rate": 0.029984463591493794, + "loss": 0.7951, + "num_input_tokens_seen": 8811976, + "step": 15180 + }, + { + "epoch": 2.261691986893059, + "grad_norm": 0.035400390625, + "learning_rate": 0.029984374751465512, + "loss": 0.7923, + "num_input_tokens_seen": 8814824, + "step": 15185 + }, + { + "epoch": 2.2624366994340184, + "grad_norm": 0.017822265625, + "learning_rate": 0.029984285658291455, + "loss": 0.8072, + "num_input_tokens_seen": 8817832, + "step": 15190 + }, + { + "epoch": 2.2631814119749776, + "grad_norm": 0.0277099609375, + "learning_rate": 0.02998419631197312, + "loss": 0.815, + "num_input_tokens_seen": 8820968, + "step": 15195 + }, + { + "epoch": 2.2639261245159368, + "grad_norm": 0.05029296875, + "learning_rate": 0.029984106712512015, + "loss": 0.8062, + "num_input_tokens_seen": 8823656, + "step": 15200 + }, + { + "epoch": 2.264670837056896, + "grad_norm": 0.0380859375, + "learning_rate": 0.029984016859909656, + "loss": 0.8009, + "num_input_tokens_seen": 8826408, + "step": 15205 + }, + { + "epoch": 2.265415549597855, + "grad_norm": 0.028564453125, + "learning_rate": 0.029983926754167563, + "loss": 0.8023, + "num_input_tokens_seen": 8829224, + "step": 15210 + }, + { + "epoch": 2.2661602621388144, + "grad_norm": 0.03759765625, + "learning_rate": 0.029983836395287258, + "loss": 0.8084, + "num_input_tokens_seen": 8831976, + "step": 15215 + }, + { + "epoch": 2.2669049746797736, + "grad_norm": 0.035888671875, + "learning_rate": 0.02998374578327026, + "loss": 0.8124, + "num_input_tokens_seen": 8835048, + "step": 15220 + }, + { + "epoch": 2.267649687220733, + "grad_norm": 0.0255126953125, + "learning_rate": 0.02998365491811811, + "loss": 0.8114, + "num_input_tokens_seen": 8838024, + "step": 15225 + }, + { + "epoch": 2.268394399761692, + "grad_norm": 0.041259765625, + "learning_rate": 0.02998356379983234, + "loss": 0.8131, + "num_input_tokens_seen": 8841032, + "step": 15230 + }, + { + "epoch": 2.269139112302651, + "grad_norm": 0.0240478515625, + "learning_rate": 0.029983472428414485, + "loss": 0.8178, + "num_input_tokens_seen": 8844008, + "step": 15235 + }, + { + "epoch": 2.2698838248436104, + "grad_norm": 0.023681640625, + "learning_rate": 0.029983380803866096, + "loss": 0.8031, + "num_input_tokens_seen": 8846856, + "step": 15240 + }, + { + "epoch": 2.2706285373845696, + "grad_norm": 0.022705078125, + "learning_rate": 0.029983288926188716, + "loss": 0.7956, + "num_input_tokens_seen": 8849512, + "step": 15245 + }, + { + "epoch": 2.271373249925529, + "grad_norm": 0.026123046875, + "learning_rate": 0.029983196795383894, + "loss": 0.7844, + "num_input_tokens_seen": 8852552, + "step": 15250 + }, + { + "epoch": 2.272117962466488, + "grad_norm": 0.01470947265625, + "learning_rate": 0.02998310441145319, + "loss": 0.808, + "num_input_tokens_seen": 8855720, + "step": 15255 + }, + { + "epoch": 2.272862675007447, + "grad_norm": 0.01531982421875, + "learning_rate": 0.02998301177439817, + "loss": 0.7908, + "num_input_tokens_seen": 8858536, + "step": 15260 + }, + { + "epoch": 2.2736073875484064, + "grad_norm": 0.03515625, + "learning_rate": 0.029982918884220392, + "loss": 0.7984, + "num_input_tokens_seen": 8861352, + "step": 15265 + }, + { + "epoch": 2.2743521000893656, + "grad_norm": 0.038330078125, + "learning_rate": 0.029982825740921426, + "loss": 0.8309, + "num_input_tokens_seen": 8864392, + "step": 15270 + }, + { + "epoch": 2.275096812630325, + "grad_norm": 0.0146484375, + "learning_rate": 0.029982732344502848, + "loss": 0.8007, + "num_input_tokens_seen": 8867368, + "step": 15275 + }, + { + "epoch": 2.275841525171284, + "grad_norm": 0.033203125, + "learning_rate": 0.029982638694966236, + "loss": 0.7891, + "num_input_tokens_seen": 8870184, + "step": 15280 + }, + { + "epoch": 2.276586237712243, + "grad_norm": 0.0240478515625, + "learning_rate": 0.02998254479231317, + "loss": 0.8041, + "num_input_tokens_seen": 8873160, + "step": 15285 + }, + { + "epoch": 2.2773309502532024, + "grad_norm": 0.032958984375, + "learning_rate": 0.029982450636545237, + "loss": 0.8124, + "num_input_tokens_seen": 8875880, + "step": 15290 + }, + { + "epoch": 2.2780756627941616, + "grad_norm": 0.039306640625, + "learning_rate": 0.029982356227664028, + "loss": 0.7997, + "num_input_tokens_seen": 8878952, + "step": 15295 + }, + { + "epoch": 2.278820375335121, + "grad_norm": 0.047607421875, + "learning_rate": 0.029982261565671138, + "loss": 0.8738, + "num_input_tokens_seen": 8881672, + "step": 15300 + }, + { + "epoch": 2.2795650878760796, + "grad_norm": 0.029296875, + "learning_rate": 0.029982166650568166, + "loss": 0.7914, + "num_input_tokens_seen": 8884936, + "step": 15305 + }, + { + "epoch": 2.2803098004170392, + "grad_norm": 0.0216064453125, + "learning_rate": 0.029982071482356716, + "loss": 0.813, + "num_input_tokens_seen": 8887720, + "step": 15310 + }, + { + "epoch": 2.281054512957998, + "grad_norm": 0.0224609375, + "learning_rate": 0.029981976061038394, + "loss": 0.7876, + "num_input_tokens_seen": 8891240, + "step": 15315 + }, + { + "epoch": 2.2817992254989576, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02998188038661482, + "loss": 0.7998, + "num_input_tokens_seen": 8894120, + "step": 15320 + }, + { + "epoch": 2.2825439380399164, + "grad_norm": 0.0267333984375, + "learning_rate": 0.029981784459087595, + "loss": 0.8186, + "num_input_tokens_seen": 8897032, + "step": 15325 + }, + { + "epoch": 2.2832886505808756, + "grad_norm": 0.021484375, + "learning_rate": 0.029981688278458353, + "loss": 0.8039, + "num_input_tokens_seen": 8899688, + "step": 15330 + }, + { + "epoch": 2.284033363121835, + "grad_norm": 0.0213623046875, + "learning_rate": 0.029981591844728712, + "loss": 0.7997, + "num_input_tokens_seen": 8902504, + "step": 15335 + }, + { + "epoch": 2.284778075662794, + "grad_norm": 0.01324462890625, + "learning_rate": 0.029981495157900304, + "loss": 0.8022, + "num_input_tokens_seen": 8905224, + "step": 15340 + }, + { + "epoch": 2.285522788203753, + "grad_norm": 0.021728515625, + "learning_rate": 0.029981398217974763, + "loss": 0.7909, + "num_input_tokens_seen": 8908040, + "step": 15345 + }, + { + "epoch": 2.2862675007447124, + "grad_norm": 0.01361083984375, + "learning_rate": 0.029981301024953725, + "loss": 0.8139, + "num_input_tokens_seen": 8911208, + "step": 15350 + }, + { + "epoch": 2.2870122132856716, + "grad_norm": 0.021728515625, + "learning_rate": 0.029981203578838836, + "loss": 0.791, + "num_input_tokens_seen": 8914472, + "step": 15355 + }, + { + "epoch": 2.287756925826631, + "grad_norm": 0.02001953125, + "learning_rate": 0.02998110587963173, + "loss": 0.7975, + "num_input_tokens_seen": 8917160, + "step": 15360 + }, + { + "epoch": 2.28850163836759, + "grad_norm": 0.0208740234375, + "learning_rate": 0.02998100792733407, + "loss": 0.79, + "num_input_tokens_seen": 8920008, + "step": 15365 + }, + { + "epoch": 2.289246350908549, + "grad_norm": 0.023681640625, + "learning_rate": 0.029980909721947512, + "loss": 0.8059, + "num_input_tokens_seen": 8922888, + "step": 15370 + }, + { + "epoch": 2.2899910634495084, + "grad_norm": 0.03564453125, + "learning_rate": 0.029980811263473708, + "loss": 0.795, + "num_input_tokens_seen": 8925576, + "step": 15375 + }, + { + "epoch": 2.2907357759904676, + "grad_norm": 0.01446533203125, + "learning_rate": 0.02998071255191432, + "loss": 0.8138, + "num_input_tokens_seen": 8928328, + "step": 15380 + }, + { + "epoch": 2.291480488531427, + "grad_norm": 0.01416015625, + "learning_rate": 0.02998061358727102, + "loss": 0.8408, + "num_input_tokens_seen": 8931208, + "step": 15385 + }, + { + "epoch": 2.292225201072386, + "grad_norm": 0.032470703125, + "learning_rate": 0.02998051436954548, + "loss": 0.7943, + "num_input_tokens_seen": 8933896, + "step": 15390 + }, + { + "epoch": 2.2929699136133452, + "grad_norm": 0.02099609375, + "learning_rate": 0.02998041489873938, + "loss": 0.8062, + "num_input_tokens_seen": 8936712, + "step": 15395 + }, + { + "epoch": 2.2937146261543044, + "grad_norm": 0.0205078125, + "learning_rate": 0.02998031517485439, + "loss": 0.7924, + "num_input_tokens_seen": 8939304, + "step": 15400 + }, + { + "epoch": 2.2944593386952636, + "grad_norm": 0.0206298828125, + "learning_rate": 0.0299802151978922, + "loss": 0.7938, + "num_input_tokens_seen": 8942120, + "step": 15405 + }, + { + "epoch": 2.295204051236223, + "grad_norm": 0.051025390625, + "learning_rate": 0.0299801149678545, + "loss": 0.7994, + "num_input_tokens_seen": 8945192, + "step": 15410 + }, + { + "epoch": 2.295948763777182, + "grad_norm": 0.01483154296875, + "learning_rate": 0.029980014484742985, + "loss": 0.7895, + "num_input_tokens_seen": 8948392, + "step": 15415 + }, + { + "epoch": 2.2966934763181412, + "grad_norm": 0.0133056640625, + "learning_rate": 0.02997991374855935, + "loss": 0.7897, + "num_input_tokens_seen": 8951016, + "step": 15420 + }, + { + "epoch": 2.2974381888591004, + "grad_norm": 0.02685546875, + "learning_rate": 0.029979812759305294, + "loss": 0.7943, + "num_input_tokens_seen": 8953800, + "step": 15425 + }, + { + "epoch": 2.2981829014000597, + "grad_norm": 0.0322265625, + "learning_rate": 0.02997971151698253, + "loss": 0.8188, + "num_input_tokens_seen": 8956808, + "step": 15430 + }, + { + "epoch": 2.298927613941019, + "grad_norm": 0.01611328125, + "learning_rate": 0.029979610021592763, + "loss": 0.8042, + "num_input_tokens_seen": 8959592, + "step": 15435 + }, + { + "epoch": 2.299672326481978, + "grad_norm": 0.020751953125, + "learning_rate": 0.02997950827313771, + "loss": 0.7745, + "num_input_tokens_seen": 8962504, + "step": 15440 + }, + { + "epoch": 2.3004170390229373, + "grad_norm": 0.02099609375, + "learning_rate": 0.02997940627161909, + "loss": 0.7954, + "num_input_tokens_seen": 8965416, + "step": 15445 + }, + { + "epoch": 2.3011617515638965, + "grad_norm": 0.02197265625, + "learning_rate": 0.029979304017038626, + "loss": 0.8053, + "num_input_tokens_seen": 8968264, + "step": 15450 + }, + { + "epoch": 2.3019064641048557, + "grad_norm": 0.0225830078125, + "learning_rate": 0.02997920150939804, + "loss": 0.8173, + "num_input_tokens_seen": 8970888, + "step": 15455 + }, + { + "epoch": 2.302651176645815, + "grad_norm": 0.01190185546875, + "learning_rate": 0.02997909874869907, + "loss": 0.8039, + "num_input_tokens_seen": 8973896, + "step": 15460 + }, + { + "epoch": 2.303395889186774, + "grad_norm": 0.022705078125, + "learning_rate": 0.029978995734943455, + "loss": 0.7893, + "num_input_tokens_seen": 8976712, + "step": 15465 + }, + { + "epoch": 2.3041406017277333, + "grad_norm": 0.02880859375, + "learning_rate": 0.02997889246813293, + "loss": 0.8037, + "num_input_tokens_seen": 8979784, + "step": 15470 + }, + { + "epoch": 2.3048853142686925, + "grad_norm": 0.0224609375, + "learning_rate": 0.029978788948269237, + "loss": 0.8078, + "num_input_tokens_seen": 8982760, + "step": 15475 + }, + { + "epoch": 2.3056300268096512, + "grad_norm": 0.03564453125, + "learning_rate": 0.02997868517535413, + "loss": 0.8155, + "num_input_tokens_seen": 8985448, + "step": 15480 + }, + { + "epoch": 2.306374739350611, + "grad_norm": 0.0322265625, + "learning_rate": 0.02997858114938936, + "loss": 0.7943, + "num_input_tokens_seen": 8988904, + "step": 15485 + }, + { + "epoch": 2.3071194518915696, + "grad_norm": 0.01226806640625, + "learning_rate": 0.029978476870376688, + "loss": 0.8295, + "num_input_tokens_seen": 8991368, + "step": 15490 + }, + { + "epoch": 2.3078641644325293, + "grad_norm": 0.02197265625, + "learning_rate": 0.029978372338317873, + "loss": 0.7978, + "num_input_tokens_seen": 8994152, + "step": 15495 + }, + { + "epoch": 2.308608876973488, + "grad_norm": 0.021484375, + "learning_rate": 0.02997826755321468, + "loss": 0.7956, + "num_input_tokens_seen": 8997000, + "step": 15500 + }, + { + "epoch": 2.3093535895144472, + "grad_norm": 0.02587890625, + "learning_rate": 0.029978162515068876, + "loss": 0.7963, + "num_input_tokens_seen": 8999912, + "step": 15505 + }, + { + "epoch": 2.3100983020554064, + "grad_norm": 0.01556396484375, + "learning_rate": 0.02997805722388224, + "loss": 0.807, + "num_input_tokens_seen": 9002760, + "step": 15510 + }, + { + "epoch": 2.3108430145963657, + "grad_norm": 0.02099609375, + "learning_rate": 0.029977951679656556, + "loss": 0.806, + "num_input_tokens_seen": 9005672, + "step": 15515 + }, + { + "epoch": 2.311587727137325, + "grad_norm": 0.021240234375, + "learning_rate": 0.029977845882393595, + "loss": 0.7861, + "num_input_tokens_seen": 9008648, + "step": 15520 + }, + { + "epoch": 2.312332439678284, + "grad_norm": 0.0201416015625, + "learning_rate": 0.029977739832095153, + "loss": 0.7888, + "num_input_tokens_seen": 9011464, + "step": 15525 + }, + { + "epoch": 2.3130771522192433, + "grad_norm": 0.01373291015625, + "learning_rate": 0.02997763352876302, + "loss": 0.807, + "num_input_tokens_seen": 9014280, + "step": 15530 + }, + { + "epoch": 2.3138218647602025, + "grad_norm": 0.0291748046875, + "learning_rate": 0.02997752697239899, + "loss": 0.7897, + "num_input_tokens_seen": 9017032, + "step": 15535 + }, + { + "epoch": 2.3145665773011617, + "grad_norm": 0.0211181640625, + "learning_rate": 0.029977420163004864, + "loss": 0.7949, + "num_input_tokens_seen": 9020072, + "step": 15540 + }, + { + "epoch": 2.315311289842121, + "grad_norm": 0.021484375, + "learning_rate": 0.02997731310058245, + "loss": 0.8038, + "num_input_tokens_seen": 9022856, + "step": 15545 + }, + { + "epoch": 2.31605600238308, + "grad_norm": 0.029052734375, + "learning_rate": 0.02997720578513355, + "loss": 0.814, + "num_input_tokens_seen": 9025736, + "step": 15550 + }, + { + "epoch": 2.3168007149240393, + "grad_norm": 0.025390625, + "learning_rate": 0.02997709821665998, + "loss": 0.7938, + "num_input_tokens_seen": 9029000, + "step": 15555 + }, + { + "epoch": 2.3175454274649985, + "grad_norm": 0.02197265625, + "learning_rate": 0.02997699039516356, + "loss": 0.7751, + "num_input_tokens_seen": 9031816, + "step": 15560 + }, + { + "epoch": 2.3182901400059577, + "grad_norm": 0.0299072265625, + "learning_rate": 0.029976882320646107, + "loss": 0.7904, + "num_input_tokens_seen": 9034792, + "step": 15565 + }, + { + "epoch": 2.319034852546917, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02997677399310945, + "loss": 0.7792, + "num_input_tokens_seen": 9037608, + "step": 15570 + }, + { + "epoch": 2.319779565087876, + "grad_norm": 0.022216796875, + "learning_rate": 0.029976665412555416, + "loss": 0.7795, + "num_input_tokens_seen": 9040392, + "step": 15575 + }, + { + "epoch": 2.3205242776288353, + "grad_norm": 0.0262451171875, + "learning_rate": 0.029976556578985847, + "loss": 0.814, + "num_input_tokens_seen": 9043272, + "step": 15580 + }, + { + "epoch": 2.3212689901697945, + "grad_norm": 0.022705078125, + "learning_rate": 0.02997644749240257, + "loss": 0.8141, + "num_input_tokens_seen": 9046376, + "step": 15585 + }, + { + "epoch": 2.3220137027107537, + "grad_norm": 0.044921875, + "learning_rate": 0.02997633815280744, + "loss": 0.8095, + "num_input_tokens_seen": 9049128, + "step": 15590 + }, + { + "epoch": 2.322758415251713, + "grad_norm": 0.0291748046875, + "learning_rate": 0.02997622856020229, + "loss": 0.7988, + "num_input_tokens_seen": 9051944, + "step": 15595 + }, + { + "epoch": 2.323503127792672, + "grad_norm": 0.0225830078125, + "learning_rate": 0.029976118714588988, + "loss": 0.8362, + "num_input_tokens_seen": 9054600, + "step": 15600 + }, + { + "epoch": 2.3242478403336313, + "grad_norm": 0.020263671875, + "learning_rate": 0.029976008615969378, + "loss": 0.7863, + "num_input_tokens_seen": 9057352, + "step": 15605 + }, + { + "epoch": 2.3249925528745905, + "grad_norm": 0.0228271484375, + "learning_rate": 0.02997589826434532, + "loss": 0.791, + "num_input_tokens_seen": 9060040, + "step": 15610 + }, + { + "epoch": 2.3257372654155497, + "grad_norm": 0.023193359375, + "learning_rate": 0.02997578765971869, + "loss": 0.7943, + "num_input_tokens_seen": 9062984, + "step": 15615 + }, + { + "epoch": 2.326481977956509, + "grad_norm": 0.033447265625, + "learning_rate": 0.029975676802091338, + "loss": 0.8169, + "num_input_tokens_seen": 9065928, + "step": 15620 + }, + { + "epoch": 2.327226690497468, + "grad_norm": 0.02099609375, + "learning_rate": 0.029975565691465155, + "loss": 0.7992, + "num_input_tokens_seen": 9068872, + "step": 15625 + }, + { + "epoch": 2.3279714030384273, + "grad_norm": 0.035888671875, + "learning_rate": 0.02997545432784201, + "loss": 0.8184, + "num_input_tokens_seen": 9071432, + "step": 15630 + }, + { + "epoch": 2.3287161155793865, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02997534271122378, + "loss": 0.7791, + "num_input_tokens_seen": 9074312, + "step": 15635 + }, + { + "epoch": 2.3294608281203457, + "grad_norm": 0.0208740234375, + "learning_rate": 0.029975230841612358, + "loss": 0.805, + "num_input_tokens_seen": 9077128, + "step": 15640 + }, + { + "epoch": 2.330205540661305, + "grad_norm": 0.0242919921875, + "learning_rate": 0.029975118719009628, + "loss": 0.7913, + "num_input_tokens_seen": 9080008, + "step": 15645 + }, + { + "epoch": 2.330950253202264, + "grad_norm": 0.031005859375, + "learning_rate": 0.02997500634341749, + "loss": 0.8157, + "num_input_tokens_seen": 9083016, + "step": 15650 + }, + { + "epoch": 2.331694965743223, + "grad_norm": 0.028564453125, + "learning_rate": 0.029974893714837837, + "loss": 0.8097, + "num_input_tokens_seen": 9085672, + "step": 15655 + }, + { + "epoch": 2.3324396782841825, + "grad_norm": 0.01446533203125, + "learning_rate": 0.02997478083327258, + "loss": 0.8182, + "num_input_tokens_seen": 9088392, + "step": 15660 + }, + { + "epoch": 2.3331843908251413, + "grad_norm": 0.028076171875, + "learning_rate": 0.029974667698723617, + "loss": 0.7827, + "num_input_tokens_seen": 9091368, + "step": 15665 + }, + { + "epoch": 2.333929103366101, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02997455431119286, + "loss": 0.8026, + "num_input_tokens_seen": 9094216, + "step": 15670 + }, + { + "epoch": 2.3346738159070597, + "grad_norm": 0.029541015625, + "learning_rate": 0.029974440670682233, + "loss": 0.837, + "num_input_tokens_seen": 9096968, + "step": 15675 + }, + { + "epoch": 2.335418528448019, + "grad_norm": 0.034423828125, + "learning_rate": 0.029974326777193645, + "loss": 0.8036, + "num_input_tokens_seen": 9099976, + "step": 15680 + }, + { + "epoch": 2.336163240988978, + "grad_norm": 0.0146484375, + "learning_rate": 0.029974212630729028, + "loss": 0.7864, + "num_input_tokens_seen": 9102856, + "step": 15685 + }, + { + "epoch": 2.3369079535299373, + "grad_norm": 0.031982421875, + "learning_rate": 0.02997409823129031, + "loss": 0.8024, + "num_input_tokens_seen": 9106088, + "step": 15690 + }, + { + "epoch": 2.3376526660708965, + "grad_norm": 0.0272216796875, + "learning_rate": 0.02997398357887942, + "loss": 0.8104, + "num_input_tokens_seen": 9109096, + "step": 15695 + }, + { + "epoch": 2.3383973786118557, + "grad_norm": 0.0233154296875, + "learning_rate": 0.029973868673498296, + "loss": 0.8084, + "num_input_tokens_seen": 9111848, + "step": 15700 + }, + { + "epoch": 2.339142091152815, + "grad_norm": 0.0341796875, + "learning_rate": 0.029973753515148874, + "loss": 0.8073, + "num_input_tokens_seen": 9114504, + "step": 15705 + }, + { + "epoch": 2.339886803693774, + "grad_norm": 0.01422119140625, + "learning_rate": 0.029973638103833113, + "loss": 0.7975, + "num_input_tokens_seen": 9117256, + "step": 15710 + }, + { + "epoch": 2.3406315162347333, + "grad_norm": 0.015380859375, + "learning_rate": 0.02997352243955295, + "loss": 0.8011, + "num_input_tokens_seen": 9120552, + "step": 15715 + }, + { + "epoch": 2.3413762287756925, + "grad_norm": 0.0211181640625, + "learning_rate": 0.029973406522310342, + "loss": 0.8182, + "num_input_tokens_seen": 9123144, + "step": 15720 + }, + { + "epoch": 2.3421209413166517, + "grad_norm": 0.021484375, + "learning_rate": 0.029973290352107256, + "loss": 0.804, + "num_input_tokens_seen": 9126120, + "step": 15725 + }, + { + "epoch": 2.342865653857611, + "grad_norm": 0.01446533203125, + "learning_rate": 0.029973173928945645, + "loss": 0.7936, + "num_input_tokens_seen": 9129064, + "step": 15730 + }, + { + "epoch": 2.34361036639857, + "grad_norm": 0.0216064453125, + "learning_rate": 0.029973057252827475, + "loss": 0.8118, + "num_input_tokens_seen": 9131880, + "step": 15735 + }, + { + "epoch": 2.3443550789395293, + "grad_norm": 0.0216064453125, + "learning_rate": 0.029972940323754724, + "loss": 0.7791, + "num_input_tokens_seen": 9134600, + "step": 15740 + }, + { + "epoch": 2.3450997914804885, + "grad_norm": 0.0289306640625, + "learning_rate": 0.029972823141729363, + "loss": 0.8124, + "num_input_tokens_seen": 9137448, + "step": 15745 + }, + { + "epoch": 2.3458445040214477, + "grad_norm": 0.0225830078125, + "learning_rate": 0.029972705706753373, + "loss": 0.797, + "num_input_tokens_seen": 9140072, + "step": 15750 + }, + { + "epoch": 2.346589216562407, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02997258801882874, + "loss": 0.7949, + "num_input_tokens_seen": 9142728, + "step": 15755 + }, + { + "epoch": 2.347333929103366, + "grad_norm": 0.0233154296875, + "learning_rate": 0.029972470077957447, + "loss": 0.8038, + "num_input_tokens_seen": 9145448, + "step": 15760 + }, + { + "epoch": 2.3480786416443253, + "grad_norm": 0.0233154296875, + "learning_rate": 0.029972351884141487, + "loss": 0.7996, + "num_input_tokens_seen": 9148232, + "step": 15765 + }, + { + "epoch": 2.3488233541852845, + "grad_norm": 0.033203125, + "learning_rate": 0.029972233437382867, + "loss": 0.7735, + "num_input_tokens_seen": 9151400, + "step": 15770 + }, + { + "epoch": 2.3495680667262437, + "grad_norm": 0.035888671875, + "learning_rate": 0.02997211473768357, + "loss": 0.802, + "num_input_tokens_seen": 9154440, + "step": 15775 + }, + { + "epoch": 2.350312779267203, + "grad_norm": 0.031982421875, + "learning_rate": 0.029971995785045617, + "loss": 0.8192, + "num_input_tokens_seen": 9157352, + "step": 15780 + }, + { + "epoch": 2.351057491808162, + "grad_norm": 0.045166015625, + "learning_rate": 0.02997187657947101, + "loss": 0.8064, + "num_input_tokens_seen": 9160200, + "step": 15785 + }, + { + "epoch": 2.3518022043491214, + "grad_norm": 0.0225830078125, + "learning_rate": 0.02997175712096177, + "loss": 0.8129, + "num_input_tokens_seen": 9163144, + "step": 15790 + }, + { + "epoch": 2.3525469168900806, + "grad_norm": 0.0257568359375, + "learning_rate": 0.029971637409519906, + "loss": 0.8197, + "num_input_tokens_seen": 9166344, + "step": 15795 + }, + { + "epoch": 2.3532916294310398, + "grad_norm": 0.0277099609375, + "learning_rate": 0.029971517445147446, + "loss": 0.8124, + "num_input_tokens_seen": 9169032, + "step": 15800 + }, + { + "epoch": 2.354036341971999, + "grad_norm": 0.0205078125, + "learning_rate": 0.029971397227846416, + "loss": 0.7713, + "num_input_tokens_seen": 9172104, + "step": 15805 + }, + { + "epoch": 2.354781054512958, + "grad_norm": 0.0206298828125, + "learning_rate": 0.029971276757618844, + "loss": 0.7907, + "num_input_tokens_seen": 9174920, + "step": 15810 + }, + { + "epoch": 2.3555257670539174, + "grad_norm": 0.0230712890625, + "learning_rate": 0.029971156034466765, + "loss": 0.8126, + "num_input_tokens_seen": 9177864, + "step": 15815 + }, + { + "epoch": 2.3562704795948766, + "grad_norm": 0.0208740234375, + "learning_rate": 0.029971035058392225, + "loss": 0.7967, + "num_input_tokens_seen": 9180552, + "step": 15820 + }, + { + "epoch": 2.3570151921358358, + "grad_norm": 0.0147705078125, + "learning_rate": 0.02997091382939726, + "loss": 0.8037, + "num_input_tokens_seen": 9183304, + "step": 15825 + }, + { + "epoch": 2.3577599046767945, + "grad_norm": 0.0279541015625, + "learning_rate": 0.029970792347483927, + "loss": 0.7863, + "num_input_tokens_seen": 9186088, + "step": 15830 + }, + { + "epoch": 2.358504617217754, + "grad_norm": 0.032958984375, + "learning_rate": 0.02997067061265427, + "loss": 0.8176, + "num_input_tokens_seen": 9188968, + "step": 15835 + }, + { + "epoch": 2.359249329758713, + "grad_norm": 0.021240234375, + "learning_rate": 0.029970548624910345, + "loss": 0.8024, + "num_input_tokens_seen": 9191592, + "step": 15840 + }, + { + "epoch": 2.359994042299672, + "grad_norm": 0.0322265625, + "learning_rate": 0.02997042638425422, + "loss": 0.7885, + "num_input_tokens_seen": 9194504, + "step": 15845 + }, + { + "epoch": 2.3607387548406313, + "grad_norm": 0.0341796875, + "learning_rate": 0.029970303890687958, + "loss": 0.8295, + "num_input_tokens_seen": 9197224, + "step": 15850 + }, + { + "epoch": 2.3614834673815905, + "grad_norm": 0.01287841796875, + "learning_rate": 0.029970181144213626, + "loss": 0.7795, + "num_input_tokens_seen": 9200264, + "step": 15855 + }, + { + "epoch": 2.3622281799225497, + "grad_norm": 0.02099609375, + "learning_rate": 0.029970058144833295, + "loss": 0.8081, + "num_input_tokens_seen": 9203112, + "step": 15860 + }, + { + "epoch": 2.362972892463509, + "grad_norm": 0.035888671875, + "learning_rate": 0.029969934892549052, + "loss": 0.8026, + "num_input_tokens_seen": 9205928, + "step": 15865 + }, + { + "epoch": 2.363717605004468, + "grad_norm": 0.0269775390625, + "learning_rate": 0.029969811387362967, + "loss": 0.8055, + "num_input_tokens_seen": 9208584, + "step": 15870 + }, + { + "epoch": 2.3644623175454274, + "grad_norm": 0.032958984375, + "learning_rate": 0.02996968762927714, + "loss": 0.7905, + "num_input_tokens_seen": 9211240, + "step": 15875 + }, + { + "epoch": 2.3652070300863866, + "grad_norm": 0.03515625, + "learning_rate": 0.029969563618293654, + "loss": 0.7961, + "num_input_tokens_seen": 9214568, + "step": 15880 + }, + { + "epoch": 2.3659517426273458, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0299694393544146, + "loss": 0.8024, + "num_input_tokens_seen": 9217384, + "step": 15885 + }, + { + "epoch": 2.366696455168305, + "grad_norm": 0.03125, + "learning_rate": 0.029969314837642085, + "loss": 0.8032, + "num_input_tokens_seen": 9220136, + "step": 15890 + }, + { + "epoch": 2.367441167709264, + "grad_norm": 0.013671875, + "learning_rate": 0.029969190067978213, + "loss": 0.8001, + "num_input_tokens_seen": 9222984, + "step": 15895 + }, + { + "epoch": 2.3681858802502234, + "grad_norm": 0.02099609375, + "learning_rate": 0.029969065045425083, + "loss": 0.7927, + "num_input_tokens_seen": 9225896, + "step": 15900 + }, + { + "epoch": 2.3689305927911826, + "grad_norm": 0.0216064453125, + "learning_rate": 0.029968939769984815, + "loss": 0.8017, + "num_input_tokens_seen": 9228776, + "step": 15905 + }, + { + "epoch": 2.3696753053321418, + "grad_norm": 0.034423828125, + "learning_rate": 0.029968814241659526, + "loss": 0.7734, + "num_input_tokens_seen": 9231720, + "step": 15910 + }, + { + "epoch": 2.370420017873101, + "grad_norm": 0.0262451171875, + "learning_rate": 0.029968688460451334, + "loss": 0.7981, + "num_input_tokens_seen": 9234600, + "step": 15915 + }, + { + "epoch": 2.37116473041406, + "grad_norm": 0.032470703125, + "learning_rate": 0.029968562426362363, + "loss": 0.8094, + "num_input_tokens_seen": 9237288, + "step": 15920 + }, + { + "epoch": 2.3719094429550194, + "grad_norm": 0.0294189453125, + "learning_rate": 0.029968436139394743, + "loss": 0.7745, + "num_input_tokens_seen": 9240104, + "step": 15925 + }, + { + "epoch": 2.3726541554959786, + "grad_norm": 0.060791015625, + "learning_rate": 0.029968309599550606, + "loss": 0.8383, + "num_input_tokens_seen": 9242888, + "step": 15930 + }, + { + "epoch": 2.373398868036938, + "grad_norm": 0.02197265625, + "learning_rate": 0.029968182806832094, + "loss": 0.7948, + "num_input_tokens_seen": 9246024, + "step": 15935 + }, + { + "epoch": 2.374143580577897, + "grad_norm": 0.0286865234375, + "learning_rate": 0.029968055761241345, + "loss": 0.8159, + "num_input_tokens_seen": 9248936, + "step": 15940 + }, + { + "epoch": 2.374888293118856, + "grad_norm": 0.0299072265625, + "learning_rate": 0.029967928462780506, + "loss": 0.7799, + "num_input_tokens_seen": 9251912, + "step": 15945 + }, + { + "epoch": 2.3756330056598154, + "grad_norm": 0.033935546875, + "learning_rate": 0.02996780091145173, + "loss": 0.8146, + "num_input_tokens_seen": 9254600, + "step": 15950 + }, + { + "epoch": 2.3763777182007746, + "grad_norm": 0.0166015625, + "learning_rate": 0.02996767310725717, + "loss": 0.7904, + "num_input_tokens_seen": 9257864, + "step": 15955 + }, + { + "epoch": 2.377122430741734, + "grad_norm": 0.03125, + "learning_rate": 0.029967545050198982, + "loss": 0.7856, + "num_input_tokens_seen": 9260680, + "step": 15960 + }, + { + "epoch": 2.377867143282693, + "grad_norm": 0.0238037109375, + "learning_rate": 0.029967416740279335, + "loss": 0.801, + "num_input_tokens_seen": 9263432, + "step": 15965 + }, + { + "epoch": 2.378611855823652, + "grad_norm": 0.024169921875, + "learning_rate": 0.029967288177500398, + "loss": 0.816, + "num_input_tokens_seen": 9266056, + "step": 15970 + }, + { + "epoch": 2.3793565683646114, + "grad_norm": 0.02392578125, + "learning_rate": 0.029967159361864334, + "loss": 0.8293, + "num_input_tokens_seen": 9269224, + "step": 15975 + }, + { + "epoch": 2.3801012809055706, + "grad_norm": 0.0130615234375, + "learning_rate": 0.029967030293373324, + "loss": 0.8205, + "num_input_tokens_seen": 9272136, + "step": 15980 + }, + { + "epoch": 2.38084599344653, + "grad_norm": 0.0262451171875, + "learning_rate": 0.029966900972029555, + "loss": 0.7811, + "num_input_tokens_seen": 9275048, + "step": 15985 + }, + { + "epoch": 2.381590705987489, + "grad_norm": 0.03955078125, + "learning_rate": 0.029966771397835205, + "loss": 0.8169, + "num_input_tokens_seen": 9277768, + "step": 15990 + }, + { + "epoch": 2.382335418528448, + "grad_norm": 0.03125, + "learning_rate": 0.029966641570792457, + "loss": 0.8008, + "num_input_tokens_seen": 9280584, + "step": 15995 + }, + { + "epoch": 2.3830801310694074, + "grad_norm": 0.023681640625, + "learning_rate": 0.029966511490903517, + "loss": 0.8137, + "num_input_tokens_seen": 9283624, + "step": 16000 + }, + { + "epoch": 2.383824843610366, + "grad_norm": 0.0211181640625, + "learning_rate": 0.029966381158170574, + "loss": 0.8003, + "num_input_tokens_seen": 9286472, + "step": 16005 + }, + { + "epoch": 2.384569556151326, + "grad_norm": 0.0198974609375, + "learning_rate": 0.029966250572595835, + "loss": 0.8027, + "num_input_tokens_seen": 9289416, + "step": 16010 + }, + { + "epoch": 2.3853142686922846, + "grad_norm": 0.047119140625, + "learning_rate": 0.029966119734181503, + "loss": 0.8253, + "num_input_tokens_seen": 9292488, + "step": 16015 + }, + { + "epoch": 2.386058981233244, + "grad_norm": 0.014404296875, + "learning_rate": 0.029965988642929785, + "loss": 0.8047, + "num_input_tokens_seen": 9295528, + "step": 16020 + }, + { + "epoch": 2.386803693774203, + "grad_norm": 0.0242919921875, + "learning_rate": 0.029965857298842904, + "loss": 0.8049, + "num_input_tokens_seen": 9298664, + "step": 16025 + }, + { + "epoch": 2.387548406315162, + "grad_norm": 0.0240478515625, + "learning_rate": 0.029965725701923074, + "loss": 0.7893, + "num_input_tokens_seen": 9301352, + "step": 16030 + }, + { + "epoch": 2.3882931188561214, + "grad_norm": 0.033203125, + "learning_rate": 0.02996559385217252, + "loss": 0.8006, + "num_input_tokens_seen": 9304232, + "step": 16035 + }, + { + "epoch": 2.3890378313970806, + "grad_norm": 0.0277099609375, + "learning_rate": 0.029965461749593462, + "loss": 0.7887, + "num_input_tokens_seen": 9307176, + "step": 16040 + }, + { + "epoch": 2.38978254393804, + "grad_norm": 0.037109375, + "learning_rate": 0.029965329394188146, + "loss": 0.8017, + "num_input_tokens_seen": 9309960, + "step": 16045 + }, + { + "epoch": 2.390527256478999, + "grad_norm": 0.02685546875, + "learning_rate": 0.029965196785958794, + "loss": 0.7937, + "num_input_tokens_seen": 9312968, + "step": 16050 + }, + { + "epoch": 2.391271969019958, + "grad_norm": 0.0250244140625, + "learning_rate": 0.029965063924907658, + "loss": 0.8209, + "num_input_tokens_seen": 9315912, + "step": 16055 + }, + { + "epoch": 2.3920166815609174, + "grad_norm": 0.01422119140625, + "learning_rate": 0.02996493081103697, + "loss": 0.7917, + "num_input_tokens_seen": 9318984, + "step": 16060 + }, + { + "epoch": 2.3927613941018766, + "grad_norm": 0.0299072265625, + "learning_rate": 0.02996479744434899, + "loss": 0.7942, + "num_input_tokens_seen": 9321832, + "step": 16065 + }, + { + "epoch": 2.393506106642836, + "grad_norm": 0.032470703125, + "learning_rate": 0.02996466382484596, + "loss": 0.807, + "num_input_tokens_seen": 9324968, + "step": 16070 + }, + { + "epoch": 2.394250819183795, + "grad_norm": 0.013916015625, + "learning_rate": 0.029964529952530156, + "loss": 0.8129, + "num_input_tokens_seen": 9327816, + "step": 16075 + }, + { + "epoch": 2.394995531724754, + "grad_norm": 0.0458984375, + "learning_rate": 0.029964395827403817, + "loss": 0.8087, + "num_input_tokens_seen": 9330472, + "step": 16080 + }, + { + "epoch": 2.3957402442657134, + "grad_norm": 0.0238037109375, + "learning_rate": 0.029964261449469225, + "loss": 0.8, + "num_input_tokens_seen": 9333544, + "step": 16085 + }, + { + "epoch": 2.3964849568066726, + "grad_norm": 0.03271484375, + "learning_rate": 0.029964126818728645, + "loss": 0.7858, + "num_input_tokens_seen": 9336424, + "step": 16090 + }, + { + "epoch": 2.397229669347632, + "grad_norm": 0.01806640625, + "learning_rate": 0.02996399193518435, + "loss": 0.8202, + "num_input_tokens_seen": 9339464, + "step": 16095 + }, + { + "epoch": 2.397974381888591, + "grad_norm": 0.041015625, + "learning_rate": 0.02996385679883862, + "loss": 0.8134, + "num_input_tokens_seen": 9342344, + "step": 16100 + }, + { + "epoch": 2.3987190944295502, + "grad_norm": 0.019775390625, + "learning_rate": 0.02996372140969374, + "loss": 0.7902, + "num_input_tokens_seen": 9345160, + "step": 16105 + }, + { + "epoch": 2.3994638069705094, + "grad_norm": 0.029541015625, + "learning_rate": 0.02996358576775199, + "loss": 0.8157, + "num_input_tokens_seen": 9348200, + "step": 16110 + }, + { + "epoch": 2.4002085195114686, + "grad_norm": 0.023193359375, + "learning_rate": 0.02996344987301567, + "loss": 0.8097, + "num_input_tokens_seen": 9350824, + "step": 16115 + }, + { + "epoch": 2.400953232052428, + "grad_norm": 0.02685546875, + "learning_rate": 0.02996331372548707, + "loss": 0.811, + "num_input_tokens_seen": 9353832, + "step": 16120 + }, + { + "epoch": 2.401697944593387, + "grad_norm": 0.0302734375, + "learning_rate": 0.029963177325168497, + "loss": 0.8196, + "num_input_tokens_seen": 9356456, + "step": 16125 + }, + { + "epoch": 2.4024426571343462, + "grad_norm": 0.01416015625, + "learning_rate": 0.02996304067206225, + "loss": 0.7958, + "num_input_tokens_seen": 9359464, + "step": 16130 + }, + { + "epoch": 2.4031873696753054, + "grad_norm": 0.0230712890625, + "learning_rate": 0.029962903766170636, + "loss": 0.8043, + "num_input_tokens_seen": 9362600, + "step": 16135 + }, + { + "epoch": 2.4039320822162646, + "grad_norm": 0.03076171875, + "learning_rate": 0.029962766607495972, + "loss": 0.8028, + "num_input_tokens_seen": 9365352, + "step": 16140 + }, + { + "epoch": 2.404676794757224, + "grad_norm": 0.0211181640625, + "learning_rate": 0.029962629196040577, + "loss": 0.7999, + "num_input_tokens_seen": 9368104, + "step": 16145 + }, + { + "epoch": 2.405421507298183, + "grad_norm": 0.0220947265625, + "learning_rate": 0.029962491531806766, + "loss": 0.803, + "num_input_tokens_seen": 9370952, + "step": 16150 + }, + { + "epoch": 2.4061662198391423, + "grad_norm": 0.0225830078125, + "learning_rate": 0.029962353614796867, + "loss": 0.7964, + "num_input_tokens_seen": 9373832, + "step": 16155 + }, + { + "epoch": 2.4069109323801015, + "grad_norm": 0.015869140625, + "learning_rate": 0.029962215445013214, + "loss": 0.8017, + "num_input_tokens_seen": 9376808, + "step": 16160 + }, + { + "epoch": 2.4076556449210607, + "grad_norm": 0.0235595703125, + "learning_rate": 0.029962077022458136, + "loss": 0.7993, + "num_input_tokens_seen": 9379400, + "step": 16165 + }, + { + "epoch": 2.4084003574620194, + "grad_norm": 0.0283203125, + "learning_rate": 0.02996193834713397, + "loss": 0.7966, + "num_input_tokens_seen": 9382024, + "step": 16170 + }, + { + "epoch": 2.409145070002979, + "grad_norm": 0.0150146484375, + "learning_rate": 0.029961799419043065, + "loss": 0.7947, + "num_input_tokens_seen": 9385032, + "step": 16175 + }, + { + "epoch": 2.409889782543938, + "grad_norm": 0.054443359375, + "learning_rate": 0.029961660238187766, + "loss": 0.8139, + "num_input_tokens_seen": 9388040, + "step": 16180 + }, + { + "epoch": 2.4106344950848975, + "grad_norm": 0.023193359375, + "learning_rate": 0.029961520804570423, + "loss": 0.7759, + "num_input_tokens_seen": 9390856, + "step": 16185 + }, + { + "epoch": 2.4113792076258562, + "grad_norm": 0.0322265625, + "learning_rate": 0.029961381118193395, + "loss": 0.7973, + "num_input_tokens_seen": 9394088, + "step": 16190 + }, + { + "epoch": 2.4121239201668154, + "grad_norm": 0.03271484375, + "learning_rate": 0.029961241179059035, + "loss": 0.7846, + "num_input_tokens_seen": 9397416, + "step": 16195 + }, + { + "epoch": 2.4128686327077746, + "grad_norm": 0.021240234375, + "learning_rate": 0.02996110098716971, + "loss": 0.7799, + "num_input_tokens_seen": 9400360, + "step": 16200 + }, + { + "epoch": 2.413613345248734, + "grad_norm": 0.0174560546875, + "learning_rate": 0.029960960542527795, + "loss": 0.7866, + "num_input_tokens_seen": 9403176, + "step": 16205 + }, + { + "epoch": 2.414358057789693, + "grad_norm": 0.0308837890625, + "learning_rate": 0.02996081984513565, + "loss": 0.7958, + "num_input_tokens_seen": 9405896, + "step": 16210 + }, + { + "epoch": 2.4151027703306522, + "grad_norm": 0.0299072265625, + "learning_rate": 0.029960678894995665, + "loss": 0.8103, + "num_input_tokens_seen": 9408872, + "step": 16215 + }, + { + "epoch": 2.4158474828716114, + "grad_norm": 0.0242919921875, + "learning_rate": 0.029960537692110212, + "loss": 0.8065, + "num_input_tokens_seen": 9411976, + "step": 16220 + }, + { + "epoch": 2.4165921954125706, + "grad_norm": 0.0289306640625, + "learning_rate": 0.029960396236481685, + "loss": 0.776, + "num_input_tokens_seen": 9415112, + "step": 16225 + }, + { + "epoch": 2.41733690795353, + "grad_norm": 0.031982421875, + "learning_rate": 0.029960254528112466, + "loss": 0.7972, + "num_input_tokens_seen": 9417992, + "step": 16230 + }, + { + "epoch": 2.418081620494489, + "grad_norm": 0.0308837890625, + "learning_rate": 0.02996011256700495, + "loss": 0.7789, + "num_input_tokens_seen": 9420808, + "step": 16235 + }, + { + "epoch": 2.4188263330354483, + "grad_norm": 0.043212890625, + "learning_rate": 0.02995997035316154, + "loss": 0.7525, + "num_input_tokens_seen": 9423656, + "step": 16240 + }, + { + "epoch": 2.4195710455764075, + "grad_norm": 0.030517578125, + "learning_rate": 0.029959827886584633, + "loss": 0.828, + "num_input_tokens_seen": 9426600, + "step": 16245 + }, + { + "epoch": 2.4203157581173667, + "grad_norm": 0.024658203125, + "learning_rate": 0.029959685167276637, + "loss": 0.8096, + "num_input_tokens_seen": 9429608, + "step": 16250 + }, + { + "epoch": 2.421060470658326, + "grad_norm": 0.02294921875, + "learning_rate": 0.02995954219523997, + "loss": 0.8104, + "num_input_tokens_seen": 9432648, + "step": 16255 + }, + { + "epoch": 2.421805183199285, + "grad_norm": 0.0245361328125, + "learning_rate": 0.02995939897047704, + "loss": 0.8365, + "num_input_tokens_seen": 9435624, + "step": 16260 + }, + { + "epoch": 2.4225498957402443, + "grad_norm": 0.0198974609375, + "learning_rate": 0.029959255492990264, + "loss": 0.7989, + "num_input_tokens_seen": 9438632, + "step": 16265 + }, + { + "epoch": 2.4232946082812035, + "grad_norm": 0.0255126953125, + "learning_rate": 0.029959111762782074, + "loss": 0.7922, + "num_input_tokens_seen": 9441416, + "step": 16270 + }, + { + "epoch": 2.4240393208221627, + "grad_norm": 0.0220947265625, + "learning_rate": 0.029958967779854892, + "loss": 0.8193, + "num_input_tokens_seen": 9443944, + "step": 16275 + }, + { + "epoch": 2.424784033363122, + "grad_norm": 0.03466796875, + "learning_rate": 0.029958823544211158, + "loss": 0.8077, + "num_input_tokens_seen": 9446952, + "step": 16280 + }, + { + "epoch": 2.425528745904081, + "grad_norm": 0.0419921875, + "learning_rate": 0.029958679055853296, + "loss": 0.783, + "num_input_tokens_seen": 9449736, + "step": 16285 + }, + { + "epoch": 2.4262734584450403, + "grad_norm": 0.0341796875, + "learning_rate": 0.02995853431478376, + "loss": 0.8209, + "num_input_tokens_seen": 9452424, + "step": 16290 + }, + { + "epoch": 2.4270181709859995, + "grad_norm": 0.0224609375, + "learning_rate": 0.029958389321004992, + "loss": 0.8209, + "num_input_tokens_seen": 9455208, + "step": 16295 + }, + { + "epoch": 2.4277628835269587, + "grad_norm": 0.036865234375, + "learning_rate": 0.029958244074519434, + "loss": 0.775, + "num_input_tokens_seen": 9458312, + "step": 16300 + }, + { + "epoch": 2.428507596067918, + "grad_norm": 0.036376953125, + "learning_rate": 0.02995809857532955, + "loss": 0.8019, + "num_input_tokens_seen": 9461160, + "step": 16305 + }, + { + "epoch": 2.429252308608877, + "grad_norm": 0.0228271484375, + "learning_rate": 0.02995795282343779, + "loss": 0.815, + "num_input_tokens_seen": 9464040, + "step": 16310 + }, + { + "epoch": 2.4299970211498363, + "grad_norm": 0.037109375, + "learning_rate": 0.029957806818846622, + "loss": 0.7965, + "num_input_tokens_seen": 9466984, + "step": 16315 + }, + { + "epoch": 2.4307417336907955, + "grad_norm": 0.0341796875, + "learning_rate": 0.02995766056155851, + "loss": 0.8122, + "num_input_tokens_seen": 9470056, + "step": 16320 + }, + { + "epoch": 2.4314864462317547, + "grad_norm": 0.0159912109375, + "learning_rate": 0.02995751405157592, + "loss": 0.7904, + "num_input_tokens_seen": 9472936, + "step": 16325 + }, + { + "epoch": 2.432231158772714, + "grad_norm": 0.0184326171875, + "learning_rate": 0.029957367288901336, + "loss": 0.8058, + "num_input_tokens_seen": 9475592, + "step": 16330 + }, + { + "epoch": 2.432975871313673, + "grad_norm": 0.032958984375, + "learning_rate": 0.029957220273537234, + "loss": 0.8122, + "num_input_tokens_seen": 9478312, + "step": 16335 + }, + { + "epoch": 2.4337205838546323, + "grad_norm": 0.025390625, + "learning_rate": 0.0299570730054861, + "loss": 0.8227, + "num_input_tokens_seen": 9481384, + "step": 16340 + }, + { + "epoch": 2.434465296395591, + "grad_norm": 0.0419921875, + "learning_rate": 0.029956925484750412, + "loss": 0.816, + "num_input_tokens_seen": 9484200, + "step": 16345 + }, + { + "epoch": 2.4352100089365507, + "grad_norm": 0.0238037109375, + "learning_rate": 0.029956777711332677, + "loss": 0.7909, + "num_input_tokens_seen": 9487208, + "step": 16350 + }, + { + "epoch": 2.4359547214775095, + "grad_norm": 0.0167236328125, + "learning_rate": 0.02995662968523538, + "loss": 0.8331, + "num_input_tokens_seen": 9490152, + "step": 16355 + }, + { + "epoch": 2.436699434018469, + "grad_norm": 0.04150390625, + "learning_rate": 0.029956481406461025, + "loss": 0.8046, + "num_input_tokens_seen": 9493064, + "step": 16360 + }, + { + "epoch": 2.437444146559428, + "grad_norm": 0.040771484375, + "learning_rate": 0.02995633287501212, + "loss": 0.805, + "num_input_tokens_seen": 9496008, + "step": 16365 + }, + { + "epoch": 2.438188859100387, + "grad_norm": 0.0245361328125, + "learning_rate": 0.029956184090891173, + "loss": 0.797, + "num_input_tokens_seen": 9498792, + "step": 16370 + }, + { + "epoch": 2.4389335716413463, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02995603505410069, + "loss": 0.797, + "num_input_tokens_seen": 9502056, + "step": 16375 + }, + { + "epoch": 2.4396782841823055, + "grad_norm": 0.0213623046875, + "learning_rate": 0.029955885764643204, + "loss": 0.7984, + "num_input_tokens_seen": 9504744, + "step": 16380 + }, + { + "epoch": 2.4404229967232647, + "grad_norm": 0.013916015625, + "learning_rate": 0.029955736222521224, + "loss": 0.8109, + "num_input_tokens_seen": 9507816, + "step": 16385 + }, + { + "epoch": 2.441167709264224, + "grad_norm": 0.0296630859375, + "learning_rate": 0.029955586427737282, + "loss": 0.8129, + "num_input_tokens_seen": 9510888, + "step": 16390 + }, + { + "epoch": 2.441912421805183, + "grad_norm": 0.028564453125, + "learning_rate": 0.02995543638029391, + "loss": 0.7926, + "num_input_tokens_seen": 9513768, + "step": 16395 + }, + { + "epoch": 2.4426571343461423, + "grad_norm": 0.02392578125, + "learning_rate": 0.029955286080193638, + "loss": 0.8079, + "num_input_tokens_seen": 9516648, + "step": 16400 + }, + { + "epoch": 2.4434018468871015, + "grad_norm": 0.021728515625, + "learning_rate": 0.029955135527439006, + "loss": 0.807, + "num_input_tokens_seen": 9519784, + "step": 16405 + }, + { + "epoch": 2.4441465594280607, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02995498472203256, + "loss": 0.802, + "num_input_tokens_seen": 9522888, + "step": 16410 + }, + { + "epoch": 2.44489127196902, + "grad_norm": 0.0213623046875, + "learning_rate": 0.029954833663976846, + "loss": 0.7876, + "num_input_tokens_seen": 9525608, + "step": 16415 + }, + { + "epoch": 2.445635984509979, + "grad_norm": 0.0419921875, + "learning_rate": 0.029954682353274417, + "loss": 0.7871, + "num_input_tokens_seen": 9528840, + "step": 16420 + }, + { + "epoch": 2.4463806970509383, + "grad_norm": 0.0181884765625, + "learning_rate": 0.029954530789927828, + "loss": 0.8131, + "num_input_tokens_seen": 9531752, + "step": 16425 + }, + { + "epoch": 2.4471254095918975, + "grad_norm": 0.0306396484375, + "learning_rate": 0.029954378973939644, + "loss": 0.8038, + "num_input_tokens_seen": 9534696, + "step": 16430 + }, + { + "epoch": 2.4478701221328567, + "grad_norm": 0.0211181640625, + "learning_rate": 0.029954226905312423, + "loss": 0.797, + "num_input_tokens_seen": 9537352, + "step": 16435 + }, + { + "epoch": 2.448614834673816, + "grad_norm": 0.0218505859375, + "learning_rate": 0.029954074584048737, + "loss": 0.8043, + "num_input_tokens_seen": 9540392, + "step": 16440 + }, + { + "epoch": 2.449359547214775, + "grad_norm": 0.033203125, + "learning_rate": 0.029953922010151162, + "loss": 0.7945, + "num_input_tokens_seen": 9543144, + "step": 16445 + }, + { + "epoch": 2.4501042597557343, + "grad_norm": 0.0235595703125, + "learning_rate": 0.029953769183622272, + "loss": 0.8004, + "num_input_tokens_seen": 9546184, + "step": 16450 + }, + { + "epoch": 2.4508489722966935, + "grad_norm": 0.023681640625, + "learning_rate": 0.02995361610446465, + "loss": 0.8028, + "num_input_tokens_seen": 9549032, + "step": 16455 + }, + { + "epoch": 2.4515936848376527, + "grad_norm": 0.022705078125, + "learning_rate": 0.02995346277268088, + "loss": 0.8088, + "num_input_tokens_seen": 9552296, + "step": 16460 + }, + { + "epoch": 2.452338397378612, + "grad_norm": 0.034423828125, + "learning_rate": 0.029953309188273557, + "loss": 0.8139, + "num_input_tokens_seen": 9555272, + "step": 16465 + }, + { + "epoch": 2.453083109919571, + "grad_norm": 0.028564453125, + "learning_rate": 0.02995315535124527, + "loss": 0.8091, + "num_input_tokens_seen": 9558184, + "step": 16470 + }, + { + "epoch": 2.4538278224605303, + "grad_norm": 0.0189208984375, + "learning_rate": 0.029953001261598625, + "loss": 0.803, + "num_input_tokens_seen": 9561064, + "step": 16475 + }, + { + "epoch": 2.4545725350014895, + "grad_norm": 0.02490234375, + "learning_rate": 0.02995284691933622, + "loss": 0.8007, + "num_input_tokens_seen": 9564072, + "step": 16480 + }, + { + "epoch": 2.4553172475424487, + "grad_norm": 0.0267333984375, + "learning_rate": 0.029952692324460666, + "loss": 0.805, + "num_input_tokens_seen": 9566728, + "step": 16485 + }, + { + "epoch": 2.456061960083408, + "grad_norm": 0.0286865234375, + "learning_rate": 0.02995253747697457, + "loss": 0.8113, + "num_input_tokens_seen": 9569608, + "step": 16490 + }, + { + "epoch": 2.456806672624367, + "grad_norm": 0.02392578125, + "learning_rate": 0.02995238237688055, + "loss": 0.8, + "num_input_tokens_seen": 9572392, + "step": 16495 + }, + { + "epoch": 2.4575513851653263, + "grad_norm": 0.0274658203125, + "learning_rate": 0.029952227024181224, + "loss": 0.8242, + "num_input_tokens_seen": 9575400, + "step": 16500 + }, + { + "epoch": 2.4582960977062855, + "grad_norm": 0.0289306640625, + "learning_rate": 0.02995207141887922, + "loss": 0.819, + "num_input_tokens_seen": 9578312, + "step": 16505 + }, + { + "epoch": 2.4590408102472447, + "grad_norm": 0.020263671875, + "learning_rate": 0.02995191556097717, + "loss": 0.8011, + "num_input_tokens_seen": 9581000, + "step": 16510 + }, + { + "epoch": 2.459785522788204, + "grad_norm": 0.022705078125, + "learning_rate": 0.0299517594504777, + "loss": 0.7807, + "num_input_tokens_seen": 9583688, + "step": 16515 + }, + { + "epoch": 2.4605302353291627, + "grad_norm": 0.0201416015625, + "learning_rate": 0.029951603087383455, + "loss": 0.8095, + "num_input_tokens_seen": 9587080, + "step": 16520 + }, + { + "epoch": 2.4612749478701224, + "grad_norm": 0.027587890625, + "learning_rate": 0.029951446471697066, + "loss": 0.801, + "num_input_tokens_seen": 9589960, + "step": 16525 + }, + { + "epoch": 2.462019660411081, + "grad_norm": 0.033935546875, + "learning_rate": 0.029951289603421187, + "loss": 0.8145, + "num_input_tokens_seen": 9592936, + "step": 16530 + }, + { + "epoch": 2.4627643729520408, + "grad_norm": 0.01446533203125, + "learning_rate": 0.029951132482558467, + "loss": 0.7942, + "num_input_tokens_seen": 9595848, + "step": 16535 + }, + { + "epoch": 2.4635090854929995, + "grad_norm": 0.0205078125, + "learning_rate": 0.02995097510911156, + "loss": 0.8052, + "num_input_tokens_seen": 9598600, + "step": 16540 + }, + { + "epoch": 2.4642537980339587, + "grad_norm": 0.0205078125, + "learning_rate": 0.029950817483083123, + "loss": 0.8034, + "num_input_tokens_seen": 9601320, + "step": 16545 + }, + { + "epoch": 2.464998510574918, + "grad_norm": 0.046875, + "learning_rate": 0.029950659604475818, + "loss": 0.7845, + "num_input_tokens_seen": 9604584, + "step": 16550 + }, + { + "epoch": 2.465743223115877, + "grad_norm": 0.0206298828125, + "learning_rate": 0.029950501473292316, + "loss": 0.8035, + "num_input_tokens_seen": 9607560, + "step": 16555 + }, + { + "epoch": 2.4664879356568363, + "grad_norm": 0.031005859375, + "learning_rate": 0.029950343089535286, + "loss": 0.8011, + "num_input_tokens_seen": 9610216, + "step": 16560 + }, + { + "epoch": 2.4672326481977955, + "grad_norm": 0.011962890625, + "learning_rate": 0.029950184453207407, + "loss": 0.8182, + "num_input_tokens_seen": 9613224, + "step": 16565 + }, + { + "epoch": 2.4679773607387547, + "grad_norm": 0.0205078125, + "learning_rate": 0.029950025564311353, + "loss": 0.7867, + "num_input_tokens_seen": 9615848, + "step": 16570 + }, + { + "epoch": 2.468722073279714, + "grad_norm": 0.0322265625, + "learning_rate": 0.02994986642284981, + "loss": 0.817, + "num_input_tokens_seen": 9618600, + "step": 16575 + }, + { + "epoch": 2.469466785820673, + "grad_norm": 0.0205078125, + "learning_rate": 0.029949707028825476, + "loss": 0.7777, + "num_input_tokens_seen": 9621480, + "step": 16580 + }, + { + "epoch": 2.4702114983616323, + "grad_norm": 0.02685546875, + "learning_rate": 0.029949547382241027, + "loss": 0.8322, + "num_input_tokens_seen": 9624488, + "step": 16585 + }, + { + "epoch": 2.4709562109025915, + "grad_norm": 0.0277099609375, + "learning_rate": 0.029949387483099176, + "loss": 0.786, + "num_input_tokens_seen": 9627720, + "step": 16590 + }, + { + "epoch": 2.4717009234435507, + "grad_norm": 0.042236328125, + "learning_rate": 0.029949227331402612, + "loss": 0.8309, + "num_input_tokens_seen": 9630856, + "step": 16595 + }, + { + "epoch": 2.47244563598451, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02994906692715405, + "loss": 0.7893, + "num_input_tokens_seen": 9633928, + "step": 16600 + }, + { + "epoch": 2.473190348525469, + "grad_norm": 0.02001953125, + "learning_rate": 0.029948906270356193, + "loss": 0.7946, + "num_input_tokens_seen": 9637064, + "step": 16605 + }, + { + "epoch": 2.4739350610664284, + "grad_norm": 0.0267333984375, + "learning_rate": 0.029948745361011758, + "loss": 0.7956, + "num_input_tokens_seen": 9640168, + "step": 16610 + }, + { + "epoch": 2.4746797736073876, + "grad_norm": 0.0203857421875, + "learning_rate": 0.029948584199123465, + "loss": 0.8056, + "num_input_tokens_seen": 9642888, + "step": 16615 + }, + { + "epoch": 2.4754244861483468, + "grad_norm": 0.021240234375, + "learning_rate": 0.02994842278469403, + "loss": 0.808, + "num_input_tokens_seen": 9645832, + "step": 16620 + }, + { + "epoch": 2.476169198689306, + "grad_norm": 0.0299072265625, + "learning_rate": 0.029948261117726194, + "loss": 0.7911, + "num_input_tokens_seen": 9648680, + "step": 16625 + }, + { + "epoch": 2.476913911230265, + "grad_norm": 0.017822265625, + "learning_rate": 0.029948099198222673, + "loss": 0.795, + "num_input_tokens_seen": 9651304, + "step": 16630 + }, + { + "epoch": 2.4776586237712244, + "grad_norm": 0.013916015625, + "learning_rate": 0.02994793702618621, + "loss": 0.8209, + "num_input_tokens_seen": 9654184, + "step": 16635 + }, + { + "epoch": 2.4784033363121836, + "grad_norm": 0.018798828125, + "learning_rate": 0.029947774601619546, + "loss": 0.8191, + "num_input_tokens_seen": 9657000, + "step": 16640 + }, + { + "epoch": 2.4791480488531428, + "grad_norm": 0.0196533203125, + "learning_rate": 0.029947611924525417, + "loss": 0.804, + "num_input_tokens_seen": 9660040, + "step": 16645 + }, + { + "epoch": 2.479892761394102, + "grad_norm": 0.021728515625, + "learning_rate": 0.02994744899490658, + "loss": 0.7945, + "num_input_tokens_seen": 9662984, + "step": 16650 + }, + { + "epoch": 2.480637473935061, + "grad_norm": 0.0186767578125, + "learning_rate": 0.029947285812765782, + "loss": 0.7914, + "num_input_tokens_seen": 9665704, + "step": 16655 + }, + { + "epoch": 2.4813821864760204, + "grad_norm": 0.0206298828125, + "learning_rate": 0.029947122378105782, + "loss": 0.7793, + "num_input_tokens_seen": 9668296, + "step": 16660 + }, + { + "epoch": 2.4821268990169796, + "grad_norm": 0.0272216796875, + "learning_rate": 0.029946958690929344, + "loss": 0.8015, + "num_input_tokens_seen": 9671112, + "step": 16665 + }, + { + "epoch": 2.482871611557939, + "grad_norm": 0.0267333984375, + "learning_rate": 0.02994679475123923, + "loss": 0.7958, + "num_input_tokens_seen": 9674248, + "step": 16670 + }, + { + "epoch": 2.483616324098898, + "grad_norm": 0.0211181640625, + "learning_rate": 0.02994663055903821, + "loss": 0.7791, + "num_input_tokens_seen": 9677608, + "step": 16675 + }, + { + "epoch": 2.484361036639857, + "grad_norm": 0.023193359375, + "learning_rate": 0.02994646611432905, + "loss": 0.8147, + "num_input_tokens_seen": 9680712, + "step": 16680 + }, + { + "epoch": 2.4851057491808164, + "grad_norm": 0.0235595703125, + "learning_rate": 0.029946301417114546, + "loss": 0.8249, + "num_input_tokens_seen": 9683496, + "step": 16685 + }, + { + "epoch": 2.4858504617217756, + "grad_norm": 0.0184326171875, + "learning_rate": 0.029946136467397466, + "loss": 0.7869, + "num_input_tokens_seen": 9686344, + "step": 16690 + }, + { + "epoch": 2.4865951742627344, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0299459712651806, + "loss": 0.7899, + "num_input_tokens_seen": 9689576, + "step": 16695 + }, + { + "epoch": 2.487339886803694, + "grad_norm": 0.03271484375, + "learning_rate": 0.029945805810466745, + "loss": 0.8053, + "num_input_tokens_seen": 9692136, + "step": 16700 + }, + { + "epoch": 2.4880845993446528, + "grad_norm": 0.0274658203125, + "learning_rate": 0.029945640103258685, + "loss": 0.8536, + "num_input_tokens_seen": 9694856, + "step": 16705 + }, + { + "epoch": 2.488829311885612, + "grad_norm": 0.013671875, + "learning_rate": 0.02994547414355923, + "loss": 0.7898, + "num_input_tokens_seen": 9697832, + "step": 16710 + }, + { + "epoch": 2.489574024426571, + "grad_norm": 0.0244140625, + "learning_rate": 0.029945307931371175, + "loss": 0.7855, + "num_input_tokens_seen": 9700456, + "step": 16715 + }, + { + "epoch": 2.4903187369675304, + "grad_norm": 0.031494140625, + "learning_rate": 0.029945141466697335, + "loss": 0.8322, + "num_input_tokens_seen": 9703272, + "step": 16720 + }, + { + "epoch": 2.4910634495084896, + "grad_norm": 0.0289306640625, + "learning_rate": 0.029944974749540523, + "loss": 0.8101, + "num_input_tokens_seen": 9706056, + "step": 16725 + }, + { + "epoch": 2.4918081620494488, + "grad_norm": 0.031494140625, + "learning_rate": 0.029944807779903545, + "loss": 0.7977, + "num_input_tokens_seen": 9708712, + "step": 16730 + }, + { + "epoch": 2.492552874590408, + "grad_norm": 0.01373291015625, + "learning_rate": 0.029944640557789233, + "loss": 0.7859, + "num_input_tokens_seen": 9711176, + "step": 16735 + }, + { + "epoch": 2.493297587131367, + "grad_norm": 0.0206298828125, + "learning_rate": 0.029944473083200404, + "loss": 0.8075, + "num_input_tokens_seen": 9713768, + "step": 16740 + }, + { + "epoch": 2.4940422996723264, + "grad_norm": 0.0137939453125, + "learning_rate": 0.029944305356139896, + "loss": 0.8009, + "num_input_tokens_seen": 9716296, + "step": 16745 + }, + { + "epoch": 2.4947870122132856, + "grad_norm": 0.015625, + "learning_rate": 0.029944137376610537, + "loss": 0.8029, + "num_input_tokens_seen": 9719144, + "step": 16750 + }, + { + "epoch": 2.495531724754245, + "grad_norm": 0.0235595703125, + "learning_rate": 0.029943969144615164, + "loss": 0.8212, + "num_input_tokens_seen": 9722472, + "step": 16755 + }, + { + "epoch": 2.496276437295204, + "grad_norm": 0.03076171875, + "learning_rate": 0.02994380066015662, + "loss": 0.7846, + "num_input_tokens_seen": 9725288, + "step": 16760 + }, + { + "epoch": 2.497021149836163, + "grad_norm": 0.0224609375, + "learning_rate": 0.029943631923237752, + "loss": 0.794, + "num_input_tokens_seen": 9728136, + "step": 16765 + }, + { + "epoch": 2.4977658623771224, + "grad_norm": 0.03515625, + "learning_rate": 0.02994346293386141, + "loss": 0.8312, + "num_input_tokens_seen": 9730984, + "step": 16770 + }, + { + "epoch": 2.4985105749180816, + "grad_norm": 0.0162353515625, + "learning_rate": 0.029943293692030453, + "loss": 0.7856, + "num_input_tokens_seen": 9733864, + "step": 16775 + }, + { + "epoch": 2.499255287459041, + "grad_norm": 0.0235595703125, + "learning_rate": 0.029943124197747736, + "loss": 0.8, + "num_input_tokens_seen": 9736712, + "step": 16780 + }, + { + "epoch": 2.5, + "grad_norm": 0.0296630859375, + "learning_rate": 0.02994295445101612, + "loss": 0.789, + "num_input_tokens_seen": 9739400, + "step": 16785 + }, + { + "epoch": 2.500744712540959, + "grad_norm": 0.0205078125, + "learning_rate": 0.02994278445183848, + "loss": 0.8034, + "num_input_tokens_seen": 9742216, + "step": 16790 + }, + { + "epoch": 2.5014894250819184, + "grad_norm": 0.01263427734375, + "learning_rate": 0.02994261420021768, + "loss": 0.8108, + "num_input_tokens_seen": 9745224, + "step": 16795 + }, + { + "epoch": 2.5022341376228776, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0299424436961566, + "loss": 0.8189, + "num_input_tokens_seen": 9748328, + "step": 16800 + }, + { + "epoch": 2.502978850163837, + "grad_norm": 0.0189208984375, + "learning_rate": 0.029942272939658125, + "loss": 0.7749, + "num_input_tokens_seen": 9751048, + "step": 16805 + }, + { + "epoch": 2.503723562704796, + "grad_norm": 0.02880859375, + "learning_rate": 0.02994210193072513, + "loss": 0.8023, + "num_input_tokens_seen": 9753704, + "step": 16810 + }, + { + "epoch": 2.504468275245755, + "grad_norm": 0.021484375, + "learning_rate": 0.029941930669360517, + "loss": 0.8241, + "num_input_tokens_seen": 9756456, + "step": 16815 + }, + { + "epoch": 2.5052129877867144, + "grad_norm": 0.01324462890625, + "learning_rate": 0.02994175915556716, + "loss": 0.8227, + "num_input_tokens_seen": 9759400, + "step": 16820 + }, + { + "epoch": 2.5059577003276736, + "grad_norm": 0.01177978515625, + "learning_rate": 0.02994158738934798, + "loss": 0.7885, + "num_input_tokens_seen": 9762120, + "step": 16825 + }, + { + "epoch": 2.506702412868633, + "grad_norm": 0.018310546875, + "learning_rate": 0.02994141537070586, + "loss": 0.7986, + "num_input_tokens_seen": 9765096, + "step": 16830 + }, + { + "epoch": 2.507447125409592, + "grad_norm": 0.0284423828125, + "learning_rate": 0.029941243099643713, + "loss": 0.805, + "num_input_tokens_seen": 9767944, + "step": 16835 + }, + { + "epoch": 2.5081918379505512, + "grad_norm": 0.026611328125, + "learning_rate": 0.02994107057616445, + "loss": 0.8096, + "num_input_tokens_seen": 9770696, + "step": 16840 + }, + { + "epoch": 2.5089365504915104, + "grad_norm": 0.0211181640625, + "learning_rate": 0.02994089780027099, + "loss": 0.7869, + "num_input_tokens_seen": 9773384, + "step": 16845 + }, + { + "epoch": 2.509681263032469, + "grad_norm": 0.02099609375, + "learning_rate": 0.029940724771966243, + "loss": 0.8116, + "num_input_tokens_seen": 9776232, + "step": 16850 + }, + { + "epoch": 2.510425975573429, + "grad_norm": 0.0211181640625, + "learning_rate": 0.029940551491253138, + "loss": 0.8147, + "num_input_tokens_seen": 9778952, + "step": 16855 + }, + { + "epoch": 2.5111706881143876, + "grad_norm": 0.01287841796875, + "learning_rate": 0.029940377958134597, + "loss": 0.7889, + "num_input_tokens_seen": 9781864, + "step": 16860 + }, + { + "epoch": 2.5119154006553472, + "grad_norm": 0.01397705078125, + "learning_rate": 0.029940204172613557, + "loss": 0.7977, + "num_input_tokens_seen": 9784936, + "step": 16865 + }, + { + "epoch": 2.512660113196306, + "grad_norm": 0.03125, + "learning_rate": 0.029940030134692954, + "loss": 0.8017, + "num_input_tokens_seen": 9787976, + "step": 16870 + }, + { + "epoch": 2.5134048257372656, + "grad_norm": 0.01324462890625, + "learning_rate": 0.02993985584437572, + "loss": 0.8194, + "num_input_tokens_seen": 9790952, + "step": 16875 + }, + { + "epoch": 2.5141495382782244, + "grad_norm": 0.0196533203125, + "learning_rate": 0.029939681301664814, + "loss": 0.7904, + "num_input_tokens_seen": 9793832, + "step": 16880 + }, + { + "epoch": 2.514894250819184, + "grad_norm": 0.019287109375, + "learning_rate": 0.02993950650656317, + "loss": 0.7781, + "num_input_tokens_seen": 9796872, + "step": 16885 + }, + { + "epoch": 2.515638963360143, + "grad_norm": 0.0234375, + "learning_rate": 0.029939331459073754, + "loss": 0.8145, + "num_input_tokens_seen": 9799912, + "step": 16890 + }, + { + "epoch": 2.516383675901102, + "grad_norm": 0.021240234375, + "learning_rate": 0.02993915615919951, + "loss": 0.7985, + "num_input_tokens_seen": 9802696, + "step": 16895 + }, + { + "epoch": 2.517128388442061, + "grad_norm": 0.01373291015625, + "learning_rate": 0.02993898060694341, + "loss": 0.7891, + "num_input_tokens_seen": 9805384, + "step": 16900 + }, + { + "epoch": 2.5178731009830204, + "grad_norm": 0.017578125, + "learning_rate": 0.029938804802308414, + "loss": 0.8216, + "num_input_tokens_seen": 9808616, + "step": 16905 + }, + { + "epoch": 2.5186178135239796, + "grad_norm": 0.0135498046875, + "learning_rate": 0.029938628745297492, + "loss": 0.8119, + "num_input_tokens_seen": 9811720, + "step": 16910 + }, + { + "epoch": 2.519362526064939, + "grad_norm": 0.0201416015625, + "learning_rate": 0.029938452435913625, + "loss": 0.8043, + "num_input_tokens_seen": 9814536, + "step": 16915 + }, + { + "epoch": 2.520107238605898, + "grad_norm": 0.0205078125, + "learning_rate": 0.029938275874159783, + "loss": 0.8023, + "num_input_tokens_seen": 9817256, + "step": 16920 + }, + { + "epoch": 2.5208519511468572, + "grad_norm": 0.027587890625, + "learning_rate": 0.029938099060038956, + "loss": 0.8392, + "num_input_tokens_seen": 9820328, + "step": 16925 + }, + { + "epoch": 2.5215966636878164, + "grad_norm": 0.02783203125, + "learning_rate": 0.029937921993554127, + "loss": 0.7817, + "num_input_tokens_seen": 9823304, + "step": 16930 + }, + { + "epoch": 2.5223413762287756, + "grad_norm": 0.0208740234375, + "learning_rate": 0.029937744674708288, + "loss": 0.8182, + "num_input_tokens_seen": 9825992, + "step": 16935 + }, + { + "epoch": 2.523086088769735, + "grad_norm": 0.0205078125, + "learning_rate": 0.029937567103504432, + "loss": 0.7777, + "num_input_tokens_seen": 9828936, + "step": 16940 + }, + { + "epoch": 2.523830801310694, + "grad_norm": 0.0205078125, + "learning_rate": 0.029937389279945565, + "loss": 0.7973, + "num_input_tokens_seen": 9831656, + "step": 16945 + }, + { + "epoch": 2.5245755138516532, + "grad_norm": 0.020263671875, + "learning_rate": 0.029937211204034687, + "loss": 0.7953, + "num_input_tokens_seen": 9834632, + "step": 16950 + }, + { + "epoch": 2.5253202263926124, + "grad_norm": 0.030029296875, + "learning_rate": 0.029937032875774806, + "loss": 0.8352, + "num_input_tokens_seen": 9837576, + "step": 16955 + }, + { + "epoch": 2.5260649389335716, + "grad_norm": 0.0250244140625, + "learning_rate": 0.02993685429516894, + "loss": 0.832, + "num_input_tokens_seen": 9840616, + "step": 16960 + }, + { + "epoch": 2.526809651474531, + "grad_norm": 0.0301513671875, + "learning_rate": 0.029936675462220095, + "loss": 0.8031, + "num_input_tokens_seen": 9843720, + "step": 16965 + }, + { + "epoch": 2.52755436401549, + "grad_norm": 0.0240478515625, + "learning_rate": 0.029936496376931306, + "loss": 0.7964, + "num_input_tokens_seen": 9846728, + "step": 16970 + }, + { + "epoch": 2.5282990765564493, + "grad_norm": 0.026123046875, + "learning_rate": 0.029936317039305584, + "loss": 0.7992, + "num_input_tokens_seen": 9849608, + "step": 16975 + }, + { + "epoch": 2.5290437890974085, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02993613744934597, + "loss": 0.8183, + "num_input_tokens_seen": 9852392, + "step": 16980 + }, + { + "epoch": 2.5297885016383677, + "grad_norm": 0.0135498046875, + "learning_rate": 0.029935957607055495, + "loss": 0.8037, + "num_input_tokens_seen": 9855048, + "step": 16985 + }, + { + "epoch": 2.530533214179327, + "grad_norm": 0.01513671875, + "learning_rate": 0.029935777512437197, + "loss": 0.7992, + "num_input_tokens_seen": 9858056, + "step": 16990 + }, + { + "epoch": 2.531277926720286, + "grad_norm": 0.01409912109375, + "learning_rate": 0.029935597165494113, + "loss": 0.8068, + "num_input_tokens_seen": 9860776, + "step": 16995 + }, + { + "epoch": 2.5320226392612453, + "grad_norm": 0.0213623046875, + "learning_rate": 0.029935416566229302, + "loss": 0.7993, + "num_input_tokens_seen": 9864008, + "step": 17000 + }, + { + "epoch": 2.5327673518022045, + "grad_norm": 0.02294921875, + "learning_rate": 0.029935235714645802, + "loss": 0.7981, + "num_input_tokens_seen": 9867144, + "step": 17005 + }, + { + "epoch": 2.5335120643431637, + "grad_norm": 0.0145263671875, + "learning_rate": 0.029935054610746678, + "loss": 0.7931, + "num_input_tokens_seen": 9869832, + "step": 17010 + }, + { + "epoch": 2.534256776884123, + "grad_norm": 0.028564453125, + "learning_rate": 0.029934873254534984, + "loss": 0.8076, + "num_input_tokens_seen": 9872936, + "step": 17015 + }, + { + "epoch": 2.535001489425082, + "grad_norm": 0.02490234375, + "learning_rate": 0.029934691646013787, + "loss": 0.795, + "num_input_tokens_seen": 9875624, + "step": 17020 + }, + { + "epoch": 2.535746201966041, + "grad_norm": 0.0228271484375, + "learning_rate": 0.02993450978518615, + "loss": 0.7906, + "num_input_tokens_seen": 9878472, + "step": 17025 + }, + { + "epoch": 2.5364909145070005, + "grad_norm": 0.01611328125, + "learning_rate": 0.02993432767205515, + "loss": 0.8217, + "num_input_tokens_seen": 9881320, + "step": 17030 + }, + { + "epoch": 2.5372356270479592, + "grad_norm": 0.023193359375, + "learning_rate": 0.029934145306623867, + "loss": 0.7784, + "num_input_tokens_seen": 9884072, + "step": 17035 + }, + { + "epoch": 2.537980339588919, + "grad_norm": 0.0263671875, + "learning_rate": 0.029933962688895373, + "loss": 0.8016, + "num_input_tokens_seen": 9886984, + "step": 17040 + }, + { + "epoch": 2.5387250521298776, + "grad_norm": 0.0205078125, + "learning_rate": 0.02993377981887276, + "loss": 0.7762, + "num_input_tokens_seen": 9889960, + "step": 17045 + }, + { + "epoch": 2.5394697646708373, + "grad_norm": 0.0267333984375, + "learning_rate": 0.029933596696559112, + "loss": 0.7947, + "num_input_tokens_seen": 9892744, + "step": 17050 + }, + { + "epoch": 2.540214477211796, + "grad_norm": 0.0252685546875, + "learning_rate": 0.029933413321957526, + "loss": 0.8124, + "num_input_tokens_seen": 9895656, + "step": 17055 + }, + { + "epoch": 2.5409591897527557, + "grad_norm": 0.0240478515625, + "learning_rate": 0.029933229695071104, + "loss": 0.7973, + "num_input_tokens_seen": 9898664, + "step": 17060 + }, + { + "epoch": 2.5417039022937145, + "grad_norm": 0.03076171875, + "learning_rate": 0.029933045815902937, + "loss": 0.8093, + "num_input_tokens_seen": 9901640, + "step": 17065 + }, + { + "epoch": 2.5424486148346737, + "grad_norm": 0.02294921875, + "learning_rate": 0.029932861684456146, + "loss": 0.7979, + "num_input_tokens_seen": 9904616, + "step": 17070 + }, + { + "epoch": 2.543193327375633, + "grad_norm": 0.031005859375, + "learning_rate": 0.02993267730073383, + "loss": 0.8017, + "num_input_tokens_seen": 9907240, + "step": 17075 + }, + { + "epoch": 2.543938039916592, + "grad_norm": 0.017578125, + "learning_rate": 0.029932492664739106, + "loss": 0.7839, + "num_input_tokens_seen": 9910440, + "step": 17080 + }, + { + "epoch": 2.5446827524575513, + "grad_norm": 0.03173828125, + "learning_rate": 0.029932307776475098, + "loss": 0.7823, + "num_input_tokens_seen": 9913672, + "step": 17085 + }, + { + "epoch": 2.5454274649985105, + "grad_norm": 0.02001953125, + "learning_rate": 0.029932122635944927, + "loss": 0.7922, + "num_input_tokens_seen": 9916584, + "step": 17090 + }, + { + "epoch": 2.5461721775394697, + "grad_norm": 0.0380859375, + "learning_rate": 0.029931937243151716, + "loss": 0.8039, + "num_input_tokens_seen": 9919432, + "step": 17095 + }, + { + "epoch": 2.546916890080429, + "grad_norm": 0.034912109375, + "learning_rate": 0.029931751598098605, + "loss": 0.7881, + "num_input_tokens_seen": 9922184, + "step": 17100 + }, + { + "epoch": 2.547661602621388, + "grad_norm": 0.0277099609375, + "learning_rate": 0.029931565700788726, + "loss": 0.7894, + "num_input_tokens_seen": 9925064, + "step": 17105 + }, + { + "epoch": 2.5484063151623473, + "grad_norm": 0.01611328125, + "learning_rate": 0.029931379551225223, + "loss": 0.791, + "num_input_tokens_seen": 9928040, + "step": 17110 + }, + { + "epoch": 2.5491510277033065, + "grad_norm": 0.0308837890625, + "learning_rate": 0.02993119314941123, + "loss": 0.8215, + "num_input_tokens_seen": 9930824, + "step": 17115 + }, + { + "epoch": 2.5498957402442657, + "grad_norm": 0.031494140625, + "learning_rate": 0.02993100649534991, + "loss": 0.798, + "num_input_tokens_seen": 9933480, + "step": 17120 + }, + { + "epoch": 2.550640452785225, + "grad_norm": 0.0224609375, + "learning_rate": 0.02993081958904441, + "loss": 0.8479, + "num_input_tokens_seen": 9936456, + "step": 17125 + }, + { + "epoch": 2.551385165326184, + "grad_norm": 0.023193359375, + "learning_rate": 0.02993063243049789, + "loss": 0.8122, + "num_input_tokens_seen": 9939368, + "step": 17130 + }, + { + "epoch": 2.5521298778671433, + "grad_norm": 0.0252685546875, + "learning_rate": 0.029930445019713505, + "loss": 0.813, + "num_input_tokens_seen": 9942216, + "step": 17135 + }, + { + "epoch": 2.5528745904081025, + "grad_norm": 0.00982666015625, + "learning_rate": 0.029930257356694433, + "loss": 0.8017, + "num_input_tokens_seen": 9944904, + "step": 17140 + }, + { + "epoch": 2.5536193029490617, + "grad_norm": 0.01953125, + "learning_rate": 0.029930069441443833, + "loss": 0.8024, + "num_input_tokens_seen": 9947752, + "step": 17145 + }, + { + "epoch": 2.554364015490021, + "grad_norm": 0.0257568359375, + "learning_rate": 0.029929881273964884, + "loss": 0.7894, + "num_input_tokens_seen": 9950536, + "step": 17150 + }, + { + "epoch": 2.55510872803098, + "grad_norm": 0.0302734375, + "learning_rate": 0.029929692854260764, + "loss": 0.802, + "num_input_tokens_seen": 9953448, + "step": 17155 + }, + { + "epoch": 2.5558534405719393, + "grad_norm": 0.01434326171875, + "learning_rate": 0.029929504182334657, + "loss": 0.8403, + "num_input_tokens_seen": 9956424, + "step": 17160 + }, + { + "epoch": 2.5565981531128985, + "grad_norm": 0.0277099609375, + "learning_rate": 0.029929315258189752, + "loss": 0.8411, + "num_input_tokens_seen": 9959176, + "step": 17165 + }, + { + "epoch": 2.5573428656538577, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02992912608182924, + "loss": 0.8176, + "num_input_tokens_seen": 9962568, + "step": 17170 + }, + { + "epoch": 2.558087578194817, + "grad_norm": 0.021728515625, + "learning_rate": 0.029928936653256317, + "loss": 0.795, + "num_input_tokens_seen": 9965672, + "step": 17175 + }, + { + "epoch": 2.558832290735776, + "grad_norm": 0.027587890625, + "learning_rate": 0.02992874697247418, + "loss": 0.7775, + "num_input_tokens_seen": 9968520, + "step": 17180 + }, + { + "epoch": 2.5595770032767353, + "grad_norm": 0.0218505859375, + "learning_rate": 0.029928557039486037, + "loss": 0.8069, + "num_input_tokens_seen": 9971240, + "step": 17185 + }, + { + "epoch": 2.5603217158176945, + "grad_norm": 0.03125, + "learning_rate": 0.029928366854295094, + "loss": 0.7956, + "num_input_tokens_seen": 9974120, + "step": 17190 + }, + { + "epoch": 2.5610664283586537, + "grad_norm": 0.018310546875, + "learning_rate": 0.029928176416904564, + "loss": 0.7713, + "num_input_tokens_seen": 9977192, + "step": 17195 + }, + { + "epoch": 2.5618111408996125, + "grad_norm": 0.0203857421875, + "learning_rate": 0.029927985727317672, + "loss": 0.824, + "num_input_tokens_seen": 9979848, + "step": 17200 + }, + { + "epoch": 2.562555853440572, + "grad_norm": 0.01422119140625, + "learning_rate": 0.02992779478553763, + "loss": 0.8184, + "num_input_tokens_seen": 9982984, + "step": 17205 + }, + { + "epoch": 2.563300565981531, + "grad_norm": 0.0191650390625, + "learning_rate": 0.029927603591567663, + "loss": 0.777, + "num_input_tokens_seen": 9985960, + "step": 17210 + }, + { + "epoch": 2.5640452785224905, + "grad_norm": 0.01263427734375, + "learning_rate": 0.02992741214541101, + "loss": 0.7922, + "num_input_tokens_seen": 9988872, + "step": 17215 + }, + { + "epoch": 2.5647899910634493, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0299272204470709, + "loss": 0.7879, + "num_input_tokens_seen": 9991720, + "step": 17220 + }, + { + "epoch": 2.565534703604409, + "grad_norm": 0.01373291015625, + "learning_rate": 0.02992702849655057, + "loss": 0.7918, + "num_input_tokens_seen": 9994472, + "step": 17225 + }, + { + "epoch": 2.5662794161453677, + "grad_norm": 0.0194091796875, + "learning_rate": 0.029926836293853266, + "loss": 0.7902, + "num_input_tokens_seen": 9997128, + "step": 17230 + }, + { + "epoch": 2.5670241286863273, + "grad_norm": 0.0234375, + "learning_rate": 0.029926643838982234, + "loss": 0.788, + "num_input_tokens_seen": 9999848, + "step": 17235 + }, + { + "epoch": 2.567768841227286, + "grad_norm": 0.0308837890625, + "learning_rate": 0.029926451131940722, + "loss": 0.8223, + "num_input_tokens_seen": 10002664, + "step": 17240 + }, + { + "epoch": 2.5685135537682453, + "grad_norm": 0.026611328125, + "learning_rate": 0.029926258172731993, + "loss": 0.8006, + "num_input_tokens_seen": 10005320, + "step": 17245 + }, + { + "epoch": 2.5692582663092045, + "grad_norm": 0.015380859375, + "learning_rate": 0.0299260649613593, + "loss": 0.7806, + "num_input_tokens_seen": 10008232, + "step": 17250 + }, + { + "epoch": 2.5700029788501637, + "grad_norm": 0.042236328125, + "learning_rate": 0.02992587149782591, + "loss": 0.7903, + "num_input_tokens_seen": 10011112, + "step": 17255 + }, + { + "epoch": 2.570747691391123, + "grad_norm": 0.0159912109375, + "learning_rate": 0.02992567778213509, + "loss": 0.8121, + "num_input_tokens_seen": 10014312, + "step": 17260 + }, + { + "epoch": 2.571492403932082, + "grad_norm": 0.023681640625, + "learning_rate": 0.029925483814290114, + "loss": 0.82, + "num_input_tokens_seen": 10017320, + "step": 17265 + }, + { + "epoch": 2.5722371164730413, + "grad_norm": 0.0294189453125, + "learning_rate": 0.02992528959429426, + "loss": 0.7837, + "num_input_tokens_seen": 10020520, + "step": 17270 + }, + { + "epoch": 2.5729818290140005, + "grad_norm": 0.0228271484375, + "learning_rate": 0.029925095122150806, + "loss": 0.8268, + "num_input_tokens_seen": 10023720, + "step": 17275 + }, + { + "epoch": 2.5737265415549597, + "grad_norm": 0.020263671875, + "learning_rate": 0.029924900397863042, + "loss": 0.7686, + "num_input_tokens_seen": 10026760, + "step": 17280 + }, + { + "epoch": 2.574471254095919, + "grad_norm": 0.01507568359375, + "learning_rate": 0.029924705421434255, + "loss": 0.8071, + "num_input_tokens_seen": 10029640, + "step": 17285 + }, + { + "epoch": 2.575215966636878, + "grad_norm": 0.0299072265625, + "learning_rate": 0.029924510192867734, + "loss": 0.7941, + "num_input_tokens_seen": 10032744, + "step": 17290 + }, + { + "epoch": 2.5759606791778373, + "grad_norm": 0.030029296875, + "learning_rate": 0.029924314712166784, + "loss": 0.8061, + "num_input_tokens_seen": 10035816, + "step": 17295 + }, + { + "epoch": 2.5767053917187965, + "grad_norm": 0.038818359375, + "learning_rate": 0.029924118979334707, + "loss": 0.8363, + "num_input_tokens_seen": 10038792, + "step": 17300 + }, + { + "epoch": 2.5774501042597557, + "grad_norm": 0.03857421875, + "learning_rate": 0.029923922994374803, + "loss": 0.8135, + "num_input_tokens_seen": 10042056, + "step": 17305 + }, + { + "epoch": 2.578194816800715, + "grad_norm": 0.0208740234375, + "learning_rate": 0.029923726757290395, + "loss": 0.7925, + "num_input_tokens_seen": 10044968, + "step": 17310 + }, + { + "epoch": 2.578939529341674, + "grad_norm": 0.01513671875, + "learning_rate": 0.02992353026808479, + "loss": 0.7851, + "num_input_tokens_seen": 10047816, + "step": 17315 + }, + { + "epoch": 2.5796842418826333, + "grad_norm": 0.01458740234375, + "learning_rate": 0.029923333526761305, + "loss": 0.8232, + "num_input_tokens_seen": 10050728, + "step": 17320 + }, + { + "epoch": 2.5804289544235925, + "grad_norm": 0.033203125, + "learning_rate": 0.029923136533323267, + "loss": 0.8323, + "num_input_tokens_seen": 10053736, + "step": 17325 + }, + { + "epoch": 2.5811736669645517, + "grad_norm": 0.0257568359375, + "learning_rate": 0.02992293928777401, + "loss": 0.7752, + "num_input_tokens_seen": 10056424, + "step": 17330 + }, + { + "epoch": 2.581918379505511, + "grad_norm": 0.01708984375, + "learning_rate": 0.029922741790116857, + "loss": 0.7876, + "num_input_tokens_seen": 10059496, + "step": 17335 + }, + { + "epoch": 2.58266309204647, + "grad_norm": 0.026611328125, + "learning_rate": 0.029922544040355145, + "loss": 0.782, + "num_input_tokens_seen": 10062248, + "step": 17340 + }, + { + "epoch": 2.5834078045874294, + "grad_norm": 0.04248046875, + "learning_rate": 0.029922346038492226, + "loss": 0.7964, + "num_input_tokens_seen": 10065128, + "step": 17345 + }, + { + "epoch": 2.5841525171283886, + "grad_norm": 0.0277099609375, + "learning_rate": 0.029922147784531433, + "loss": 0.8199, + "num_input_tokens_seen": 10068072, + "step": 17350 + }, + { + "epoch": 2.5848972296693478, + "grad_norm": 0.021240234375, + "learning_rate": 0.02992194927847612, + "loss": 0.7643, + "num_input_tokens_seen": 10071080, + "step": 17355 + }, + { + "epoch": 2.585641942210307, + "grad_norm": 0.01373291015625, + "learning_rate": 0.029921750520329637, + "loss": 0.8004, + "num_input_tokens_seen": 10073832, + "step": 17360 + }, + { + "epoch": 2.586386654751266, + "grad_norm": 0.0230712890625, + "learning_rate": 0.029921551510095346, + "loss": 0.8302, + "num_input_tokens_seen": 10076840, + "step": 17365 + }, + { + "epoch": 2.5871313672922254, + "grad_norm": 0.0296630859375, + "learning_rate": 0.029921352247776607, + "loss": 0.803, + "num_input_tokens_seen": 10079560, + "step": 17370 + }, + { + "epoch": 2.587876079833184, + "grad_norm": 0.0194091796875, + "learning_rate": 0.029921152733376793, + "loss": 0.8034, + "num_input_tokens_seen": 10082568, + "step": 17375 + }, + { + "epoch": 2.5886207923741438, + "grad_norm": 0.02490234375, + "learning_rate": 0.029920952966899263, + "loss": 0.8062, + "num_input_tokens_seen": 10085352, + "step": 17380 + }, + { + "epoch": 2.5893655049151025, + "grad_norm": 0.031982421875, + "learning_rate": 0.0299207529483474, + "loss": 0.8043, + "num_input_tokens_seen": 10088232, + "step": 17385 + }, + { + "epoch": 2.590110217456062, + "grad_norm": 0.02685546875, + "learning_rate": 0.02992055267772458, + "loss": 0.8291, + "num_input_tokens_seen": 10091272, + "step": 17390 + }, + { + "epoch": 2.590854929997021, + "grad_norm": 0.020263671875, + "learning_rate": 0.029920352155034188, + "loss": 0.8124, + "num_input_tokens_seen": 10094088, + "step": 17395 + }, + { + "epoch": 2.5915996425379806, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02992015138027961, + "loss": 0.8167, + "num_input_tokens_seen": 10096808, + "step": 17400 + }, + { + "epoch": 2.5923443550789393, + "grad_norm": 0.024658203125, + "learning_rate": 0.029919950353464238, + "loss": 0.7951, + "num_input_tokens_seen": 10099752, + "step": 17405 + }, + { + "epoch": 2.593089067619899, + "grad_norm": 0.0133056640625, + "learning_rate": 0.02991974907459147, + "loss": 0.8123, + "num_input_tokens_seen": 10102632, + "step": 17410 + }, + { + "epoch": 2.5938337801608577, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0299195475436647, + "loss": 0.8044, + "num_input_tokens_seen": 10105480, + "step": 17415 + }, + { + "epoch": 2.594578492701817, + "grad_norm": 0.0198974609375, + "learning_rate": 0.029919345760687343, + "loss": 0.784, + "num_input_tokens_seen": 10108616, + "step": 17420 + }, + { + "epoch": 2.595323205242776, + "grad_norm": 0.020263671875, + "learning_rate": 0.029919143725662806, + "loss": 0.7971, + "num_input_tokens_seen": 10111560, + "step": 17425 + }, + { + "epoch": 2.5960679177837354, + "grad_norm": 0.022705078125, + "learning_rate": 0.02991894143859449, + "loss": 0.8103, + "num_input_tokens_seen": 10114824, + "step": 17430 + }, + { + "epoch": 2.5968126303246946, + "grad_norm": 0.026611328125, + "learning_rate": 0.02991873889948583, + "loss": 0.8143, + "num_input_tokens_seen": 10117768, + "step": 17435 + }, + { + "epoch": 2.5975573428656538, + "grad_norm": 0.019775390625, + "learning_rate": 0.029918536108340235, + "loss": 0.8001, + "num_input_tokens_seen": 10121064, + "step": 17440 + }, + { + "epoch": 2.598302055406613, + "grad_norm": 0.0186767578125, + "learning_rate": 0.029918333065161132, + "loss": 0.7822, + "num_input_tokens_seen": 10123912, + "step": 17445 + }, + { + "epoch": 2.599046767947572, + "grad_norm": 0.0198974609375, + "learning_rate": 0.029918129769951958, + "loss": 0.7879, + "num_input_tokens_seen": 10126760, + "step": 17450 + }, + { + "epoch": 2.5997914804885314, + "grad_norm": 0.0299072265625, + "learning_rate": 0.029917926222716145, + "loss": 0.7852, + "num_input_tokens_seen": 10129512, + "step": 17455 + }, + { + "epoch": 2.6005361930294906, + "grad_norm": 0.01361083984375, + "learning_rate": 0.029917722423457128, + "loss": 0.8028, + "num_input_tokens_seen": 10132424, + "step": 17460 + }, + { + "epoch": 2.6012809055704498, + "grad_norm": 0.0224609375, + "learning_rate": 0.02991751837217835, + "loss": 0.814, + "num_input_tokens_seen": 10135304, + "step": 17465 + }, + { + "epoch": 2.602025618111409, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02991731406888326, + "loss": 0.8053, + "num_input_tokens_seen": 10137864, + "step": 17470 + }, + { + "epoch": 2.602770330652368, + "grad_norm": 0.01171875, + "learning_rate": 0.029917109513575315, + "loss": 0.7927, + "num_input_tokens_seen": 10140648, + "step": 17475 + }, + { + "epoch": 2.6035150431933274, + "grad_norm": 0.0203857421875, + "learning_rate": 0.029916904706257963, + "loss": 0.8163, + "num_input_tokens_seen": 10143464, + "step": 17480 + }, + { + "epoch": 2.6042597557342866, + "grad_norm": 0.01348876953125, + "learning_rate": 0.02991669964693467, + "loss": 0.8095, + "num_input_tokens_seen": 10146536, + "step": 17485 + }, + { + "epoch": 2.605004468275246, + "grad_norm": 0.0213623046875, + "learning_rate": 0.029916494335608897, + "loss": 0.7917, + "num_input_tokens_seen": 10149480, + "step": 17490 + }, + { + "epoch": 2.605749180816205, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02991628877228411, + "loss": 0.7902, + "num_input_tokens_seen": 10152456, + "step": 17495 + }, + { + "epoch": 2.606493893357164, + "grad_norm": 0.0223388671875, + "learning_rate": 0.029916082956963786, + "loss": 0.7972, + "num_input_tokens_seen": 10155304, + "step": 17500 + }, + { + "epoch": 2.6072386058981234, + "grad_norm": 0.01531982421875, + "learning_rate": 0.0299158768896514, + "loss": 0.8045, + "num_input_tokens_seen": 10158312, + "step": 17505 + }, + { + "epoch": 2.6079833184390826, + "grad_norm": 0.0208740234375, + "learning_rate": 0.029915670570350432, + "loss": 0.8167, + "num_input_tokens_seen": 10161128, + "step": 17510 + }, + { + "epoch": 2.608728030980042, + "grad_norm": 0.01373291015625, + "learning_rate": 0.029915463999064373, + "loss": 0.8047, + "num_input_tokens_seen": 10163880, + "step": 17515 + }, + { + "epoch": 2.609472743521001, + "grad_norm": 0.035888671875, + "learning_rate": 0.029915257175796708, + "loss": 0.7931, + "num_input_tokens_seen": 10166568, + "step": 17520 + }, + { + "epoch": 2.61021745606196, + "grad_norm": 0.01513671875, + "learning_rate": 0.02991505010055093, + "loss": 0.794, + "num_input_tokens_seen": 10169512, + "step": 17525 + }, + { + "epoch": 2.6109621686029194, + "grad_norm": 0.02099609375, + "learning_rate": 0.02991484277333054, + "loss": 0.7796, + "num_input_tokens_seen": 10172264, + "step": 17530 + }, + { + "epoch": 2.6117068811438786, + "grad_norm": 0.0244140625, + "learning_rate": 0.02991463519413904, + "loss": 0.8038, + "num_input_tokens_seen": 10175112, + "step": 17535 + }, + { + "epoch": 2.612451593684838, + "grad_norm": 0.021240234375, + "learning_rate": 0.029914427362979935, + "loss": 0.7948, + "num_input_tokens_seen": 10177928, + "step": 17540 + }, + { + "epoch": 2.613196306225797, + "grad_norm": 0.03076171875, + "learning_rate": 0.02991421927985674, + "loss": 0.7834, + "num_input_tokens_seen": 10180840, + "step": 17545 + }, + { + "epoch": 2.6139410187667558, + "grad_norm": 0.01312255859375, + "learning_rate": 0.02991401094477297, + "loss": 0.7958, + "num_input_tokens_seen": 10183624, + "step": 17550 + }, + { + "epoch": 2.6146857313077154, + "grad_norm": 0.015869140625, + "learning_rate": 0.029913802357732142, + "loss": 0.785, + "num_input_tokens_seen": 10186728, + "step": 17555 + }, + { + "epoch": 2.615430443848674, + "grad_norm": 0.020263671875, + "learning_rate": 0.02991359351873778, + "loss": 0.8288, + "num_input_tokens_seen": 10189512, + "step": 17560 + }, + { + "epoch": 2.616175156389634, + "grad_norm": 0.035400390625, + "learning_rate": 0.029913384427793416, + "loss": 0.8524, + "num_input_tokens_seen": 10192104, + "step": 17565 + }, + { + "epoch": 2.6169198689305926, + "grad_norm": 0.0194091796875, + "learning_rate": 0.029913175084902578, + "loss": 0.7943, + "num_input_tokens_seen": 10194568, + "step": 17570 + }, + { + "epoch": 2.6176645814715522, + "grad_norm": 0.0211181640625, + "learning_rate": 0.0299129654900688, + "loss": 0.8072, + "num_input_tokens_seen": 10197224, + "step": 17575 + }, + { + "epoch": 2.618409294012511, + "grad_norm": 0.022216796875, + "learning_rate": 0.02991275564329563, + "loss": 0.7918, + "num_input_tokens_seen": 10200168, + "step": 17580 + }, + { + "epoch": 2.6191540065534706, + "grad_norm": 0.041259765625, + "learning_rate": 0.029912545544586614, + "loss": 0.817, + "num_input_tokens_seen": 10202792, + "step": 17585 + }, + { + "epoch": 2.6198987190944294, + "grad_norm": 0.028564453125, + "learning_rate": 0.029912335193945296, + "loss": 0.8219, + "num_input_tokens_seen": 10205640, + "step": 17590 + }, + { + "epoch": 2.6206434316353886, + "grad_norm": 0.0244140625, + "learning_rate": 0.029912124591375225, + "loss": 0.805, + "num_input_tokens_seen": 10208200, + "step": 17595 + }, + { + "epoch": 2.621388144176348, + "grad_norm": 0.0284423828125, + "learning_rate": 0.02991191373687997, + "loss": 0.7984, + "num_input_tokens_seen": 10210920, + "step": 17600 + }, + { + "epoch": 2.622132856717307, + "grad_norm": 0.027587890625, + "learning_rate": 0.029911702630463086, + "loss": 0.8189, + "num_input_tokens_seen": 10213960, + "step": 17605 + }, + { + "epoch": 2.622877569258266, + "grad_norm": 0.01361083984375, + "learning_rate": 0.029911491272128148, + "loss": 0.8096, + "num_input_tokens_seen": 10216616, + "step": 17610 + }, + { + "epoch": 2.6236222817992254, + "grad_norm": 0.0128173828125, + "learning_rate": 0.02991127966187871, + "loss": 0.8124, + "num_input_tokens_seen": 10219336, + "step": 17615 + }, + { + "epoch": 2.6243669943401846, + "grad_norm": 0.0255126953125, + "learning_rate": 0.02991106779971836, + "loss": 0.8058, + "num_input_tokens_seen": 10222120, + "step": 17620 + }, + { + "epoch": 2.625111706881144, + "grad_norm": 0.021728515625, + "learning_rate": 0.029910855685650682, + "loss": 0.835, + "num_input_tokens_seen": 10224904, + "step": 17625 + }, + { + "epoch": 2.625856419422103, + "grad_norm": 0.01287841796875, + "learning_rate": 0.029910643319679242, + "loss": 0.7908, + "num_input_tokens_seen": 10227816, + "step": 17630 + }, + { + "epoch": 2.626601131963062, + "grad_norm": 0.020263671875, + "learning_rate": 0.02991043070180765, + "loss": 0.7938, + "num_input_tokens_seen": 10230568, + "step": 17635 + }, + { + "epoch": 2.6273458445040214, + "grad_norm": 0.021728515625, + "learning_rate": 0.029910217832039475, + "loss": 0.7795, + "num_input_tokens_seen": 10233960, + "step": 17640 + }, + { + "epoch": 2.6280905570449806, + "grad_norm": 0.01165771484375, + "learning_rate": 0.029910004710378323, + "loss": 0.8, + "num_input_tokens_seen": 10236776, + "step": 17645 + }, + { + "epoch": 2.62883526958594, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0299097913368278, + "loss": 0.7975, + "num_input_tokens_seen": 10240072, + "step": 17650 + }, + { + "epoch": 2.629579982126899, + "grad_norm": 0.01385498046875, + "learning_rate": 0.029909577711391508, + "loss": 0.806, + "num_input_tokens_seen": 10242920, + "step": 17655 + }, + { + "epoch": 2.6303246946678582, + "grad_norm": 0.023681640625, + "learning_rate": 0.029909363834073048, + "loss": 0.8191, + "num_input_tokens_seen": 10246408, + "step": 17660 + }, + { + "epoch": 2.6310694072088174, + "grad_norm": 0.020751953125, + "learning_rate": 0.029909149704876046, + "loss": 0.8059, + "num_input_tokens_seen": 10249256, + "step": 17665 + }, + { + "epoch": 2.6318141197497766, + "grad_norm": 0.0206298828125, + "learning_rate": 0.029908935323804106, + "loss": 0.812, + "num_input_tokens_seen": 10252264, + "step": 17670 + }, + { + "epoch": 2.632558832290736, + "grad_norm": 0.0263671875, + "learning_rate": 0.029908720690860858, + "loss": 0.7891, + "num_input_tokens_seen": 10255304, + "step": 17675 + }, + { + "epoch": 2.633303544831695, + "grad_norm": 0.01904296875, + "learning_rate": 0.02990850580604993, + "loss": 0.7974, + "num_input_tokens_seen": 10258120, + "step": 17680 + }, + { + "epoch": 2.6340482573726542, + "grad_norm": 0.0322265625, + "learning_rate": 0.029908290669374945, + "loss": 0.8008, + "num_input_tokens_seen": 10260872, + "step": 17685 + }, + { + "epoch": 2.6347929699136134, + "grad_norm": 0.02197265625, + "learning_rate": 0.029908075280839545, + "loss": 0.7959, + "num_input_tokens_seen": 10263752, + "step": 17690 + }, + { + "epoch": 2.6355376824545726, + "grad_norm": 0.02294921875, + "learning_rate": 0.02990785964044736, + "loss": 0.7935, + "num_input_tokens_seen": 10266824, + "step": 17695 + }, + { + "epoch": 2.636282394995532, + "grad_norm": 0.020263671875, + "learning_rate": 0.02990764374820204, + "loss": 0.8007, + "num_input_tokens_seen": 10269960, + "step": 17700 + }, + { + "epoch": 2.637027107536491, + "grad_norm": 0.028564453125, + "learning_rate": 0.029907427604107233, + "loss": 0.8042, + "num_input_tokens_seen": 10272968, + "step": 17705 + }, + { + "epoch": 2.6377718200774503, + "grad_norm": 0.02392578125, + "learning_rate": 0.029907211208166584, + "loss": 0.82, + "num_input_tokens_seen": 10276104, + "step": 17710 + }, + { + "epoch": 2.6385165326184095, + "grad_norm": 0.018798828125, + "learning_rate": 0.02990699456038376, + "loss": 0.8047, + "num_input_tokens_seen": 10278888, + "step": 17715 + }, + { + "epoch": 2.6392612451593687, + "grad_norm": 0.02197265625, + "learning_rate": 0.02990677766076241, + "loss": 0.8335, + "num_input_tokens_seen": 10281832, + "step": 17720 + }, + { + "epoch": 2.6400059577003274, + "grad_norm": 0.01953125, + "learning_rate": 0.029906560509306197, + "loss": 0.7961, + "num_input_tokens_seen": 10284680, + "step": 17725 + }, + { + "epoch": 2.640750670241287, + "grad_norm": 0.03369140625, + "learning_rate": 0.0299063431060188, + "loss": 0.8291, + "num_input_tokens_seen": 10287912, + "step": 17730 + }, + { + "epoch": 2.641495382782246, + "grad_norm": 0.0128173828125, + "learning_rate": 0.029906125450903882, + "loss": 0.8202, + "num_input_tokens_seen": 10290856, + "step": 17735 + }, + { + "epoch": 2.6422400953232055, + "grad_norm": 0.0130615234375, + "learning_rate": 0.02990590754396513, + "loss": 0.7891, + "num_input_tokens_seen": 10294024, + "step": 17740 + }, + { + "epoch": 2.6429848078641642, + "grad_norm": 0.014892578125, + "learning_rate": 0.029905689385206213, + "loss": 0.7976, + "num_input_tokens_seen": 10296904, + "step": 17745 + }, + { + "epoch": 2.643729520405124, + "grad_norm": 0.021240234375, + "learning_rate": 0.029905470974630824, + "loss": 0.7976, + "num_input_tokens_seen": 10299656, + "step": 17750 + }, + { + "epoch": 2.6444742329460826, + "grad_norm": 0.0234375, + "learning_rate": 0.02990525231224266, + "loss": 0.7788, + "num_input_tokens_seen": 10302568, + "step": 17755 + }, + { + "epoch": 2.645218945487042, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0299050333980454, + "loss": 0.8077, + "num_input_tokens_seen": 10305416, + "step": 17760 + }, + { + "epoch": 2.645963658028001, + "grad_norm": 0.02490234375, + "learning_rate": 0.02990481423204275, + "loss": 0.8301, + "num_input_tokens_seen": 10308360, + "step": 17765 + }, + { + "epoch": 2.6467083705689602, + "grad_norm": 0.0225830078125, + "learning_rate": 0.029904594814238412, + "loss": 0.7963, + "num_input_tokens_seen": 10311208, + "step": 17770 + }, + { + "epoch": 2.6474530831099194, + "grad_norm": 0.0155029296875, + "learning_rate": 0.0299043751446361, + "loss": 0.7939, + "num_input_tokens_seen": 10314280, + "step": 17775 + }, + { + "epoch": 2.6481977956508786, + "grad_norm": 0.0255126953125, + "learning_rate": 0.029904155223239512, + "loss": 0.8053, + "num_input_tokens_seen": 10317384, + "step": 17780 + }, + { + "epoch": 2.648942508191838, + "grad_norm": 0.0279541015625, + "learning_rate": 0.029903935050052368, + "loss": 0.7968, + "num_input_tokens_seen": 10320200, + "step": 17785 + }, + { + "epoch": 2.649687220732797, + "grad_norm": 0.01519775390625, + "learning_rate": 0.029903714625078393, + "loss": 0.7937, + "num_input_tokens_seen": 10323464, + "step": 17790 + }, + { + "epoch": 2.6504319332737563, + "grad_norm": 0.0269775390625, + "learning_rate": 0.029903493948321303, + "loss": 0.8237, + "num_input_tokens_seen": 10326408, + "step": 17795 + }, + { + "epoch": 2.6511766458147155, + "grad_norm": 0.02978515625, + "learning_rate": 0.029903273019784835, + "loss": 0.7951, + "num_input_tokens_seen": 10329384, + "step": 17800 + }, + { + "epoch": 2.6519213583556747, + "grad_norm": 0.025146484375, + "learning_rate": 0.02990305183947271, + "loss": 0.7916, + "num_input_tokens_seen": 10332520, + "step": 17805 + }, + { + "epoch": 2.652666070896634, + "grad_norm": 0.021728515625, + "learning_rate": 0.029902830407388677, + "loss": 0.8075, + "num_input_tokens_seen": 10335496, + "step": 17810 + }, + { + "epoch": 2.653410783437593, + "grad_norm": 0.044189453125, + "learning_rate": 0.029902608723536468, + "loss": 0.8075, + "num_input_tokens_seen": 10338376, + "step": 17815 + }, + { + "epoch": 2.6541554959785523, + "grad_norm": 0.03466796875, + "learning_rate": 0.02990238678791983, + "loss": 0.8073, + "num_input_tokens_seen": 10341288, + "step": 17820 + }, + { + "epoch": 2.6549002085195115, + "grad_norm": 0.02783203125, + "learning_rate": 0.029902164600542517, + "loss": 0.7993, + "num_input_tokens_seen": 10344552, + "step": 17825 + }, + { + "epoch": 2.6556449210604707, + "grad_norm": 0.036376953125, + "learning_rate": 0.029901942161408273, + "loss": 0.8086, + "num_input_tokens_seen": 10347528, + "step": 17830 + }, + { + "epoch": 2.65638963360143, + "grad_norm": 0.0250244140625, + "learning_rate": 0.029901719470520866, + "loss": 0.7914, + "num_input_tokens_seen": 10350344, + "step": 17835 + }, + { + "epoch": 2.657134346142389, + "grad_norm": 0.0235595703125, + "learning_rate": 0.029901496527884056, + "loss": 0.7988, + "num_input_tokens_seen": 10353256, + "step": 17840 + }, + { + "epoch": 2.6578790586833483, + "grad_norm": 0.0208740234375, + "learning_rate": 0.029901273333501604, + "loss": 0.8074, + "num_input_tokens_seen": 10356040, + "step": 17845 + }, + { + "epoch": 2.6586237712243075, + "grad_norm": 0.026611328125, + "learning_rate": 0.029901049887377285, + "loss": 0.7911, + "num_input_tokens_seen": 10359144, + "step": 17850 + }, + { + "epoch": 2.6593684837652667, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02990082618951487, + "loss": 0.8136, + "num_input_tokens_seen": 10362440, + "step": 17855 + }, + { + "epoch": 2.660113196306226, + "grad_norm": 0.021240234375, + "learning_rate": 0.029900602239918146, + "loss": 0.7835, + "num_input_tokens_seen": 10365000, + "step": 17860 + }, + { + "epoch": 2.660857908847185, + "grad_norm": 0.01220703125, + "learning_rate": 0.02990037803859089, + "loss": 0.7956, + "num_input_tokens_seen": 10368040, + "step": 17865 + }, + { + "epoch": 2.6616026213881443, + "grad_norm": 0.020263671875, + "learning_rate": 0.02990015358553689, + "loss": 0.817, + "num_input_tokens_seen": 10370792, + "step": 17870 + }, + { + "epoch": 2.6623473339291035, + "grad_norm": 0.021240234375, + "learning_rate": 0.029899928880759936, + "loss": 0.8067, + "num_input_tokens_seen": 10373352, + "step": 17875 + }, + { + "epoch": 2.6630920464700627, + "grad_norm": 0.0233154296875, + "learning_rate": 0.029899703924263832, + "loss": 0.7934, + "num_input_tokens_seen": 10376296, + "step": 17880 + }, + { + "epoch": 2.663836759011022, + "grad_norm": 0.0257568359375, + "learning_rate": 0.02989947871605237, + "loss": 0.7989, + "num_input_tokens_seen": 10378984, + "step": 17885 + }, + { + "epoch": 2.6645814715519807, + "grad_norm": 0.02099609375, + "learning_rate": 0.02989925325612936, + "loss": 0.8134, + "num_input_tokens_seen": 10382120, + "step": 17890 + }, + { + "epoch": 2.6653261840929403, + "grad_norm": 0.0228271484375, + "learning_rate": 0.02989902754449861, + "loss": 0.8159, + "num_input_tokens_seen": 10384872, + "step": 17895 + }, + { + "epoch": 2.666070896633899, + "grad_norm": 0.01263427734375, + "learning_rate": 0.029898801581163932, + "loss": 0.8326, + "num_input_tokens_seen": 10387560, + "step": 17900 + }, + { + "epoch": 2.6668156091748587, + "grad_norm": 0.0208740234375, + "learning_rate": 0.029898575366129145, + "loss": 0.7918, + "num_input_tokens_seen": 10390440, + "step": 17905 + }, + { + "epoch": 2.6675603217158175, + "grad_norm": 0.025390625, + "learning_rate": 0.029898348899398065, + "loss": 0.796, + "num_input_tokens_seen": 10393672, + "step": 17910 + }, + { + "epoch": 2.668305034256777, + "grad_norm": 0.0194091796875, + "learning_rate": 0.029898122180974526, + "loss": 0.7971, + "num_input_tokens_seen": 10396424, + "step": 17915 + }, + { + "epoch": 2.669049746797736, + "grad_norm": 0.01953125, + "learning_rate": 0.029897895210862356, + "loss": 0.7772, + "num_input_tokens_seen": 10399368, + "step": 17920 + }, + { + "epoch": 2.6697944593386955, + "grad_norm": 0.021240234375, + "learning_rate": 0.029897667989065384, + "loss": 0.8152, + "num_input_tokens_seen": 10402120, + "step": 17925 + }, + { + "epoch": 2.6705391718796543, + "grad_norm": 0.0240478515625, + "learning_rate": 0.029897440515587455, + "loss": 0.8391, + "num_input_tokens_seen": 10404712, + "step": 17930 + }, + { + "epoch": 2.6712838844206135, + "grad_norm": 0.01239013671875, + "learning_rate": 0.029897212790432406, + "loss": 0.83, + "num_input_tokens_seen": 10407656, + "step": 17935 + }, + { + "epoch": 2.6720285969615727, + "grad_norm": 0.022216796875, + "learning_rate": 0.029896984813604092, + "loss": 0.8044, + "num_input_tokens_seen": 10410472, + "step": 17940 + }, + { + "epoch": 2.672773309502532, + "grad_norm": 0.01953125, + "learning_rate": 0.029896756585106363, + "loss": 0.8108, + "num_input_tokens_seen": 10413352, + "step": 17945 + }, + { + "epoch": 2.673518022043491, + "grad_norm": 0.0264892578125, + "learning_rate": 0.029896528104943067, + "loss": 0.7968, + "num_input_tokens_seen": 10416456, + "step": 17950 + }, + { + "epoch": 2.6742627345844503, + "grad_norm": 0.01226806640625, + "learning_rate": 0.029896299373118065, + "loss": 0.7994, + "num_input_tokens_seen": 10419240, + "step": 17955 + }, + { + "epoch": 2.6750074471254095, + "grad_norm": 0.01312255859375, + "learning_rate": 0.02989607038963523, + "loss": 0.7955, + "num_input_tokens_seen": 10422696, + "step": 17960 + }, + { + "epoch": 2.6757521596663687, + "grad_norm": 0.031494140625, + "learning_rate": 0.029895841154498424, + "loss": 0.8106, + "num_input_tokens_seen": 10425672, + "step": 17965 + }, + { + "epoch": 2.676496872207328, + "grad_norm": 0.0225830078125, + "learning_rate": 0.029895611667711524, + "loss": 0.7987, + "num_input_tokens_seen": 10428744, + "step": 17970 + }, + { + "epoch": 2.677241584748287, + "grad_norm": 0.01312255859375, + "learning_rate": 0.0298953819292784, + "loss": 0.801, + "num_input_tokens_seen": 10431816, + "step": 17975 + }, + { + "epoch": 2.6779862972892463, + "grad_norm": 0.0220947265625, + "learning_rate": 0.029895151939202943, + "loss": 0.8057, + "num_input_tokens_seen": 10434696, + "step": 17980 + }, + { + "epoch": 2.6787310098302055, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02989492169748903, + "loss": 0.7969, + "num_input_tokens_seen": 10437640, + "step": 17985 + }, + { + "epoch": 2.6794757223711647, + "grad_norm": 0.026123046875, + "learning_rate": 0.029894691204140553, + "loss": 0.7911, + "num_input_tokens_seen": 10440392, + "step": 17990 + }, + { + "epoch": 2.680220434912124, + "grad_norm": 0.0191650390625, + "learning_rate": 0.029894460459161405, + "loss": 0.799, + "num_input_tokens_seen": 10443368, + "step": 17995 + }, + { + "epoch": 2.680965147453083, + "grad_norm": 0.0311279296875, + "learning_rate": 0.02989422946255549, + "loss": 0.8204, + "num_input_tokens_seen": 10446216, + "step": 18000 + }, + { + "epoch": 2.6817098599940423, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0298939982143267, + "loss": 0.8216, + "num_input_tokens_seen": 10448936, + "step": 18005 + }, + { + "epoch": 2.6824545725350015, + "grad_norm": 0.0130615234375, + "learning_rate": 0.02989376671447895, + "loss": 0.8164, + "num_input_tokens_seen": 10451880, + "step": 18010 + }, + { + "epoch": 2.6831992850759607, + "grad_norm": 0.0247802734375, + "learning_rate": 0.02989353496301615, + "loss": 0.7926, + "num_input_tokens_seen": 10454792, + "step": 18015 + }, + { + "epoch": 2.68394399761692, + "grad_norm": 0.021728515625, + "learning_rate": 0.02989330295994221, + "loss": 0.7914, + "num_input_tokens_seen": 10457480, + "step": 18020 + }, + { + "epoch": 2.684688710157879, + "grad_norm": 0.02490234375, + "learning_rate": 0.029893070705261055, + "loss": 0.8005, + "num_input_tokens_seen": 10460360, + "step": 18025 + }, + { + "epoch": 2.6854334226988383, + "grad_norm": 0.022705078125, + "learning_rate": 0.02989283819897661, + "loss": 0.7992, + "num_input_tokens_seen": 10462920, + "step": 18030 + }, + { + "epoch": 2.6861781352397975, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0298926054410928, + "loss": 0.8065, + "num_input_tokens_seen": 10465416, + "step": 18035 + }, + { + "epoch": 2.6869228477807567, + "grad_norm": 0.0223388671875, + "learning_rate": 0.029892372431613558, + "loss": 0.7889, + "num_input_tokens_seen": 10468136, + "step": 18040 + }, + { + "epoch": 2.687667560321716, + "grad_norm": 0.01324462890625, + "learning_rate": 0.029892139170542814, + "loss": 0.8284, + "num_input_tokens_seen": 10470952, + "step": 18045 + }, + { + "epoch": 2.688412272862675, + "grad_norm": 0.0289306640625, + "learning_rate": 0.029891905657884518, + "loss": 0.7876, + "num_input_tokens_seen": 10473832, + "step": 18050 + }, + { + "epoch": 2.6891569854036343, + "grad_norm": 0.0216064453125, + "learning_rate": 0.029891671893642613, + "loss": 0.8104, + "num_input_tokens_seen": 10476744, + "step": 18055 + }, + { + "epoch": 2.6899016979445936, + "grad_norm": 0.0303955078125, + "learning_rate": 0.029891437877821045, + "loss": 0.7988, + "num_input_tokens_seen": 10479432, + "step": 18060 + }, + { + "epoch": 2.6906464104855523, + "grad_norm": 0.0205078125, + "learning_rate": 0.02989120361042377, + "loss": 0.8023, + "num_input_tokens_seen": 10482152, + "step": 18065 + }, + { + "epoch": 2.691391123026512, + "grad_norm": 0.01251220703125, + "learning_rate": 0.02989096909145474, + "loss": 0.8282, + "num_input_tokens_seen": 10485128, + "step": 18070 + }, + { + "epoch": 2.6921358355674707, + "grad_norm": 0.01275634765625, + "learning_rate": 0.02989073432091793, + "loss": 0.8022, + "num_input_tokens_seen": 10488296, + "step": 18075 + }, + { + "epoch": 2.6928805481084304, + "grad_norm": 0.0186767578125, + "learning_rate": 0.029890499298817288, + "loss": 0.8, + "num_input_tokens_seen": 10490984, + "step": 18080 + }, + { + "epoch": 2.693625260649389, + "grad_norm": 0.020263671875, + "learning_rate": 0.029890264025156803, + "loss": 0.8079, + "num_input_tokens_seen": 10493832, + "step": 18085 + }, + { + "epoch": 2.6943699731903488, + "grad_norm": 0.01251220703125, + "learning_rate": 0.02989002849994044, + "loss": 0.7956, + "num_input_tokens_seen": 10496616, + "step": 18090 + }, + { + "epoch": 2.6951146857313075, + "grad_norm": 0.0198974609375, + "learning_rate": 0.029889792723172177, + "loss": 0.7837, + "num_input_tokens_seen": 10499400, + "step": 18095 + }, + { + "epoch": 2.695859398272267, + "grad_norm": 0.0189208984375, + "learning_rate": 0.029889556694855997, + "loss": 0.7968, + "num_input_tokens_seen": 10502248, + "step": 18100 + }, + { + "epoch": 2.696604110813226, + "grad_norm": 0.0203857421875, + "learning_rate": 0.029889320414995892, + "loss": 0.7827, + "num_input_tokens_seen": 10504936, + "step": 18105 + }, + { + "epoch": 2.697348823354185, + "grad_norm": 0.026611328125, + "learning_rate": 0.02988908388359585, + "loss": 0.8388, + "num_input_tokens_seen": 10507944, + "step": 18110 + }, + { + "epoch": 2.6980935358951443, + "grad_norm": 0.020263671875, + "learning_rate": 0.02988884710065987, + "loss": 0.8176, + "num_input_tokens_seen": 10510696, + "step": 18115 + }, + { + "epoch": 2.6988382484361035, + "grad_norm": 0.0225830078125, + "learning_rate": 0.029888610066191947, + "loss": 0.7858, + "num_input_tokens_seen": 10513576, + "step": 18120 + }, + { + "epoch": 2.6995829609770627, + "grad_norm": 0.0216064453125, + "learning_rate": 0.029888372780196095, + "loss": 0.8063, + "num_input_tokens_seen": 10516520, + "step": 18125 + }, + { + "epoch": 2.700327673518022, + "grad_norm": 0.01470947265625, + "learning_rate": 0.029888135242676316, + "loss": 0.7998, + "num_input_tokens_seen": 10519400, + "step": 18130 + }, + { + "epoch": 2.701072386058981, + "grad_norm": 0.021240234375, + "learning_rate": 0.02988789745363662, + "loss": 0.8061, + "num_input_tokens_seen": 10522696, + "step": 18135 + }, + { + "epoch": 2.7018170985999403, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02988765941308103, + "loss": 0.8013, + "num_input_tokens_seen": 10525736, + "step": 18140 + }, + { + "epoch": 2.7025618111408996, + "grad_norm": 0.029052734375, + "learning_rate": 0.029887421121013565, + "loss": 0.81, + "num_input_tokens_seen": 10528808, + "step": 18145 + }, + { + "epoch": 2.7033065236818588, + "grad_norm": 0.02587890625, + "learning_rate": 0.02988718257743825, + "loss": 0.7743, + "num_input_tokens_seen": 10531880, + "step": 18150 + }, + { + "epoch": 2.704051236222818, + "grad_norm": 0.03857421875, + "learning_rate": 0.029886943782359114, + "loss": 0.8134, + "num_input_tokens_seen": 10534696, + "step": 18155 + }, + { + "epoch": 2.704795948763777, + "grad_norm": 0.0213623046875, + "learning_rate": 0.029886704735780197, + "loss": 0.8315, + "num_input_tokens_seen": 10537608, + "step": 18160 + }, + { + "epoch": 2.7055406613047364, + "grad_norm": 0.020751953125, + "learning_rate": 0.029886465437705536, + "loss": 0.7939, + "num_input_tokens_seen": 10540040, + "step": 18165 + }, + { + "epoch": 2.7062853738456956, + "grad_norm": 0.022216796875, + "learning_rate": 0.029886225888139165, + "loss": 0.7995, + "num_input_tokens_seen": 10542632, + "step": 18170 + }, + { + "epoch": 2.7070300863866548, + "grad_norm": 0.01275634765625, + "learning_rate": 0.029885986087085144, + "loss": 0.7958, + "num_input_tokens_seen": 10545512, + "step": 18175 + }, + { + "epoch": 2.707774798927614, + "grad_norm": 0.021728515625, + "learning_rate": 0.029885746034547512, + "loss": 0.7994, + "num_input_tokens_seen": 10548200, + "step": 18180 + }, + { + "epoch": 2.708519511468573, + "grad_norm": 0.0123291015625, + "learning_rate": 0.02988550573053033, + "loss": 0.8051, + "num_input_tokens_seen": 10551016, + "step": 18185 + }, + { + "epoch": 2.7092642240095324, + "grad_norm": 0.0189208984375, + "learning_rate": 0.029885265175037663, + "loss": 0.8061, + "num_input_tokens_seen": 10553416, + "step": 18190 + }, + { + "epoch": 2.7100089365504916, + "grad_norm": 0.015380859375, + "learning_rate": 0.029885024368073564, + "loss": 0.8079, + "num_input_tokens_seen": 10556456, + "step": 18195 + }, + { + "epoch": 2.710753649091451, + "grad_norm": 0.01513671875, + "learning_rate": 0.029884783309642112, + "loss": 0.815, + "num_input_tokens_seen": 10559528, + "step": 18200 + }, + { + "epoch": 2.71149836163241, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02988454199974737, + "loss": 0.7941, + "num_input_tokens_seen": 10562664, + "step": 18205 + }, + { + "epoch": 2.712243074173369, + "grad_norm": 0.0264892578125, + "learning_rate": 0.02988430043839342, + "loss": 0.8025, + "num_input_tokens_seen": 10565576, + "step": 18210 + }, + { + "epoch": 2.7129877867143284, + "grad_norm": 0.0303955078125, + "learning_rate": 0.029884058625584343, + "loss": 0.7844, + "num_input_tokens_seen": 10568360, + "step": 18215 + }, + { + "epoch": 2.7137324992552876, + "grad_norm": 0.0272216796875, + "learning_rate": 0.029883816561324224, + "loss": 0.8247, + "num_input_tokens_seen": 10571432, + "step": 18220 + }, + { + "epoch": 2.714477211796247, + "grad_norm": 0.0205078125, + "learning_rate": 0.029883574245617148, + "loss": 0.7998, + "num_input_tokens_seen": 10574600, + "step": 18225 + }, + { + "epoch": 2.715221924337206, + "grad_norm": 0.025634765625, + "learning_rate": 0.02988333167846721, + "loss": 0.8152, + "num_input_tokens_seen": 10577672, + "step": 18230 + }, + { + "epoch": 2.715966636878165, + "grad_norm": 0.0311279296875, + "learning_rate": 0.029883088859878514, + "loss": 0.7996, + "num_input_tokens_seen": 10580392, + "step": 18235 + }, + { + "epoch": 2.716711349419124, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02988284578985516, + "loss": 0.7824, + "num_input_tokens_seen": 10583304, + "step": 18240 + }, + { + "epoch": 2.7174560619600836, + "grad_norm": 0.0220947265625, + "learning_rate": 0.029882602468401246, + "loss": 0.8068, + "num_input_tokens_seen": 10585960, + "step": 18245 + }, + { + "epoch": 2.7182007745010424, + "grad_norm": 0.0205078125, + "learning_rate": 0.029882358895520895, + "loss": 0.8159, + "num_input_tokens_seen": 10588648, + "step": 18250 + }, + { + "epoch": 2.718945487042002, + "grad_norm": 0.0250244140625, + "learning_rate": 0.029882115071218216, + "loss": 0.7936, + "num_input_tokens_seen": 10591560, + "step": 18255 + }, + { + "epoch": 2.7196901995829608, + "grad_norm": 0.0218505859375, + "learning_rate": 0.029881870995497325, + "loss": 0.792, + "num_input_tokens_seen": 10594248, + "step": 18260 + }, + { + "epoch": 2.7204349121239204, + "grad_norm": 0.0203857421875, + "learning_rate": 0.029881626668362353, + "loss": 0.8112, + "num_input_tokens_seen": 10597064, + "step": 18265 + }, + { + "epoch": 2.721179624664879, + "grad_norm": 0.022216796875, + "learning_rate": 0.029881382089817422, + "loss": 0.8005, + "num_input_tokens_seen": 10599944, + "step": 18270 + }, + { + "epoch": 2.721924337205839, + "grad_norm": 0.0118408203125, + "learning_rate": 0.029881137259866665, + "loss": 0.8142, + "num_input_tokens_seen": 10602760, + "step": 18275 + }, + { + "epoch": 2.7226690497467976, + "grad_norm": 0.0281982421875, + "learning_rate": 0.02988089217851422, + "loss": 0.8028, + "num_input_tokens_seen": 10606184, + "step": 18280 + }, + { + "epoch": 2.723413762287757, + "grad_norm": 0.033203125, + "learning_rate": 0.02988064684576422, + "loss": 0.8041, + "num_input_tokens_seen": 10609064, + "step": 18285 + }, + { + "epoch": 2.724158474828716, + "grad_norm": 0.0216064453125, + "learning_rate": 0.029880401261620823, + "loss": 0.8094, + "num_input_tokens_seen": 10611912, + "step": 18290 + }, + { + "epoch": 2.724903187369675, + "grad_norm": 0.025634765625, + "learning_rate": 0.029880155426088167, + "loss": 0.7962, + "num_input_tokens_seen": 10614984, + "step": 18295 + }, + { + "epoch": 2.7256478999106344, + "grad_norm": 0.0255126953125, + "learning_rate": 0.02987990933917041, + "loss": 0.7907, + "num_input_tokens_seen": 10617640, + "step": 18300 + }, + { + "epoch": 2.7263926124515936, + "grad_norm": 0.02783203125, + "learning_rate": 0.029879663000871703, + "loss": 0.8159, + "num_input_tokens_seen": 10620680, + "step": 18305 + }, + { + "epoch": 2.727137324992553, + "grad_norm": 0.01220703125, + "learning_rate": 0.02987941641119622, + "loss": 0.8199, + "num_input_tokens_seen": 10623464, + "step": 18310 + }, + { + "epoch": 2.727882037533512, + "grad_norm": 0.01397705078125, + "learning_rate": 0.029879169570148113, + "loss": 0.7892, + "num_input_tokens_seen": 10626216, + "step": 18315 + }, + { + "epoch": 2.728626750074471, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02987892247773156, + "loss": 0.8119, + "num_input_tokens_seen": 10629512, + "step": 18320 + }, + { + "epoch": 2.7293714626154304, + "grad_norm": 0.02099609375, + "learning_rate": 0.029878675133950736, + "loss": 0.8118, + "num_input_tokens_seen": 10632264, + "step": 18325 + }, + { + "epoch": 2.7301161751563896, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02987842753880982, + "loss": 0.7845, + "num_input_tokens_seen": 10635304, + "step": 18330 + }, + { + "epoch": 2.730860887697349, + "grad_norm": 0.021484375, + "learning_rate": 0.02987817969231299, + "loss": 0.82, + "num_input_tokens_seen": 10637960, + "step": 18335 + }, + { + "epoch": 2.731605600238308, + "grad_norm": 0.0205078125, + "learning_rate": 0.02987793159446443, + "loss": 0.779, + "num_input_tokens_seen": 10640680, + "step": 18340 + }, + { + "epoch": 2.732350312779267, + "grad_norm": 0.0142822265625, + "learning_rate": 0.029877683245268342, + "loss": 0.792, + "num_input_tokens_seen": 10643496, + "step": 18345 + }, + { + "epoch": 2.7330950253202264, + "grad_norm": 0.021484375, + "learning_rate": 0.02987743464472892, + "loss": 0.7938, + "num_input_tokens_seen": 10646408, + "step": 18350 + }, + { + "epoch": 2.7338397378611856, + "grad_norm": 0.01220703125, + "learning_rate": 0.029877185792850355, + "loss": 0.787, + "num_input_tokens_seen": 10649064, + "step": 18355 + }, + { + "epoch": 2.734584450402145, + "grad_norm": 0.012451171875, + "learning_rate": 0.02987693668963686, + "loss": 0.8132, + "num_input_tokens_seen": 10651752, + "step": 18360 + }, + { + "epoch": 2.735329162943104, + "grad_norm": 0.031494140625, + "learning_rate": 0.029876687335092637, + "loss": 0.8022, + "num_input_tokens_seen": 10654376, + "step": 18365 + }, + { + "epoch": 2.7360738754840632, + "grad_norm": 0.021728515625, + "learning_rate": 0.0298764377292219, + "loss": 0.7928, + "num_input_tokens_seen": 10657448, + "step": 18370 + }, + { + "epoch": 2.7368185880250224, + "grad_norm": 0.0238037109375, + "learning_rate": 0.029876187872028875, + "loss": 0.7861, + "num_input_tokens_seen": 10660264, + "step": 18375 + }, + { + "epoch": 2.7375633005659816, + "grad_norm": 0.021240234375, + "learning_rate": 0.02987593776351777, + "loss": 0.8203, + "num_input_tokens_seen": 10663336, + "step": 18380 + }, + { + "epoch": 2.738308013106941, + "grad_norm": 0.0306396484375, + "learning_rate": 0.029875687403692817, + "loss": 0.8006, + "num_input_tokens_seen": 10666216, + "step": 18385 + }, + { + "epoch": 2.7390527256479, + "grad_norm": 0.015625, + "learning_rate": 0.029875436792558243, + "loss": 0.8012, + "num_input_tokens_seen": 10669384, + "step": 18390 + }, + { + "epoch": 2.7397974381888592, + "grad_norm": 0.0218505859375, + "learning_rate": 0.029875185930118283, + "loss": 0.7641, + "num_input_tokens_seen": 10672232, + "step": 18395 + }, + { + "epoch": 2.7405421507298184, + "grad_norm": 0.025146484375, + "learning_rate": 0.029874934816377176, + "loss": 0.8143, + "num_input_tokens_seen": 10675112, + "step": 18400 + }, + { + "epoch": 2.7412868632707776, + "grad_norm": 0.02783203125, + "learning_rate": 0.029874683451339163, + "loss": 0.8055, + "num_input_tokens_seen": 10678088, + "step": 18405 + }, + { + "epoch": 2.742031575811737, + "grad_norm": 0.0238037109375, + "learning_rate": 0.029874431835008493, + "loss": 0.8322, + "num_input_tokens_seen": 10681256, + "step": 18410 + }, + { + "epoch": 2.7427762883526956, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02987417996738941, + "loss": 0.7972, + "num_input_tokens_seen": 10684008, + "step": 18415 + }, + { + "epoch": 2.7435210008936552, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02987392784848618, + "loss": 0.8029, + "num_input_tokens_seen": 10686728, + "step": 18420 + }, + { + "epoch": 2.744265713434614, + "grad_norm": 0.025390625, + "learning_rate": 0.029873675478303052, + "loss": 0.8098, + "num_input_tokens_seen": 10689384, + "step": 18425 + }, + { + "epoch": 2.7450104259755737, + "grad_norm": 0.0224609375, + "learning_rate": 0.029873422856844297, + "loss": 0.794, + "num_input_tokens_seen": 10692744, + "step": 18430 + }, + { + "epoch": 2.7457551385165324, + "grad_norm": 0.0284423828125, + "learning_rate": 0.029873169984114177, + "loss": 0.7695, + "num_input_tokens_seen": 10695656, + "step": 18435 + }, + { + "epoch": 2.746499851057492, + "grad_norm": 0.036376953125, + "learning_rate": 0.029872916860116967, + "loss": 0.8021, + "num_input_tokens_seen": 10698472, + "step": 18440 + }, + { + "epoch": 2.747244563598451, + "grad_norm": 0.0322265625, + "learning_rate": 0.02987266348485694, + "loss": 0.8181, + "num_input_tokens_seen": 10701640, + "step": 18445 + }, + { + "epoch": 2.7479892761394105, + "grad_norm": 0.03662109375, + "learning_rate": 0.02987240985833839, + "loss": 0.8082, + "num_input_tokens_seen": 10704392, + "step": 18450 + }, + { + "epoch": 2.7487339886803692, + "grad_norm": 0.031982421875, + "learning_rate": 0.029872155980565578, + "loss": 0.7971, + "num_input_tokens_seen": 10707080, + "step": 18455 + }, + { + "epoch": 2.7494787012213284, + "grad_norm": 0.02587890625, + "learning_rate": 0.02987190185154281, + "loss": 0.7878, + "num_input_tokens_seen": 10709928, + "step": 18460 + }, + { + "epoch": 2.7502234137622876, + "grad_norm": 0.0380859375, + "learning_rate": 0.02987164747127438, + "loss": 0.8221, + "num_input_tokens_seen": 10713128, + "step": 18465 + }, + { + "epoch": 2.750968126303247, + "grad_norm": 0.026611328125, + "learning_rate": 0.029871392839764577, + "loss": 0.7872, + "num_input_tokens_seen": 10716168, + "step": 18470 + }, + { + "epoch": 2.751712838844206, + "grad_norm": 0.01513671875, + "learning_rate": 0.02987113795701771, + "loss": 0.8228, + "num_input_tokens_seen": 10718888, + "step": 18475 + }, + { + "epoch": 2.7524575513851652, + "grad_norm": 0.0225830078125, + "learning_rate": 0.029870882823038077, + "loss": 0.812, + "num_input_tokens_seen": 10722056, + "step": 18480 + }, + { + "epoch": 2.7532022639261244, + "grad_norm": 0.0208740234375, + "learning_rate": 0.02987062743782999, + "loss": 0.7939, + "num_input_tokens_seen": 10725064, + "step": 18485 + }, + { + "epoch": 2.7539469764670836, + "grad_norm": 0.01324462890625, + "learning_rate": 0.029870371801397774, + "loss": 0.7939, + "num_input_tokens_seen": 10728008, + "step": 18490 + }, + { + "epoch": 2.754691689008043, + "grad_norm": 0.01318359375, + "learning_rate": 0.029870115913745734, + "loss": 0.8147, + "num_input_tokens_seen": 10730824, + "step": 18495 + }, + { + "epoch": 2.755436401549002, + "grad_norm": 0.0125732421875, + "learning_rate": 0.029869859774878205, + "loss": 0.8098, + "num_input_tokens_seen": 10733640, + "step": 18500 + }, + { + "epoch": 2.7561811140899612, + "grad_norm": 0.023193359375, + "learning_rate": 0.029869603384799506, + "loss": 0.8169, + "num_input_tokens_seen": 10736360, + "step": 18505 + }, + { + "epoch": 2.7569258266309205, + "grad_norm": 0.0235595703125, + "learning_rate": 0.029869346743513967, + "loss": 0.8096, + "num_input_tokens_seen": 10739272, + "step": 18510 + }, + { + "epoch": 2.7576705391718797, + "grad_norm": 0.035400390625, + "learning_rate": 0.02986908985102593, + "loss": 0.8088, + "num_input_tokens_seen": 10742184, + "step": 18515 + }, + { + "epoch": 2.758415251712839, + "grad_norm": 0.012939453125, + "learning_rate": 0.029868832707339737, + "loss": 0.8009, + "num_input_tokens_seen": 10745000, + "step": 18520 + }, + { + "epoch": 2.759159964253798, + "grad_norm": 0.0245361328125, + "learning_rate": 0.02986857531245972, + "loss": 0.7867, + "num_input_tokens_seen": 10747752, + "step": 18525 + }, + { + "epoch": 2.7599046767947573, + "grad_norm": 0.0235595703125, + "learning_rate": 0.029868317666390237, + "loss": 0.8029, + "num_input_tokens_seen": 10750792, + "step": 18530 + }, + { + "epoch": 2.7606493893357165, + "grad_norm": 0.0203857421875, + "learning_rate": 0.029868059769135636, + "loss": 0.7989, + "num_input_tokens_seen": 10753832, + "step": 18535 + }, + { + "epoch": 2.7613941018766757, + "grad_norm": 0.0255126953125, + "learning_rate": 0.02986780162070028, + "loss": 0.8223, + "num_input_tokens_seen": 10756584, + "step": 18540 + }, + { + "epoch": 2.762138814417635, + "grad_norm": 0.0235595703125, + "learning_rate": 0.029867543221088526, + "loss": 0.8122, + "num_input_tokens_seen": 10759432, + "step": 18545 + }, + { + "epoch": 2.762883526958594, + "grad_norm": 0.023193359375, + "learning_rate": 0.029867284570304742, + "loss": 0.817, + "num_input_tokens_seen": 10762472, + "step": 18550 + }, + { + "epoch": 2.7636282394995533, + "grad_norm": 0.024658203125, + "learning_rate": 0.029867025668353295, + "loss": 0.8062, + "num_input_tokens_seen": 10765256, + "step": 18555 + }, + { + "epoch": 2.7643729520405125, + "grad_norm": 0.02392578125, + "learning_rate": 0.029866766515238556, + "loss": 0.8115, + "num_input_tokens_seen": 10767880, + "step": 18560 + }, + { + "epoch": 2.7651176645814717, + "grad_norm": 0.0174560546875, + "learning_rate": 0.029866507110964908, + "loss": 0.8183, + "num_input_tokens_seen": 10770664, + "step": 18565 + }, + { + "epoch": 2.765862377122431, + "grad_norm": 0.03564453125, + "learning_rate": 0.029866247455536732, + "loss": 0.81, + "num_input_tokens_seen": 10773352, + "step": 18570 + }, + { + "epoch": 2.76660708966339, + "grad_norm": 0.0262451171875, + "learning_rate": 0.029865987548958417, + "loss": 0.804, + "num_input_tokens_seen": 10776360, + "step": 18575 + }, + { + "epoch": 2.7673518022043493, + "grad_norm": 0.033447265625, + "learning_rate": 0.02986572739123435, + "loss": 0.7939, + "num_input_tokens_seen": 10779336, + "step": 18580 + }, + { + "epoch": 2.7680965147453085, + "grad_norm": 0.031005859375, + "learning_rate": 0.02986546698236893, + "loss": 0.7854, + "num_input_tokens_seen": 10782152, + "step": 18585 + }, + { + "epoch": 2.7688412272862672, + "grad_norm": 0.0264892578125, + "learning_rate": 0.029865206322366545, + "loss": 0.7888, + "num_input_tokens_seen": 10785096, + "step": 18590 + }, + { + "epoch": 2.769585939827227, + "grad_norm": 0.034912109375, + "learning_rate": 0.029864945411231615, + "loss": 0.7897, + "num_input_tokens_seen": 10787944, + "step": 18595 + }, + { + "epoch": 2.7703306523681857, + "grad_norm": 0.03759765625, + "learning_rate": 0.02986468424896854, + "loss": 0.8163, + "num_input_tokens_seen": 10791176, + "step": 18600 + }, + { + "epoch": 2.7710753649091453, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02986442283558173, + "loss": 0.8127, + "num_input_tokens_seen": 10793864, + "step": 18605 + }, + { + "epoch": 2.771820077450104, + "grad_norm": 0.0272216796875, + "learning_rate": 0.029864161171075605, + "loss": 0.816, + "num_input_tokens_seen": 10796328, + "step": 18610 + }, + { + "epoch": 2.7725647899910637, + "grad_norm": 0.02783203125, + "learning_rate": 0.029863899255454584, + "loss": 0.803, + "num_input_tokens_seen": 10798984, + "step": 18615 + }, + { + "epoch": 2.7733095025320225, + "grad_norm": 0.0257568359375, + "learning_rate": 0.029863637088723093, + "loss": 0.8046, + "num_input_tokens_seen": 10801768, + "step": 18620 + }, + { + "epoch": 2.7740542150729817, + "grad_norm": 0.022705078125, + "learning_rate": 0.02986337467088556, + "loss": 0.8121, + "num_input_tokens_seen": 10804744, + "step": 18625 + }, + { + "epoch": 2.774798927613941, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02986311200194642, + "loss": 0.7976, + "num_input_tokens_seen": 10807688, + "step": 18630 + }, + { + "epoch": 2.7755436401549, + "grad_norm": 0.01239013671875, + "learning_rate": 0.029862849081910104, + "loss": 0.8029, + "num_input_tokens_seen": 10810472, + "step": 18635 + }, + { + "epoch": 2.7762883526958593, + "grad_norm": 0.044189453125, + "learning_rate": 0.029862585910781062, + "loss": 0.8096, + "num_input_tokens_seen": 10813224, + "step": 18640 + }, + { + "epoch": 2.7770330652368185, + "grad_norm": 0.01220703125, + "learning_rate": 0.029862322488563738, + "loss": 0.7903, + "num_input_tokens_seen": 10815784, + "step": 18645 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 0.0294189453125, + "learning_rate": 0.029862058815262577, + "loss": 0.8003, + "num_input_tokens_seen": 10818792, + "step": 18650 + }, + { + "epoch": 2.778522490318737, + "grad_norm": 0.01434326171875, + "learning_rate": 0.029861794890882042, + "loss": 0.8123, + "num_input_tokens_seen": 10821800, + "step": 18655 + }, + { + "epoch": 2.779267202859696, + "grad_norm": 0.0208740234375, + "learning_rate": 0.029861530715426583, + "loss": 0.785, + "num_input_tokens_seen": 10824968, + "step": 18660 + }, + { + "epoch": 2.7800119154006553, + "grad_norm": 0.0145263671875, + "learning_rate": 0.02986126628890067, + "loss": 0.7924, + "num_input_tokens_seen": 10827912, + "step": 18665 + }, + { + "epoch": 2.7807566279416145, + "grad_norm": 0.0162353515625, + "learning_rate": 0.029861001611308766, + "loss": 0.7857, + "num_input_tokens_seen": 10830728, + "step": 18670 + }, + { + "epoch": 2.7815013404825737, + "grad_norm": 0.0291748046875, + "learning_rate": 0.029860736682655344, + "loss": 0.8034, + "num_input_tokens_seen": 10833544, + "step": 18675 + }, + { + "epoch": 2.782246053023533, + "grad_norm": 0.01409912109375, + "learning_rate": 0.029860471502944883, + "loss": 0.8076, + "num_input_tokens_seen": 10836296, + "step": 18680 + }, + { + "epoch": 2.782990765564492, + "grad_norm": 0.02783203125, + "learning_rate": 0.029860206072181854, + "loss": 0.8082, + "num_input_tokens_seen": 10839336, + "step": 18685 + }, + { + "epoch": 2.7837354781054513, + "grad_norm": 0.013671875, + "learning_rate": 0.029859940390370748, + "loss": 0.797, + "num_input_tokens_seen": 10841864, + "step": 18690 + }, + { + "epoch": 2.7844801906464105, + "grad_norm": 0.034423828125, + "learning_rate": 0.029859674457516052, + "loss": 0.7804, + "num_input_tokens_seen": 10844680, + "step": 18695 + }, + { + "epoch": 2.7852249031873697, + "grad_norm": 0.02783203125, + "learning_rate": 0.02985940827362226, + "loss": 0.8116, + "num_input_tokens_seen": 10847368, + "step": 18700 + }, + { + "epoch": 2.785969615728329, + "grad_norm": 0.02978515625, + "learning_rate": 0.029859141838693867, + "loss": 0.808, + "num_input_tokens_seen": 10850216, + "step": 18705 + }, + { + "epoch": 2.786714328269288, + "grad_norm": 0.0260009765625, + "learning_rate": 0.029858875152735373, + "loss": 0.7804, + "num_input_tokens_seen": 10853032, + "step": 18710 + }, + { + "epoch": 2.7874590408102473, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02985860821575129, + "loss": 0.8282, + "num_input_tokens_seen": 10856072, + "step": 18715 + }, + { + "epoch": 2.7882037533512065, + "grad_norm": 0.02197265625, + "learning_rate": 0.02985834102774612, + "loss": 0.805, + "num_input_tokens_seen": 10858632, + "step": 18720 + }, + { + "epoch": 2.7889484658921657, + "grad_norm": 0.024169921875, + "learning_rate": 0.02985807358872438, + "loss": 0.8026, + "num_input_tokens_seen": 10861704, + "step": 18725 + }, + { + "epoch": 2.789693178433125, + "grad_norm": 0.018798828125, + "learning_rate": 0.02985780589869059, + "loss": 0.7995, + "num_input_tokens_seen": 10864680, + "step": 18730 + }, + { + "epoch": 2.790437890974084, + "grad_norm": 0.03125, + "learning_rate": 0.029857537957649265, + "loss": 0.8101, + "num_input_tokens_seen": 10867720, + "step": 18735 + }, + { + "epoch": 2.7911826035150433, + "grad_norm": 0.021728515625, + "learning_rate": 0.02985726976560494, + "loss": 0.8016, + "num_input_tokens_seen": 10870408, + "step": 18740 + }, + { + "epoch": 2.7919273160560025, + "grad_norm": 0.02099609375, + "learning_rate": 0.02985700132256214, + "loss": 0.7893, + "num_input_tokens_seen": 10873320, + "step": 18745 + }, + { + "epoch": 2.7926720285969617, + "grad_norm": 0.01513671875, + "learning_rate": 0.02985673262852541, + "loss": 0.8038, + "num_input_tokens_seen": 10876552, + "step": 18750 + }, + { + "epoch": 2.7934167411379205, + "grad_norm": 0.02294921875, + "learning_rate": 0.02985646368349927, + "loss": 0.8047, + "num_input_tokens_seen": 10879592, + "step": 18755 + }, + { + "epoch": 2.79416145367888, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02985619448748828, + "loss": 0.7855, + "num_input_tokens_seen": 10882472, + "step": 18760 + }, + { + "epoch": 2.794906166219839, + "grad_norm": 0.031982421875, + "learning_rate": 0.02985592504049699, + "loss": 0.7812, + "num_input_tokens_seen": 10885352, + "step": 18765 + }, + { + "epoch": 2.7956508787607985, + "grad_norm": 0.0162353515625, + "learning_rate": 0.02985565534252994, + "loss": 0.8337, + "num_input_tokens_seen": 10888488, + "step": 18770 + }, + { + "epoch": 2.7963955913017573, + "grad_norm": 0.0125732421875, + "learning_rate": 0.029855385393591696, + "loss": 0.7785, + "num_input_tokens_seen": 10891304, + "step": 18775 + }, + { + "epoch": 2.797140303842717, + "grad_norm": 0.0235595703125, + "learning_rate": 0.029855115193686808, + "loss": 0.8135, + "num_input_tokens_seen": 10893928, + "step": 18780 + }, + { + "epoch": 2.7978850163836757, + "grad_norm": 0.018798828125, + "learning_rate": 0.02985484474281985, + "loss": 0.7831, + "num_input_tokens_seen": 10896936, + "step": 18785 + }, + { + "epoch": 2.7986297289246354, + "grad_norm": 0.01708984375, + "learning_rate": 0.029854574040995386, + "loss": 0.8024, + "num_input_tokens_seen": 10899880, + "step": 18790 + }, + { + "epoch": 2.799374441465594, + "grad_norm": 0.0302734375, + "learning_rate": 0.02985430308821799, + "loss": 0.8273, + "num_input_tokens_seen": 10903016, + "step": 18795 + }, + { + "epoch": 2.8001191540065533, + "grad_norm": 0.04541015625, + "learning_rate": 0.029854031884492246, + "loss": 0.8085, + "num_input_tokens_seen": 10905960, + "step": 18800 + }, + { + "epoch": 2.8008638665475125, + "grad_norm": 0.043212890625, + "learning_rate": 0.029853760429822726, + "loss": 0.7973, + "num_input_tokens_seen": 10908808, + "step": 18805 + }, + { + "epoch": 2.8016085790884717, + "grad_norm": 0.0233154296875, + "learning_rate": 0.02985348872421402, + "loss": 0.7901, + "num_input_tokens_seen": 10911720, + "step": 18810 + }, + { + "epoch": 2.802353291629431, + "grad_norm": 0.032958984375, + "learning_rate": 0.02985321676767072, + "loss": 0.7885, + "num_input_tokens_seen": 10914920, + "step": 18815 + }, + { + "epoch": 2.80309800417039, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02985294456019742, + "loss": 0.798, + "num_input_tokens_seen": 10917832, + "step": 18820 + }, + { + "epoch": 2.8038427167113493, + "grad_norm": 0.0230712890625, + "learning_rate": 0.02985267210179872, + "loss": 0.7891, + "num_input_tokens_seen": 10920616, + "step": 18825 + }, + { + "epoch": 2.8045874292523085, + "grad_norm": 0.021240234375, + "learning_rate": 0.02985239939247921, + "loss": 0.7937, + "num_input_tokens_seen": 10923656, + "step": 18830 + }, + { + "epoch": 2.8053321417932677, + "grad_norm": 0.0203857421875, + "learning_rate": 0.029852126432243515, + "loss": 0.7915, + "num_input_tokens_seen": 10926664, + "step": 18835 + }, + { + "epoch": 2.806076854334227, + "grad_norm": 0.027587890625, + "learning_rate": 0.029851853221096236, + "loss": 0.8056, + "num_input_tokens_seen": 10929576, + "step": 18840 + }, + { + "epoch": 2.806821566875186, + "grad_norm": 0.0250244140625, + "learning_rate": 0.029851579759041996, + "loss": 0.7903, + "num_input_tokens_seen": 10932424, + "step": 18845 + }, + { + "epoch": 2.8075662794161453, + "grad_norm": 0.03271484375, + "learning_rate": 0.02985130604608541, + "loss": 0.813, + "num_input_tokens_seen": 10935208, + "step": 18850 + }, + { + "epoch": 2.8083109919571045, + "grad_norm": 0.019287109375, + "learning_rate": 0.0298510320822311, + "loss": 0.8056, + "num_input_tokens_seen": 10938024, + "step": 18855 + }, + { + "epoch": 2.8090557044980637, + "grad_norm": 0.0286865234375, + "learning_rate": 0.029850757867483698, + "loss": 0.8085, + "num_input_tokens_seen": 10941160, + "step": 18860 + }, + { + "epoch": 2.809800417039023, + "grad_norm": 0.0252685546875, + "learning_rate": 0.029850483401847833, + "loss": 0.819, + "num_input_tokens_seen": 10943784, + "step": 18865 + }, + { + "epoch": 2.810545129579982, + "grad_norm": 0.029541015625, + "learning_rate": 0.029850208685328148, + "loss": 0.8168, + "num_input_tokens_seen": 10946792, + "step": 18870 + }, + { + "epoch": 2.8112898421209414, + "grad_norm": 0.052490234375, + "learning_rate": 0.029849933717929277, + "loss": 0.7863, + "num_input_tokens_seen": 10949576, + "step": 18875 + }, + { + "epoch": 2.8120345546619006, + "grad_norm": 0.0191650390625, + "learning_rate": 0.029849658499655873, + "loss": 0.833, + "num_input_tokens_seen": 10952104, + "step": 18880 + }, + { + "epoch": 2.8127792672028598, + "grad_norm": 0.0306396484375, + "learning_rate": 0.029849383030512575, + "loss": 0.7865, + "num_input_tokens_seen": 10954888, + "step": 18885 + }, + { + "epoch": 2.813523979743819, + "grad_norm": 0.016845703125, + "learning_rate": 0.029849107310504046, + "loss": 0.8317, + "num_input_tokens_seen": 10957768, + "step": 18890 + }, + { + "epoch": 2.814268692284778, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02984883133963494, + "loss": 0.8066, + "num_input_tokens_seen": 10960488, + "step": 18895 + }, + { + "epoch": 2.8150134048257374, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02984855511790992, + "loss": 0.7901, + "num_input_tokens_seen": 10963464, + "step": 18900 + }, + { + "epoch": 2.8157581173666966, + "grad_norm": 0.0235595703125, + "learning_rate": 0.029848278645333657, + "loss": 0.7978, + "num_input_tokens_seen": 10966376, + "step": 18905 + }, + { + "epoch": 2.8165028299076558, + "grad_norm": 0.029296875, + "learning_rate": 0.029848001921910814, + "loss": 0.8071, + "num_input_tokens_seen": 10969224, + "step": 18910 + }, + { + "epoch": 2.817247542448615, + "grad_norm": 0.0233154296875, + "learning_rate": 0.02984772494764607, + "loss": 0.7896, + "num_input_tokens_seen": 10971944, + "step": 18915 + }, + { + "epoch": 2.817992254989574, + "grad_norm": 0.019287109375, + "learning_rate": 0.0298474477225441, + "loss": 0.797, + "num_input_tokens_seen": 10975016, + "step": 18920 + }, + { + "epoch": 2.8187369675305334, + "grad_norm": 0.024169921875, + "learning_rate": 0.0298471702466096, + "loss": 0.7881, + "num_input_tokens_seen": 10977672, + "step": 18925 + }, + { + "epoch": 2.819481680071492, + "grad_norm": 0.020751953125, + "learning_rate": 0.02984689251984724, + "loss": 0.8241, + "num_input_tokens_seen": 10980360, + "step": 18930 + }, + { + "epoch": 2.820226392612452, + "grad_norm": 0.020751953125, + "learning_rate": 0.029846614542261726, + "loss": 0.8006, + "num_input_tokens_seen": 10983272, + "step": 18935 + }, + { + "epoch": 2.8209711051534105, + "grad_norm": 0.0205078125, + "learning_rate": 0.029846336313857746, + "loss": 0.8094, + "num_input_tokens_seen": 10985992, + "step": 18940 + }, + { + "epoch": 2.82171581769437, + "grad_norm": 0.021240234375, + "learning_rate": 0.02984605783464, + "loss": 0.8187, + "num_input_tokens_seen": 10988872, + "step": 18945 + }, + { + "epoch": 2.822460530235329, + "grad_norm": 0.01348876953125, + "learning_rate": 0.0298457791046132, + "loss": 0.8224, + "num_input_tokens_seen": 10991784, + "step": 18950 + }, + { + "epoch": 2.8232052427762886, + "grad_norm": 0.0133056640625, + "learning_rate": 0.02984550012378205, + "loss": 0.8298, + "num_input_tokens_seen": 10994728, + "step": 18955 + }, + { + "epoch": 2.8239499553172474, + "grad_norm": 0.023681640625, + "learning_rate": 0.029845220892151265, + "loss": 0.7942, + "num_input_tokens_seen": 10997576, + "step": 18960 + }, + { + "epoch": 2.824694667858207, + "grad_norm": 0.0263671875, + "learning_rate": 0.02984494140972556, + "loss": 0.8209, + "num_input_tokens_seen": 11000168, + "step": 18965 + }, + { + "epoch": 2.8254393803991658, + "grad_norm": 0.027587890625, + "learning_rate": 0.02984466167650966, + "loss": 0.8056, + "num_input_tokens_seen": 11002984, + "step": 18970 + }, + { + "epoch": 2.826184092940125, + "grad_norm": 0.012939453125, + "learning_rate": 0.029844381692508283, + "loss": 0.7989, + "num_input_tokens_seen": 11005768, + "step": 18975 + }, + { + "epoch": 2.826928805481084, + "grad_norm": 0.031494140625, + "learning_rate": 0.02984410145772617, + "loss": 0.7975, + "num_input_tokens_seen": 11008744, + "step": 18980 + }, + { + "epoch": 2.8276735180220434, + "grad_norm": 0.0233154296875, + "learning_rate": 0.029843820972168044, + "loss": 0.8097, + "num_input_tokens_seen": 11011720, + "step": 18985 + }, + { + "epoch": 2.8284182305630026, + "grad_norm": 0.0206298828125, + "learning_rate": 0.029843540235838652, + "loss": 0.7987, + "num_input_tokens_seen": 11014312, + "step": 18990 + }, + { + "epoch": 2.8291629431039618, + "grad_norm": 0.020751953125, + "learning_rate": 0.029843259248742735, + "loss": 0.7941, + "num_input_tokens_seen": 11017128, + "step": 18995 + }, + { + "epoch": 2.829907655644921, + "grad_norm": 0.0140380859375, + "learning_rate": 0.02984297801088504, + "loss": 0.8082, + "num_input_tokens_seen": 11020072, + "step": 19000 + }, + { + "epoch": 2.83065236818588, + "grad_norm": 0.0198974609375, + "learning_rate": 0.029842696522270312, + "loss": 0.7982, + "num_input_tokens_seen": 11023080, + "step": 19005 + }, + { + "epoch": 2.8313970807268394, + "grad_norm": 0.03369140625, + "learning_rate": 0.029842414782903313, + "loss": 0.8116, + "num_input_tokens_seen": 11025960, + "step": 19010 + }, + { + "epoch": 2.8321417932677986, + "grad_norm": 0.0281982421875, + "learning_rate": 0.029842132792788807, + "loss": 0.8037, + "num_input_tokens_seen": 11028488, + "step": 19015 + }, + { + "epoch": 2.832886505808758, + "grad_norm": 0.022216796875, + "learning_rate": 0.02984185055193155, + "loss": 0.7995, + "num_input_tokens_seen": 11031464, + "step": 19020 + }, + { + "epoch": 2.833631218349717, + "grad_norm": 0.0244140625, + "learning_rate": 0.02984156806033631, + "loss": 0.8048, + "num_input_tokens_seen": 11034440, + "step": 19025 + }, + { + "epoch": 2.834375930890676, + "grad_norm": 0.021240234375, + "learning_rate": 0.029841285318007862, + "loss": 0.8107, + "num_input_tokens_seen": 11037384, + "step": 19030 + }, + { + "epoch": 2.8351206434316354, + "grad_norm": 0.0303955078125, + "learning_rate": 0.029841002324950985, + "loss": 0.8045, + "num_input_tokens_seen": 11040392, + "step": 19035 + }, + { + "epoch": 2.8358653559725946, + "grad_norm": 0.0140380859375, + "learning_rate": 0.029840719081170457, + "loss": 0.8141, + "num_input_tokens_seen": 11043528, + "step": 19040 + }, + { + "epoch": 2.836610068513554, + "grad_norm": 0.031494140625, + "learning_rate": 0.02984043558667106, + "loss": 0.8003, + "num_input_tokens_seen": 11046216, + "step": 19045 + }, + { + "epoch": 2.837354781054513, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02984015184145759, + "loss": 0.7952, + "num_input_tokens_seen": 11049128, + "step": 19050 + }, + { + "epoch": 2.838099493595472, + "grad_norm": 0.03515625, + "learning_rate": 0.02983986784553484, + "loss": 0.7929, + "num_input_tokens_seen": 11051976, + "step": 19055 + }, + { + "epoch": 2.8388442061364314, + "grad_norm": 0.041748046875, + "learning_rate": 0.029839583598907603, + "loss": 0.8258, + "num_input_tokens_seen": 11054888, + "step": 19060 + }, + { + "epoch": 2.8395889186773906, + "grad_norm": 0.022216796875, + "learning_rate": 0.02983929910158068, + "loss": 0.7967, + "num_input_tokens_seen": 11057896, + "step": 19065 + }, + { + "epoch": 2.84033363121835, + "grad_norm": 0.01300048828125, + "learning_rate": 0.029839014353558884, + "loss": 0.7858, + "num_input_tokens_seen": 11060808, + "step": 19070 + }, + { + "epoch": 2.841078343759309, + "grad_norm": 0.01397705078125, + "learning_rate": 0.02983872935484702, + "loss": 0.7941, + "num_input_tokens_seen": 11063592, + "step": 19075 + }, + { + "epoch": 2.841823056300268, + "grad_norm": 0.032470703125, + "learning_rate": 0.029838444105449908, + "loss": 0.8144, + "num_input_tokens_seen": 11066376, + "step": 19080 + }, + { + "epoch": 2.8425677688412274, + "grad_norm": 0.020751953125, + "learning_rate": 0.029838158605372367, + "loss": 0.7963, + "num_input_tokens_seen": 11069160, + "step": 19085 + }, + { + "epoch": 2.8433124813821866, + "grad_norm": 0.01953125, + "learning_rate": 0.02983787285461921, + "loss": 0.7969, + "num_input_tokens_seen": 11072136, + "step": 19090 + }, + { + "epoch": 2.844057193923146, + "grad_norm": 0.012451171875, + "learning_rate": 0.029837586853195278, + "loss": 0.8099, + "num_input_tokens_seen": 11075368, + "step": 19095 + }, + { + "epoch": 2.844801906464105, + "grad_norm": 0.0235595703125, + "learning_rate": 0.029837300601105392, + "loss": 0.7923, + "num_input_tokens_seen": 11078312, + "step": 19100 + }, + { + "epoch": 2.845546619005064, + "grad_norm": 0.0244140625, + "learning_rate": 0.02983701409835439, + "loss": 0.8199, + "num_input_tokens_seen": 11081288, + "step": 19105 + }, + { + "epoch": 2.8462913315460234, + "grad_norm": 0.0250244140625, + "learning_rate": 0.02983672734494712, + "loss": 0.8057, + "num_input_tokens_seen": 11084232, + "step": 19110 + }, + { + "epoch": 2.847036044086982, + "grad_norm": 0.0242919921875, + "learning_rate": 0.029836440340888417, + "loss": 0.7989, + "num_input_tokens_seen": 11087240, + "step": 19115 + }, + { + "epoch": 2.847780756627942, + "grad_norm": 0.0322265625, + "learning_rate": 0.029836153086183134, + "loss": 0.7993, + "num_input_tokens_seen": 11090184, + "step": 19120 + }, + { + "epoch": 2.8485254691689006, + "grad_norm": 0.01513671875, + "learning_rate": 0.029835865580836125, + "loss": 0.8023, + "num_input_tokens_seen": 11093032, + "step": 19125 + }, + { + "epoch": 2.8492701817098602, + "grad_norm": 0.019775390625, + "learning_rate": 0.029835577824852244, + "loss": 0.7978, + "num_input_tokens_seen": 11095848, + "step": 19130 + }, + { + "epoch": 2.850014894250819, + "grad_norm": 0.043212890625, + "learning_rate": 0.029835289818236355, + "loss": 0.8157, + "num_input_tokens_seen": 11099016, + "step": 19135 + }, + { + "epoch": 2.8507596067917786, + "grad_norm": 0.014892578125, + "learning_rate": 0.029835001560993325, + "loss": 0.8142, + "num_input_tokens_seen": 11102472, + "step": 19140 + }, + { + "epoch": 2.8515043193327374, + "grad_norm": 0.0224609375, + "learning_rate": 0.029834713053128018, + "loss": 0.8018, + "num_input_tokens_seen": 11105608, + "step": 19145 + }, + { + "epoch": 2.8522490318736966, + "grad_norm": 0.0140380859375, + "learning_rate": 0.02983442429464531, + "loss": 0.8004, + "num_input_tokens_seen": 11108392, + "step": 19150 + }, + { + "epoch": 2.852993744414656, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02983413528555008, + "loss": 0.7911, + "num_input_tokens_seen": 11111496, + "step": 19155 + }, + { + "epoch": 2.853738456955615, + "grad_norm": 0.0216064453125, + "learning_rate": 0.029833846025847217, + "loss": 0.8226, + "num_input_tokens_seen": 11114216, + "step": 19160 + }, + { + "epoch": 2.854483169496574, + "grad_norm": 0.021728515625, + "learning_rate": 0.029833556515541596, + "loss": 0.796, + "num_input_tokens_seen": 11117448, + "step": 19165 + }, + { + "epoch": 2.8552278820375334, + "grad_norm": 0.02978515625, + "learning_rate": 0.029833266754638117, + "loss": 0.804, + "num_input_tokens_seen": 11120296, + "step": 19170 + }, + { + "epoch": 2.8559725945784926, + "grad_norm": 0.021728515625, + "learning_rate": 0.02983297674314167, + "loss": 0.7794, + "num_input_tokens_seen": 11123208, + "step": 19175 + }, + { + "epoch": 2.856717307119452, + "grad_norm": 0.031494140625, + "learning_rate": 0.029832686481057157, + "loss": 0.7957, + "num_input_tokens_seen": 11126056, + "step": 19180 + }, + { + "epoch": 2.857462019660411, + "grad_norm": 0.0155029296875, + "learning_rate": 0.029832395968389484, + "loss": 0.8113, + "num_input_tokens_seen": 11128744, + "step": 19185 + }, + { + "epoch": 2.8582067322013702, + "grad_norm": 0.0263671875, + "learning_rate": 0.029832105205143553, + "loss": 0.8125, + "num_input_tokens_seen": 11131464, + "step": 19190 + }, + { + "epoch": 2.8589514447423294, + "grad_norm": 0.01318359375, + "learning_rate": 0.02983181419132428, + "loss": 0.8047, + "num_input_tokens_seen": 11134344, + "step": 19195 + }, + { + "epoch": 2.8596961572832886, + "grad_norm": 0.0191650390625, + "learning_rate": 0.029831522926936582, + "loss": 0.7923, + "num_input_tokens_seen": 11137032, + "step": 19200 + }, + { + "epoch": 2.860440869824248, + "grad_norm": 0.020263671875, + "learning_rate": 0.02983123141198538, + "loss": 0.7968, + "num_input_tokens_seen": 11140040, + "step": 19205 + }, + { + "epoch": 2.861185582365207, + "grad_norm": 0.012451171875, + "learning_rate": 0.029830939646475593, + "loss": 0.8118, + "num_input_tokens_seen": 11143048, + "step": 19210 + }, + { + "epoch": 2.8619302949061662, + "grad_norm": 0.02294921875, + "learning_rate": 0.02983064763041215, + "loss": 0.7992, + "num_input_tokens_seen": 11145768, + "step": 19215 + }, + { + "epoch": 2.8626750074471254, + "grad_norm": 0.0299072265625, + "learning_rate": 0.029830355363799996, + "loss": 0.7947, + "num_input_tokens_seen": 11148424, + "step": 19220 + }, + { + "epoch": 2.8634197199880846, + "grad_norm": 0.0125732421875, + "learning_rate": 0.02983006284664406, + "loss": 0.8143, + "num_input_tokens_seen": 11151272, + "step": 19225 + }, + { + "epoch": 2.864164432529044, + "grad_norm": 0.02587890625, + "learning_rate": 0.029829770078949287, + "loss": 0.7955, + "num_input_tokens_seen": 11154440, + "step": 19230 + }, + { + "epoch": 2.864909145070003, + "grad_norm": 0.0245361328125, + "learning_rate": 0.029829477060720613, + "loss": 0.8001, + "num_input_tokens_seen": 11157384, + "step": 19235 + }, + { + "epoch": 2.8656538576109623, + "grad_norm": 0.02197265625, + "learning_rate": 0.029829183791963, + "loss": 0.8247, + "num_input_tokens_seen": 11160200, + "step": 19240 + }, + { + "epoch": 2.8663985701519215, + "grad_norm": 0.035400390625, + "learning_rate": 0.0298288902726814, + "loss": 0.7833, + "num_input_tokens_seen": 11163208, + "step": 19245 + }, + { + "epoch": 2.8671432826928807, + "grad_norm": 0.0257568359375, + "learning_rate": 0.029828596502880767, + "loss": 0.7973, + "num_input_tokens_seen": 11165736, + "step": 19250 + }, + { + "epoch": 2.86788799523384, + "grad_norm": 0.01263427734375, + "learning_rate": 0.02982830248256607, + "loss": 0.7953, + "num_input_tokens_seen": 11168488, + "step": 19255 + }, + { + "epoch": 2.868632707774799, + "grad_norm": 0.02685546875, + "learning_rate": 0.029828008211742276, + "loss": 0.8218, + "num_input_tokens_seen": 11171496, + "step": 19260 + }, + { + "epoch": 2.8693774203157583, + "grad_norm": 0.020263671875, + "learning_rate": 0.029827713690414353, + "loss": 0.7839, + "num_input_tokens_seen": 11174568, + "step": 19265 + }, + { + "epoch": 2.8701221328567175, + "grad_norm": 0.021240234375, + "learning_rate": 0.02982741891858727, + "loss": 0.8008, + "num_input_tokens_seen": 11177544, + "step": 19270 + }, + { + "epoch": 2.8708668453976767, + "grad_norm": 0.033447265625, + "learning_rate": 0.02982712389626602, + "loss": 0.8219, + "num_input_tokens_seen": 11180200, + "step": 19275 + }, + { + "epoch": 2.8716115579386354, + "grad_norm": 0.0146484375, + "learning_rate": 0.029826828623455582, + "loss": 0.814, + "num_input_tokens_seen": 11183048, + "step": 19280 + }, + { + "epoch": 2.872356270479595, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02982653310016094, + "loss": 0.79, + "num_input_tokens_seen": 11185768, + "step": 19285 + }, + { + "epoch": 2.873100983020554, + "grad_norm": 0.02197265625, + "learning_rate": 0.029826237326387093, + "loss": 0.794, + "num_input_tokens_seen": 11188776, + "step": 19290 + }, + { + "epoch": 2.8738456955615135, + "grad_norm": 0.020263671875, + "learning_rate": 0.029825941302139038, + "loss": 0.7998, + "num_input_tokens_seen": 11191528, + "step": 19295 + }, + { + "epoch": 2.8745904081024722, + "grad_norm": 0.0302734375, + "learning_rate": 0.029825645027421768, + "loss": 0.7922, + "num_input_tokens_seen": 11194312, + "step": 19300 + }, + { + "epoch": 2.875335120643432, + "grad_norm": 0.0137939453125, + "learning_rate": 0.029825348502240296, + "loss": 0.8133, + "num_input_tokens_seen": 11197320, + "step": 19305 + }, + { + "epoch": 2.8760798331843906, + "grad_norm": 0.0159912109375, + "learning_rate": 0.02982505172659963, + "loss": 0.7832, + "num_input_tokens_seen": 11200360, + "step": 19310 + }, + { + "epoch": 2.8768245457253503, + "grad_norm": 0.0299072265625, + "learning_rate": 0.029824754700504778, + "loss": 0.799, + "num_input_tokens_seen": 11202952, + "step": 19315 + }, + { + "epoch": 2.877569258266309, + "grad_norm": 0.023193359375, + "learning_rate": 0.029824457423960768, + "loss": 0.8158, + "num_input_tokens_seen": 11206056, + "step": 19320 + }, + { + "epoch": 2.8783139708072683, + "grad_norm": 0.0206298828125, + "learning_rate": 0.029824159896972614, + "loss": 0.7861, + "num_input_tokens_seen": 11208904, + "step": 19325 + }, + { + "epoch": 2.8790586833482275, + "grad_norm": 0.0252685546875, + "learning_rate": 0.029823862119545345, + "loss": 0.7885, + "num_input_tokens_seen": 11211944, + "step": 19330 + }, + { + "epoch": 2.8798033958891867, + "grad_norm": 0.0242919921875, + "learning_rate": 0.029823564091683987, + "loss": 0.804, + "num_input_tokens_seen": 11214856, + "step": 19335 + }, + { + "epoch": 2.880548108430146, + "grad_norm": 0.02001953125, + "learning_rate": 0.029823265813393587, + "loss": 0.8033, + "num_input_tokens_seen": 11217832, + "step": 19340 + }, + { + "epoch": 2.881292820971105, + "grad_norm": 0.0194091796875, + "learning_rate": 0.029822967284679175, + "loss": 0.8003, + "num_input_tokens_seen": 11220520, + "step": 19345 + }, + { + "epoch": 2.8820375335120643, + "grad_norm": 0.01336669921875, + "learning_rate": 0.029822668505545796, + "loss": 0.826, + "num_input_tokens_seen": 11223432, + "step": 19350 + }, + { + "epoch": 2.8827822460530235, + "grad_norm": 0.0198974609375, + "learning_rate": 0.029822369475998495, + "loss": 0.8132, + "num_input_tokens_seen": 11226024, + "step": 19355 + }, + { + "epoch": 2.8835269585939827, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02982207019604233, + "loss": 0.8047, + "num_input_tokens_seen": 11229160, + "step": 19360 + }, + { + "epoch": 2.884271671134942, + "grad_norm": 0.020751953125, + "learning_rate": 0.029821770665682354, + "loss": 0.7978, + "num_input_tokens_seen": 11232168, + "step": 19365 + }, + { + "epoch": 2.885016383675901, + "grad_norm": 0.020263671875, + "learning_rate": 0.029821470884923627, + "loss": 0.8103, + "num_input_tokens_seen": 11235080, + "step": 19370 + }, + { + "epoch": 2.8857610962168603, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02982117085377121, + "loss": 0.7861, + "num_input_tokens_seen": 11237992, + "step": 19375 + }, + { + "epoch": 2.8865058087578195, + "grad_norm": 0.014404296875, + "learning_rate": 0.029820870572230176, + "loss": 0.7908, + "num_input_tokens_seen": 11241416, + "step": 19380 + }, + { + "epoch": 2.8872505212987787, + "grad_norm": 0.0244140625, + "learning_rate": 0.029820570040305597, + "loss": 0.8037, + "num_input_tokens_seen": 11244584, + "step": 19385 + }, + { + "epoch": 2.887995233839738, + "grad_norm": 0.0269775390625, + "learning_rate": 0.02982026925800255, + "loss": 0.8046, + "num_input_tokens_seen": 11247976, + "step": 19390 + }, + { + "epoch": 2.888739946380697, + "grad_norm": 0.042236328125, + "learning_rate": 0.029819968225326118, + "loss": 0.8277, + "num_input_tokens_seen": 11251080, + "step": 19395 + }, + { + "epoch": 2.8894846589216563, + "grad_norm": 0.0211181640625, + "learning_rate": 0.02981966694228138, + "loss": 0.8037, + "num_input_tokens_seen": 11253928, + "step": 19400 + }, + { + "epoch": 2.8902293714626155, + "grad_norm": 0.0234375, + "learning_rate": 0.02981936540887344, + "loss": 0.7947, + "num_input_tokens_seen": 11256840, + "step": 19405 + }, + { + "epoch": 2.8909740840035747, + "grad_norm": 0.0289306640625, + "learning_rate": 0.029819063625107375, + "loss": 0.7825, + "num_input_tokens_seen": 11259496, + "step": 19410 + }, + { + "epoch": 2.891718796544534, + "grad_norm": 0.0267333984375, + "learning_rate": 0.029818761590988292, + "loss": 0.8209, + "num_input_tokens_seen": 11262344, + "step": 19415 + }, + { + "epoch": 2.892463509085493, + "grad_norm": 0.027587890625, + "learning_rate": 0.0298184593065213, + "loss": 0.7917, + "num_input_tokens_seen": 11265608, + "step": 19420 + }, + { + "epoch": 2.8932082216264523, + "grad_norm": 0.03271484375, + "learning_rate": 0.029818156771711492, + "loss": 0.8058, + "num_input_tokens_seen": 11268584, + "step": 19425 + }, + { + "epoch": 2.8939529341674115, + "grad_norm": 0.0224609375, + "learning_rate": 0.02981785398656399, + "loss": 0.7946, + "num_input_tokens_seen": 11271208, + "step": 19430 + }, + { + "epoch": 2.8946976467083707, + "grad_norm": 0.0224609375, + "learning_rate": 0.0298175509510839, + "loss": 0.797, + "num_input_tokens_seen": 11274056, + "step": 19435 + }, + { + "epoch": 2.89544235924933, + "grad_norm": 0.02197265625, + "learning_rate": 0.029817247665276353, + "loss": 0.7905, + "num_input_tokens_seen": 11277096, + "step": 19440 + }, + { + "epoch": 2.896187071790289, + "grad_norm": 0.02001953125, + "learning_rate": 0.029816944129146462, + "loss": 0.825, + "num_input_tokens_seen": 11279944, + "step": 19445 + }, + { + "epoch": 2.8969317843312483, + "grad_norm": 0.015625, + "learning_rate": 0.029816640342699364, + "loss": 0.8038, + "num_input_tokens_seen": 11282728, + "step": 19450 + }, + { + "epoch": 2.897676496872207, + "grad_norm": 0.0164794921875, + "learning_rate": 0.029816336305940182, + "loss": 0.8146, + "num_input_tokens_seen": 11285448, + "step": 19455 + }, + { + "epoch": 2.8984212094131667, + "grad_norm": 0.0142822265625, + "learning_rate": 0.029816032018874058, + "loss": 0.8059, + "num_input_tokens_seen": 11288168, + "step": 19460 + }, + { + "epoch": 2.8991659219541255, + "grad_norm": 0.0213623046875, + "learning_rate": 0.029815727481506127, + "loss": 0.7894, + "num_input_tokens_seen": 11291048, + "step": 19465 + }, + { + "epoch": 2.899910634495085, + "grad_norm": 0.0274658203125, + "learning_rate": 0.029815422693841544, + "loss": 0.7873, + "num_input_tokens_seen": 11293768, + "step": 19470 + }, + { + "epoch": 2.900655347036044, + "grad_norm": 0.033935546875, + "learning_rate": 0.029815117655885452, + "loss": 0.7898, + "num_input_tokens_seen": 11296808, + "step": 19475 + }, + { + "epoch": 2.9014000595770035, + "grad_norm": 0.02294921875, + "learning_rate": 0.029814812367643002, + "loss": 0.7907, + "num_input_tokens_seen": 11299432, + "step": 19480 + }, + { + "epoch": 2.9021447721179623, + "grad_norm": 0.0240478515625, + "learning_rate": 0.029814506829119358, + "loss": 0.8125, + "num_input_tokens_seen": 11302632, + "step": 19485 + }, + { + "epoch": 2.9028894846589215, + "grad_norm": 0.02294921875, + "learning_rate": 0.029814201040319673, + "loss": 0.7857, + "num_input_tokens_seen": 11305480, + "step": 19490 + }, + { + "epoch": 2.9036341971998807, + "grad_norm": 0.032470703125, + "learning_rate": 0.02981389500124912, + "loss": 0.8158, + "num_input_tokens_seen": 11308200, + "step": 19495 + }, + { + "epoch": 2.90437890974084, + "grad_norm": 0.036865234375, + "learning_rate": 0.029813588711912867, + "loss": 0.8202, + "num_input_tokens_seen": 11311112, + "step": 19500 + }, + { + "epoch": 2.905123622281799, + "grad_norm": 0.0208740234375, + "learning_rate": 0.029813282172316092, + "loss": 0.8034, + "num_input_tokens_seen": 11314152, + "step": 19505 + }, + { + "epoch": 2.9058683348227583, + "grad_norm": 0.02587890625, + "learning_rate": 0.029812975382463962, + "loss": 0.817, + "num_input_tokens_seen": 11316904, + "step": 19510 + }, + { + "epoch": 2.9066130473637175, + "grad_norm": 0.0341796875, + "learning_rate": 0.029812668342361678, + "loss": 0.8184, + "num_input_tokens_seen": 11320040, + "step": 19515 + }, + { + "epoch": 2.9073577599046767, + "grad_norm": 0.0238037109375, + "learning_rate": 0.029812361052014413, + "loss": 0.7867, + "num_input_tokens_seen": 11322920, + "step": 19520 + }, + { + "epoch": 2.908102472445636, + "grad_norm": 0.0240478515625, + "learning_rate": 0.029812053511427364, + "loss": 0.836, + "num_input_tokens_seen": 11325768, + "step": 19525 + }, + { + "epoch": 2.908847184986595, + "grad_norm": 0.032958984375, + "learning_rate": 0.029811745720605718, + "loss": 0.8039, + "num_input_tokens_seen": 11328488, + "step": 19530 + }, + { + "epoch": 2.9095918975275543, + "grad_norm": 0.033447265625, + "learning_rate": 0.02981143767955469, + "loss": 0.8113, + "num_input_tokens_seen": 11331304, + "step": 19535 + }, + { + "epoch": 2.9103366100685135, + "grad_norm": 0.021484375, + "learning_rate": 0.02981112938827947, + "loss": 0.806, + "num_input_tokens_seen": 11334120, + "step": 19540 + }, + { + "epoch": 2.9110813226094727, + "grad_norm": 0.01904296875, + "learning_rate": 0.029810820846785276, + "loss": 0.787, + "num_input_tokens_seen": 11336712, + "step": 19545 + }, + { + "epoch": 2.911826035150432, + "grad_norm": 0.0233154296875, + "learning_rate": 0.02981051205507732, + "loss": 0.8044, + "num_input_tokens_seen": 11339656, + "step": 19550 + }, + { + "epoch": 2.912570747691391, + "grad_norm": 0.0179443359375, + "learning_rate": 0.029810203013160813, + "loss": 0.7882, + "num_input_tokens_seen": 11342632, + "step": 19555 + }, + { + "epoch": 2.9133154602323503, + "grad_norm": 0.0233154296875, + "learning_rate": 0.029809893721040977, + "loss": 0.8041, + "num_input_tokens_seen": 11345768, + "step": 19560 + }, + { + "epoch": 2.9140601727733095, + "grad_norm": 0.0167236328125, + "learning_rate": 0.02980958417872304, + "loss": 0.7982, + "num_input_tokens_seen": 11348552, + "step": 19565 + }, + { + "epoch": 2.9148048853142687, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02980927438621223, + "loss": 0.8059, + "num_input_tokens_seen": 11351336, + "step": 19570 + }, + { + "epoch": 2.915549597855228, + "grad_norm": 0.0322265625, + "learning_rate": 0.029808964343513777, + "loss": 0.7869, + "num_input_tokens_seen": 11354024, + "step": 19575 + }, + { + "epoch": 2.916294310396187, + "grad_norm": 0.032958984375, + "learning_rate": 0.029808654050632925, + "loss": 0.8229, + "num_input_tokens_seen": 11356840, + "step": 19580 + }, + { + "epoch": 2.9170390229371463, + "grad_norm": 0.032958984375, + "learning_rate": 0.029808343507574917, + "loss": 0.7893, + "num_input_tokens_seen": 11359656, + "step": 19585 + }, + { + "epoch": 2.9177837354781055, + "grad_norm": 0.025390625, + "learning_rate": 0.029808032714344995, + "loss": 0.7785, + "num_input_tokens_seen": 11362472, + "step": 19590 + }, + { + "epoch": 2.9185284480190647, + "grad_norm": 0.035400390625, + "learning_rate": 0.029807721670948407, + "loss": 0.8036, + "num_input_tokens_seen": 11365672, + "step": 19595 + }, + { + "epoch": 2.919273160560024, + "grad_norm": 0.03369140625, + "learning_rate": 0.029807410377390414, + "loss": 0.8028, + "num_input_tokens_seen": 11368552, + "step": 19600 + }, + { + "epoch": 2.920017873100983, + "grad_norm": 0.025146484375, + "learning_rate": 0.02980709883367627, + "loss": 0.8092, + "num_input_tokens_seen": 11371528, + "step": 19605 + }, + { + "epoch": 2.9207625856419424, + "grad_norm": 0.02392578125, + "learning_rate": 0.029806787039811242, + "loss": 0.7878, + "num_input_tokens_seen": 11374440, + "step": 19610 + }, + { + "epoch": 2.9215072981829016, + "grad_norm": 0.0194091796875, + "learning_rate": 0.029806474995800598, + "loss": 0.8228, + "num_input_tokens_seen": 11377064, + "step": 19615 + }, + { + "epoch": 2.9222520107238603, + "grad_norm": 0.0255126953125, + "learning_rate": 0.029806162701649607, + "loss": 0.8162, + "num_input_tokens_seen": 11379944, + "step": 19620 + }, + { + "epoch": 2.92299672326482, + "grad_norm": 0.025390625, + "learning_rate": 0.02980585015736354, + "loss": 0.807, + "num_input_tokens_seen": 11382760, + "step": 19625 + }, + { + "epoch": 2.9237414358057787, + "grad_norm": 0.0223388671875, + "learning_rate": 0.029805537362947686, + "loss": 0.8048, + "num_input_tokens_seen": 11385704, + "step": 19630 + }, + { + "epoch": 2.9244861483467384, + "grad_norm": 0.0294189453125, + "learning_rate": 0.02980522431840733, + "loss": 0.8058, + "num_input_tokens_seen": 11388392, + "step": 19635 + }, + { + "epoch": 2.925230860887697, + "grad_norm": 0.03173828125, + "learning_rate": 0.02980491102374775, + "loss": 0.7879, + "num_input_tokens_seen": 11391208, + "step": 19640 + }, + { + "epoch": 2.9259755734286568, + "grad_norm": 0.02001953125, + "learning_rate": 0.02980459747897425, + "loss": 0.7826, + "num_input_tokens_seen": 11394280, + "step": 19645 + }, + { + "epoch": 2.9267202859696155, + "grad_norm": 0.022216796875, + "learning_rate": 0.02980428368409212, + "loss": 0.7901, + "num_input_tokens_seen": 11397256, + "step": 19650 + }, + { + "epoch": 2.927464998510575, + "grad_norm": 0.024658203125, + "learning_rate": 0.029803969639106663, + "loss": 0.8059, + "num_input_tokens_seen": 11400328, + "step": 19655 + }, + { + "epoch": 2.928209711051534, + "grad_norm": 0.01385498046875, + "learning_rate": 0.02980365534402319, + "loss": 0.8279, + "num_input_tokens_seen": 11403176, + "step": 19660 + }, + { + "epoch": 2.928954423592493, + "grad_norm": 0.0118408203125, + "learning_rate": 0.029803340798847, + "loss": 0.7797, + "num_input_tokens_seen": 11405992, + "step": 19665 + }, + { + "epoch": 2.9296991361334523, + "grad_norm": 0.033203125, + "learning_rate": 0.029803026003583415, + "loss": 0.8145, + "num_input_tokens_seen": 11408808, + "step": 19670 + }, + { + "epoch": 2.9304438486744115, + "grad_norm": 0.02587890625, + "learning_rate": 0.02980271095823775, + "loss": 0.8153, + "num_input_tokens_seen": 11411624, + "step": 19675 + }, + { + "epoch": 2.9311885612153707, + "grad_norm": 0.023681640625, + "learning_rate": 0.029802395662815326, + "loss": 0.8177, + "num_input_tokens_seen": 11414536, + "step": 19680 + }, + { + "epoch": 2.93193327375633, + "grad_norm": 0.0220947265625, + "learning_rate": 0.029802080117321477, + "loss": 0.815, + "num_input_tokens_seen": 11417384, + "step": 19685 + }, + { + "epoch": 2.932677986297289, + "grad_norm": 0.01287841796875, + "learning_rate": 0.02980176432176152, + "loss": 0.7934, + "num_input_tokens_seen": 11420296, + "step": 19690 + }, + { + "epoch": 2.9334226988382484, + "grad_norm": 0.0224609375, + "learning_rate": 0.02980144827614081, + "loss": 0.7946, + "num_input_tokens_seen": 11423144, + "step": 19695 + }, + { + "epoch": 2.9341674113792076, + "grad_norm": 0.031494140625, + "learning_rate": 0.02980113198046467, + "loss": 0.8124, + "num_input_tokens_seen": 11426120, + "step": 19700 + }, + { + "epoch": 2.9349121239201668, + "grad_norm": 0.026611328125, + "learning_rate": 0.029800815434738448, + "loss": 0.7901, + "num_input_tokens_seen": 11429000, + "step": 19705 + }, + { + "epoch": 2.935656836461126, + "grad_norm": 0.01226806640625, + "learning_rate": 0.02980049863896749, + "loss": 0.803, + "num_input_tokens_seen": 11431688, + "step": 19710 + }, + { + "epoch": 2.936401549002085, + "grad_norm": 0.02587890625, + "learning_rate": 0.029800181593157154, + "loss": 0.8218, + "num_input_tokens_seen": 11434376, + "step": 19715 + }, + { + "epoch": 2.9371462615430444, + "grad_norm": 0.0172119140625, + "learning_rate": 0.029799864297312788, + "loss": 0.779, + "num_input_tokens_seen": 11437320, + "step": 19720 + }, + { + "epoch": 2.9378909740840036, + "grad_norm": 0.028076171875, + "learning_rate": 0.029799546751439757, + "loss": 0.8262, + "num_input_tokens_seen": 11440168, + "step": 19725 + }, + { + "epoch": 2.9386356866249628, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02979922895554343, + "loss": 0.8193, + "num_input_tokens_seen": 11442984, + "step": 19730 + }, + { + "epoch": 2.939380399165922, + "grad_norm": 0.02197265625, + "learning_rate": 0.02979891090962917, + "loss": 0.8093, + "num_input_tokens_seen": 11446056, + "step": 19735 + }, + { + "epoch": 2.940125111706881, + "grad_norm": 0.0203857421875, + "learning_rate": 0.029798592613702345, + "loss": 0.7947, + "num_input_tokens_seen": 11449000, + "step": 19740 + }, + { + "epoch": 2.9408698242478404, + "grad_norm": 0.0264892578125, + "learning_rate": 0.029798274067768343, + "loss": 0.816, + "num_input_tokens_seen": 11451560, + "step": 19745 + }, + { + "epoch": 2.9416145367887996, + "grad_norm": 0.03076171875, + "learning_rate": 0.02979795527183254, + "loss": 0.8176, + "num_input_tokens_seen": 11454856, + "step": 19750 + }, + { + "epoch": 2.942359249329759, + "grad_norm": 0.0301513671875, + "learning_rate": 0.029797636225900325, + "loss": 0.7943, + "num_input_tokens_seen": 11457832, + "step": 19755 + }, + { + "epoch": 2.943103961870718, + "grad_norm": 0.0240478515625, + "learning_rate": 0.029797316929977082, + "loss": 0.8052, + "num_input_tokens_seen": 11460904, + "step": 19760 + }, + { + "epoch": 2.943848674411677, + "grad_norm": 0.0224609375, + "learning_rate": 0.02979699738406821, + "loss": 0.8106, + "num_input_tokens_seen": 11465064, + "step": 19765 + }, + { + "epoch": 2.9445933869526364, + "grad_norm": 0.0303955078125, + "learning_rate": 0.029796677588179107, + "loss": 0.7981, + "num_input_tokens_seen": 11468168, + "step": 19770 + }, + { + "epoch": 2.9453380994935956, + "grad_norm": 0.0255126953125, + "learning_rate": 0.029796357542315174, + "loss": 0.7995, + "num_input_tokens_seen": 11471208, + "step": 19775 + }, + { + "epoch": 2.946082812034555, + "grad_norm": 0.0244140625, + "learning_rate": 0.02979603724648182, + "loss": 0.801, + "num_input_tokens_seen": 11474376, + "step": 19780 + }, + { + "epoch": 2.946827524575514, + "grad_norm": 0.0235595703125, + "learning_rate": 0.029795716700684455, + "loss": 0.7999, + "num_input_tokens_seen": 11477320, + "step": 19785 + }, + { + "epoch": 2.947572237116473, + "grad_norm": 0.0235595703125, + "learning_rate": 0.029795395904928492, + "loss": 0.7948, + "num_input_tokens_seen": 11480136, + "step": 19790 + }, + { + "epoch": 2.948316949657432, + "grad_norm": 0.033203125, + "learning_rate": 0.029795074859219353, + "loss": 0.7822, + "num_input_tokens_seen": 11483176, + "step": 19795 + }, + { + "epoch": 2.9490616621983916, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02979475356356246, + "loss": 0.7956, + "num_input_tokens_seen": 11485704, + "step": 19800 + }, + { + "epoch": 2.9498063747393504, + "grad_norm": 0.015380859375, + "learning_rate": 0.029794432017963245, + "loss": 0.8032, + "num_input_tokens_seen": 11488552, + "step": 19805 + }, + { + "epoch": 2.95055108728031, + "grad_norm": 0.0211181640625, + "learning_rate": 0.02979411022242714, + "loss": 0.7957, + "num_input_tokens_seen": 11491336, + "step": 19810 + }, + { + "epoch": 2.9512957998212688, + "grad_norm": 0.02978515625, + "learning_rate": 0.029793788176959574, + "loss": 0.7807, + "num_input_tokens_seen": 11494376, + "step": 19815 + }, + { + "epoch": 2.9520405123622284, + "grad_norm": 0.0234375, + "learning_rate": 0.029793465881565992, + "loss": 0.8393, + "num_input_tokens_seen": 11497352, + "step": 19820 + }, + { + "epoch": 2.952785224903187, + "grad_norm": 0.018798828125, + "learning_rate": 0.029793143336251843, + "loss": 0.8044, + "num_input_tokens_seen": 11500008, + "step": 19825 + }, + { + "epoch": 2.953529937444147, + "grad_norm": 0.0185546875, + "learning_rate": 0.02979282054102257, + "loss": 0.7988, + "num_input_tokens_seen": 11502824, + "step": 19830 + }, + { + "epoch": 2.9542746499851056, + "grad_norm": 0.01519775390625, + "learning_rate": 0.02979249749588363, + "loss": 0.8103, + "num_input_tokens_seen": 11505832, + "step": 19835 + }, + { + "epoch": 2.955019362526065, + "grad_norm": 0.01904296875, + "learning_rate": 0.029792174200840477, + "loss": 0.7889, + "num_input_tokens_seen": 11508872, + "step": 19840 + }, + { + "epoch": 2.955764075067024, + "grad_norm": 0.0198974609375, + "learning_rate": 0.029791850655898577, + "loss": 0.8138, + "num_input_tokens_seen": 11511464, + "step": 19845 + }, + { + "epoch": 2.956508787607983, + "grad_norm": 0.0135498046875, + "learning_rate": 0.029791526861063393, + "loss": 0.7885, + "num_input_tokens_seen": 11514216, + "step": 19850 + }, + { + "epoch": 2.9572535001489424, + "grad_norm": 0.0303955078125, + "learning_rate": 0.029791202816340397, + "loss": 0.7777, + "num_input_tokens_seen": 11517480, + "step": 19855 + }, + { + "epoch": 2.9579982126899016, + "grad_norm": 0.01129150390625, + "learning_rate": 0.02979087852173506, + "loss": 0.7957, + "num_input_tokens_seen": 11520136, + "step": 19860 + }, + { + "epoch": 2.958742925230861, + "grad_norm": 0.0233154296875, + "learning_rate": 0.029790553977252866, + "loss": 0.8167, + "num_input_tokens_seen": 11522952, + "step": 19865 + }, + { + "epoch": 2.95948763777182, + "grad_norm": 0.02294921875, + "learning_rate": 0.029790229182899294, + "loss": 0.8229, + "num_input_tokens_seen": 11525960, + "step": 19870 + }, + { + "epoch": 2.960232350312779, + "grad_norm": 0.019775390625, + "learning_rate": 0.029789904138679834, + "loss": 0.8015, + "num_input_tokens_seen": 11528488, + "step": 19875 + }, + { + "epoch": 2.9609770628537384, + "grad_norm": 0.01385498046875, + "learning_rate": 0.029789578844599974, + "loss": 0.7934, + "num_input_tokens_seen": 11531432, + "step": 19880 + }, + { + "epoch": 2.9617217753946976, + "grad_norm": 0.022216796875, + "learning_rate": 0.029789253300665213, + "loss": 0.7958, + "num_input_tokens_seen": 11534248, + "step": 19885 + }, + { + "epoch": 2.962466487935657, + "grad_norm": 0.022216796875, + "learning_rate": 0.02978892750688104, + "loss": 0.782, + "num_input_tokens_seen": 11537128, + "step": 19890 + }, + { + "epoch": 2.963211200476616, + "grad_norm": 0.0225830078125, + "learning_rate": 0.02978860146325298, + "loss": 0.818, + "num_input_tokens_seen": 11540264, + "step": 19895 + }, + { + "epoch": 2.963955913017575, + "grad_norm": 0.024658203125, + "learning_rate": 0.029788275169786518, + "loss": 0.8335, + "num_input_tokens_seen": 11543144, + "step": 19900 + }, + { + "epoch": 2.9647006255585344, + "grad_norm": 0.01324462890625, + "learning_rate": 0.029787948626487187, + "loss": 0.8288, + "num_input_tokens_seen": 11546152, + "step": 19905 + }, + { + "epoch": 2.9654453380994936, + "grad_norm": 0.0205078125, + "learning_rate": 0.029787621833360485, + "loss": 0.8264, + "num_input_tokens_seen": 11548744, + "step": 19910 + }, + { + "epoch": 2.966190050640453, + "grad_norm": 0.022216796875, + "learning_rate": 0.029787294790411943, + "loss": 0.8034, + "num_input_tokens_seen": 11551560, + "step": 19915 + }, + { + "epoch": 2.966934763181412, + "grad_norm": 0.0247802734375, + "learning_rate": 0.029786967497647088, + "loss": 0.8051, + "num_input_tokens_seen": 11554600, + "step": 19920 + }, + { + "epoch": 2.9676794757223712, + "grad_norm": 0.02734375, + "learning_rate": 0.029786639955071446, + "loss": 0.7971, + "num_input_tokens_seen": 11557704, + "step": 19925 + }, + { + "epoch": 2.9684241882633304, + "grad_norm": 0.023193359375, + "learning_rate": 0.029786312162690547, + "loss": 0.8084, + "num_input_tokens_seen": 11560424, + "step": 19930 + }, + { + "epoch": 2.9691689008042896, + "grad_norm": 0.025390625, + "learning_rate": 0.029785984120509933, + "loss": 0.8015, + "num_input_tokens_seen": 11563496, + "step": 19935 + }, + { + "epoch": 2.969913613345249, + "grad_norm": 0.0224609375, + "learning_rate": 0.029785655828535148, + "loss": 0.7984, + "num_input_tokens_seen": 11566376, + "step": 19940 + }, + { + "epoch": 2.970658325886208, + "grad_norm": 0.023681640625, + "learning_rate": 0.02978532728677173, + "loss": 0.8047, + "num_input_tokens_seen": 11569128, + "step": 19945 + }, + { + "epoch": 2.9714030384271672, + "grad_norm": 0.0218505859375, + "learning_rate": 0.029784998495225243, + "loss": 0.8099, + "num_input_tokens_seen": 11571912, + "step": 19950 + }, + { + "epoch": 2.9721477509681264, + "grad_norm": 0.020751953125, + "learning_rate": 0.029784669453901225, + "loss": 0.7892, + "num_input_tokens_seen": 11574696, + "step": 19955 + }, + { + "epoch": 2.9728924635090856, + "grad_norm": 0.0213623046875, + "learning_rate": 0.029784340162805248, + "loss": 0.8012, + "num_input_tokens_seen": 11577512, + "step": 19960 + }, + { + "epoch": 2.973637176050045, + "grad_norm": 0.020751953125, + "learning_rate": 0.02978401062194287, + "loss": 0.8114, + "num_input_tokens_seen": 11580616, + "step": 19965 + }, + { + "epoch": 2.9743818885910036, + "grad_norm": 0.0203857421875, + "learning_rate": 0.029783680831319655, + "loss": 0.7947, + "num_input_tokens_seen": 11583368, + "step": 19970 + }, + { + "epoch": 2.9751266011319633, + "grad_norm": 0.0203857421875, + "learning_rate": 0.029783350790941177, + "loss": 0.789, + "num_input_tokens_seen": 11586504, + "step": 19975 + }, + { + "epoch": 2.975871313672922, + "grad_norm": 0.0218505859375, + "learning_rate": 0.029783020500813016, + "loss": 0.811, + "num_input_tokens_seen": 11589352, + "step": 19980 + }, + { + "epoch": 2.9766160262138817, + "grad_norm": 0.0137939453125, + "learning_rate": 0.02978268996094075, + "loss": 0.8092, + "num_input_tokens_seen": 11592360, + "step": 19985 + }, + { + "epoch": 2.9773607387548404, + "grad_norm": 0.02392578125, + "learning_rate": 0.029782359171329962, + "loss": 0.8086, + "num_input_tokens_seen": 11595272, + "step": 19990 + }, + { + "epoch": 2.9781054512958, + "grad_norm": 0.0267333984375, + "learning_rate": 0.029782028131986233, + "loss": 0.7982, + "num_input_tokens_seen": 11598408, + "step": 19995 + }, + { + "epoch": 2.978850163836759, + "grad_norm": 0.035400390625, + "learning_rate": 0.029781696842915168, + "loss": 0.8011, + "num_input_tokens_seen": 11601000, + "step": 20000 + }, + { + "epoch": 2.9795948763777185, + "grad_norm": 0.0203857421875, + "learning_rate": 0.029781365304122356, + "loss": 0.781, + "num_input_tokens_seen": 11603784, + "step": 20005 + }, + { + "epoch": 2.9803395889186772, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0297810335156134, + "loss": 0.8169, + "num_input_tokens_seen": 11606728, + "step": 20010 + }, + { + "epoch": 2.9810843014596364, + "grad_norm": 0.0289306640625, + "learning_rate": 0.02978070147739391, + "loss": 0.8077, + "num_input_tokens_seen": 11609672, + "step": 20015 + }, + { + "epoch": 2.9818290140005956, + "grad_norm": 0.0196533203125, + "learning_rate": 0.029780369189469486, + "loss": 0.8027, + "num_input_tokens_seen": 11612584, + "step": 20020 + }, + { + "epoch": 2.982573726541555, + "grad_norm": 0.01300048828125, + "learning_rate": 0.029780036651845747, + "loss": 0.7903, + "num_input_tokens_seen": 11615560, + "step": 20025 + }, + { + "epoch": 2.983318439082514, + "grad_norm": 0.03466796875, + "learning_rate": 0.02977970386452831, + "loss": 0.8199, + "num_input_tokens_seen": 11618664, + "step": 20030 + }, + { + "epoch": 2.9840631516234732, + "grad_norm": 0.0189208984375, + "learning_rate": 0.0297793708275228, + "loss": 0.7794, + "num_input_tokens_seen": 11621512, + "step": 20035 + }, + { + "epoch": 2.9848078641644324, + "grad_norm": 0.01171875, + "learning_rate": 0.02977903754083484, + "loss": 0.8089, + "num_input_tokens_seen": 11624296, + "step": 20040 + }, + { + "epoch": 2.9855525767053916, + "grad_norm": 0.01171875, + "learning_rate": 0.029778704004470063, + "loss": 0.7788, + "num_input_tokens_seen": 11627112, + "step": 20045 + }, + { + "epoch": 2.986297289246351, + "grad_norm": 0.013427734375, + "learning_rate": 0.0297783702184341, + "loss": 0.8052, + "num_input_tokens_seen": 11630088, + "step": 20050 + }, + { + "epoch": 2.98704200178731, + "grad_norm": 0.021240234375, + "learning_rate": 0.029778036182732594, + "loss": 0.8131, + "num_input_tokens_seen": 11632744, + "step": 20055 + }, + { + "epoch": 2.9877867143282693, + "grad_norm": 0.033203125, + "learning_rate": 0.029777701897371188, + "loss": 0.7956, + "num_input_tokens_seen": 11636104, + "step": 20060 + }, + { + "epoch": 2.9885314268692285, + "grad_norm": 0.02197265625, + "learning_rate": 0.029777367362355527, + "loss": 0.8088, + "num_input_tokens_seen": 11639112, + "step": 20065 + }, + { + "epoch": 2.9892761394101877, + "grad_norm": 0.0213623046875, + "learning_rate": 0.029777032577691258, + "loss": 0.7902, + "num_input_tokens_seen": 11641864, + "step": 20070 + }, + { + "epoch": 2.990020851951147, + "grad_norm": 0.0211181640625, + "learning_rate": 0.029776697543384047, + "loss": 0.7827, + "num_input_tokens_seen": 11644776, + "step": 20075 + }, + { + "epoch": 2.990765564492106, + "grad_norm": 0.0264892578125, + "learning_rate": 0.02977636225943955, + "loss": 0.8016, + "num_input_tokens_seen": 11647752, + "step": 20080 + }, + { + "epoch": 2.9915102770330653, + "grad_norm": 0.031982421875, + "learning_rate": 0.029776026725863425, + "loss": 0.8228, + "num_input_tokens_seen": 11650536, + "step": 20085 + }, + { + "epoch": 2.9922549895740245, + "grad_norm": 0.0118408203125, + "learning_rate": 0.029775690942661356, + "loss": 0.7805, + "num_input_tokens_seen": 11653256, + "step": 20090 + }, + { + "epoch": 2.9929997021149837, + "grad_norm": 0.0216064453125, + "learning_rate": 0.029775354909839, + "loss": 0.7997, + "num_input_tokens_seen": 11656072, + "step": 20095 + }, + { + "epoch": 2.993744414655943, + "grad_norm": 0.026123046875, + "learning_rate": 0.029775018627402037, + "loss": 0.7825, + "num_input_tokens_seen": 11658984, + "step": 20100 + }, + { + "epoch": 2.994489127196902, + "grad_norm": 0.02294921875, + "learning_rate": 0.029774682095356154, + "loss": 0.8068, + "num_input_tokens_seen": 11662120, + "step": 20105 + }, + { + "epoch": 2.9952338397378613, + "grad_norm": 0.053466796875, + "learning_rate": 0.029774345313707033, + "loss": 0.8025, + "num_input_tokens_seen": 11665512, + "step": 20110 + }, + { + "epoch": 2.9959785522788205, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02977400828246037, + "loss": 0.8087, + "num_input_tokens_seen": 11668392, + "step": 20115 + }, + { + "epoch": 2.9967232648197797, + "grad_norm": 0.020263671875, + "learning_rate": 0.029773671001621843, + "loss": 0.8124, + "num_input_tokens_seen": 11671624, + "step": 20120 + }, + { + "epoch": 2.997467977360739, + "grad_norm": 0.02197265625, + "learning_rate": 0.029773333471197164, + "loss": 0.7966, + "num_input_tokens_seen": 11674408, + "step": 20125 + }, + { + "epoch": 2.998212689901698, + "grad_norm": 0.01904296875, + "learning_rate": 0.02977299569119203, + "loss": 0.7986, + "num_input_tokens_seen": 11677416, + "step": 20130 + }, + { + "epoch": 2.9989574024426573, + "grad_norm": 0.03271484375, + "learning_rate": 0.029772657661612148, + "loss": 0.7871, + "num_input_tokens_seen": 11680424, + "step": 20135 + }, + { + "epoch": 2.9997021149836165, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02977231938246323, + "loss": 0.792, + "num_input_tokens_seen": 11683176, + "step": 20140 + }, + { + "epoch": 3.0, + "eval_loss": 0.802687406539917, + "eval_runtime": 70.7039, + "eval_samples_per_second": 42.204, + "eval_steps_per_second": 10.551, + "num_input_tokens_seen": 11683856, + "step": 20142 + }, + { + "epoch": 3.0004468275245757, + "grad_norm": 0.03515625, + "learning_rate": 0.02977198085375099, + "loss": 0.8022, + "num_input_tokens_seen": 11685584, + "step": 20145 + }, + { + "epoch": 3.001191540065535, + "grad_norm": 0.0205078125, + "learning_rate": 0.029771642075481147, + "loss": 0.7899, + "num_input_tokens_seen": 11688464, + "step": 20150 + }, + { + "epoch": 3.001936252606494, + "grad_norm": 0.020751953125, + "learning_rate": 0.029771303047659424, + "loss": 0.7894, + "num_input_tokens_seen": 11691280, + "step": 20155 + }, + { + "epoch": 3.002680965147453, + "grad_norm": 0.027587890625, + "learning_rate": 0.02977096377029155, + "loss": 0.8257, + "num_input_tokens_seen": 11694000, + "step": 20160 + }, + { + "epoch": 3.003425677688412, + "grad_norm": 0.01531982421875, + "learning_rate": 0.029770624243383257, + "loss": 0.8264, + "num_input_tokens_seen": 11697072, + "step": 20165 + }, + { + "epoch": 3.0041703902293713, + "grad_norm": 0.0194091796875, + "learning_rate": 0.029770284466940276, + "loss": 0.7695, + "num_input_tokens_seen": 11700144, + "step": 20170 + }, + { + "epoch": 3.0049151027703305, + "grad_norm": 0.01544189453125, + "learning_rate": 0.02976994444096835, + "loss": 0.804, + "num_input_tokens_seen": 11703120, + "step": 20175 + }, + { + "epoch": 3.0056598153112897, + "grad_norm": 0.02099609375, + "learning_rate": 0.029769604165473227, + "loss": 0.8019, + "num_input_tokens_seen": 11705776, + "step": 20180 + }, + { + "epoch": 3.006404527852249, + "grad_norm": 0.0145263671875, + "learning_rate": 0.029769263640460654, + "loss": 0.809, + "num_input_tokens_seen": 11708592, + "step": 20185 + }, + { + "epoch": 3.007149240393208, + "grad_norm": 0.01458740234375, + "learning_rate": 0.02976892286593638, + "loss": 0.8117, + "num_input_tokens_seen": 11711472, + "step": 20190 + }, + { + "epoch": 3.0078939529341673, + "grad_norm": 0.025146484375, + "learning_rate": 0.029768581841906165, + "loss": 0.8046, + "num_input_tokens_seen": 11714800, + "step": 20195 + }, + { + "epoch": 3.0086386654751265, + "grad_norm": 0.042724609375, + "learning_rate": 0.02976824056837577, + "loss": 0.8246, + "num_input_tokens_seen": 11717424, + "step": 20200 + }, + { + "epoch": 3.0093833780160857, + "grad_norm": 0.02099609375, + "learning_rate": 0.029767899045350963, + "loss": 0.8106, + "num_input_tokens_seen": 11720240, + "step": 20205 + }, + { + "epoch": 3.010128090557045, + "grad_norm": 0.0206298828125, + "learning_rate": 0.029767557272837506, + "loss": 0.7849, + "num_input_tokens_seen": 11723248, + "step": 20210 + }, + { + "epoch": 3.010872803098004, + "grad_norm": 0.0252685546875, + "learning_rate": 0.02976721525084118, + "loss": 0.7768, + "num_input_tokens_seen": 11726128, + "step": 20215 + }, + { + "epoch": 3.0116175156389633, + "grad_norm": 0.021728515625, + "learning_rate": 0.029766872979367767, + "loss": 0.7914, + "num_input_tokens_seen": 11729008, + "step": 20220 + }, + { + "epoch": 3.0123622281799225, + "grad_norm": 0.03369140625, + "learning_rate": 0.02976653045842304, + "loss": 0.7946, + "num_input_tokens_seen": 11732016, + "step": 20225 + }, + { + "epoch": 3.0131069407208817, + "grad_norm": 0.03466796875, + "learning_rate": 0.02976618768801278, + "loss": 0.8268, + "num_input_tokens_seen": 11734928, + "step": 20230 + }, + { + "epoch": 3.013851653261841, + "grad_norm": 0.0142822265625, + "learning_rate": 0.029765844668142798, + "loss": 0.8229, + "num_input_tokens_seen": 11737968, + "step": 20235 + }, + { + "epoch": 3.0145963658028, + "grad_norm": 0.01287841796875, + "learning_rate": 0.029765501398818877, + "loss": 0.8245, + "num_input_tokens_seen": 11740752, + "step": 20240 + }, + { + "epoch": 3.0153410783437593, + "grad_norm": 0.033935546875, + "learning_rate": 0.02976515788004681, + "loss": 0.7973, + "num_input_tokens_seen": 11743664, + "step": 20245 + }, + { + "epoch": 3.0160857908847185, + "grad_norm": 0.018798828125, + "learning_rate": 0.029764814111832412, + "loss": 0.8087, + "num_input_tokens_seen": 11746608, + "step": 20250 + }, + { + "epoch": 3.0168305034256777, + "grad_norm": 0.0267333984375, + "learning_rate": 0.029764470094181483, + "loss": 0.7988, + "num_input_tokens_seen": 11749712, + "step": 20255 + }, + { + "epoch": 3.017575215966637, + "grad_norm": 0.010986328125, + "learning_rate": 0.029764125827099842, + "loss": 0.8086, + "num_input_tokens_seen": 11752528, + "step": 20260 + }, + { + "epoch": 3.018319928507596, + "grad_norm": 0.0230712890625, + "learning_rate": 0.029763781310593294, + "loss": 0.8123, + "num_input_tokens_seen": 11755344, + "step": 20265 + }, + { + "epoch": 3.0190646410485553, + "grad_norm": 0.01953125, + "learning_rate": 0.029763436544667673, + "loss": 0.7924, + "num_input_tokens_seen": 11758064, + "step": 20270 + }, + { + "epoch": 3.0198093535895145, + "grad_norm": 0.0191650390625, + "learning_rate": 0.029763091529328795, + "loss": 0.795, + "num_input_tokens_seen": 11760848, + "step": 20275 + }, + { + "epoch": 3.0205540661304737, + "grad_norm": 0.01324462890625, + "learning_rate": 0.029762746264582488, + "loss": 0.791, + "num_input_tokens_seen": 11764048, + "step": 20280 + }, + { + "epoch": 3.021298778671433, + "grad_norm": 0.021728515625, + "learning_rate": 0.029762400750434584, + "loss": 0.7803, + "num_input_tokens_seen": 11767248, + "step": 20285 + }, + { + "epoch": 3.022043491212392, + "grad_norm": 0.01385498046875, + "learning_rate": 0.02976205498689093, + "loss": 0.7827, + "num_input_tokens_seen": 11770256, + "step": 20290 + }, + { + "epoch": 3.0227882037533513, + "grad_norm": 0.021240234375, + "learning_rate": 0.029761708973957354, + "loss": 0.7972, + "num_input_tokens_seen": 11773264, + "step": 20295 + }, + { + "epoch": 3.0235329162943105, + "grad_norm": 0.0201416015625, + "learning_rate": 0.029761362711639715, + "loss": 0.7999, + "num_input_tokens_seen": 11776336, + "step": 20300 + }, + { + "epoch": 3.0242776288352697, + "grad_norm": 0.0244140625, + "learning_rate": 0.02976101619994385, + "loss": 0.8272, + "num_input_tokens_seen": 11779408, + "step": 20305 + }, + { + "epoch": 3.025022341376229, + "grad_norm": 0.02783203125, + "learning_rate": 0.02976066943887562, + "loss": 0.8003, + "num_input_tokens_seen": 11782320, + "step": 20310 + }, + { + "epoch": 3.025767053917188, + "grad_norm": 0.02734375, + "learning_rate": 0.029760322428440886, + "loss": 0.8173, + "num_input_tokens_seen": 11786032, + "step": 20315 + }, + { + "epoch": 3.0265117664581473, + "grad_norm": 0.01324462890625, + "learning_rate": 0.029759975168645498, + "loss": 0.8115, + "num_input_tokens_seen": 11789360, + "step": 20320 + }, + { + "epoch": 3.0272564789991065, + "grad_norm": 0.01904296875, + "learning_rate": 0.029759627659495337, + "loss": 0.7982, + "num_input_tokens_seen": 11792336, + "step": 20325 + }, + { + "epoch": 3.0280011915400658, + "grad_norm": 0.019775390625, + "learning_rate": 0.029759279900996267, + "loss": 0.8238, + "num_input_tokens_seen": 11794992, + "step": 20330 + }, + { + "epoch": 3.0287459040810245, + "grad_norm": 0.021728515625, + "learning_rate": 0.029758931893154164, + "loss": 0.8023, + "num_input_tokens_seen": 11797712, + "step": 20335 + }, + { + "epoch": 3.0294906166219837, + "grad_norm": 0.02392578125, + "learning_rate": 0.029758583635974906, + "loss": 0.8038, + "num_input_tokens_seen": 11800496, + "step": 20340 + }, + { + "epoch": 3.030235329162943, + "grad_norm": 0.019775390625, + "learning_rate": 0.029758235129464378, + "loss": 0.7936, + "num_input_tokens_seen": 11803248, + "step": 20345 + }, + { + "epoch": 3.030980041703902, + "grad_norm": 0.0189208984375, + "learning_rate": 0.029757886373628466, + "loss": 0.7947, + "num_input_tokens_seen": 11806000, + "step": 20350 + }, + { + "epoch": 3.0317247542448613, + "grad_norm": 0.0191650390625, + "learning_rate": 0.029757537368473065, + "loss": 0.7764, + "num_input_tokens_seen": 11808976, + "step": 20355 + }, + { + "epoch": 3.0324694667858205, + "grad_norm": 0.0247802734375, + "learning_rate": 0.029757188114004067, + "loss": 0.7865, + "num_input_tokens_seen": 11811888, + "step": 20360 + }, + { + "epoch": 3.0332141793267797, + "grad_norm": 0.019775390625, + "learning_rate": 0.029756838610227376, + "loss": 0.8083, + "num_input_tokens_seen": 11814928, + "step": 20365 + }, + { + "epoch": 3.033958891867739, + "grad_norm": 0.0281982421875, + "learning_rate": 0.029756488857148895, + "loss": 0.8166, + "num_input_tokens_seen": 11818320, + "step": 20370 + }, + { + "epoch": 3.034703604408698, + "grad_norm": 0.018798828125, + "learning_rate": 0.02975613885477453, + "loss": 0.7925, + "num_input_tokens_seen": 11821072, + "step": 20375 + }, + { + "epoch": 3.0354483169496573, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0297557886031102, + "loss": 0.792, + "num_input_tokens_seen": 11823792, + "step": 20380 + }, + { + "epoch": 3.0361930294906165, + "grad_norm": 0.01336669921875, + "learning_rate": 0.029755438102161816, + "loss": 0.8146, + "num_input_tokens_seen": 11826832, + "step": 20385 + }, + { + "epoch": 3.0369377420315757, + "grad_norm": 0.019287109375, + "learning_rate": 0.029755087351935303, + "loss": 0.7939, + "num_input_tokens_seen": 11829520, + "step": 20390 + }, + { + "epoch": 3.037682454572535, + "grad_norm": 0.020751953125, + "learning_rate": 0.029754736352436584, + "loss": 0.7922, + "num_input_tokens_seen": 11832496, + "step": 20395 + }, + { + "epoch": 3.038427167113494, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02975438510367159, + "loss": 0.7914, + "num_input_tokens_seen": 11835376, + "step": 20400 + }, + { + "epoch": 3.0391718796544533, + "grad_norm": 0.012451171875, + "learning_rate": 0.029754033605646258, + "loss": 0.8175, + "num_input_tokens_seen": 11838384, + "step": 20405 + }, + { + "epoch": 3.0399165921954125, + "grad_norm": 0.0135498046875, + "learning_rate": 0.029753681858366524, + "loss": 0.8158, + "num_input_tokens_seen": 11841168, + "step": 20410 + }, + { + "epoch": 3.0406613047363718, + "grad_norm": 0.0211181640625, + "learning_rate": 0.029753329861838332, + "loss": 0.8003, + "num_input_tokens_seen": 11844464, + "step": 20415 + }, + { + "epoch": 3.041406017277331, + "grad_norm": 0.039794921875, + "learning_rate": 0.029752977616067617, + "loss": 0.8249, + "num_input_tokens_seen": 11847312, + "step": 20420 + }, + { + "epoch": 3.04215072981829, + "grad_norm": 0.0194091796875, + "learning_rate": 0.029752625121060347, + "loss": 0.8052, + "num_input_tokens_seen": 11850256, + "step": 20425 + }, + { + "epoch": 3.0428954423592494, + "grad_norm": 0.019775390625, + "learning_rate": 0.02975227237682247, + "loss": 0.7853, + "num_input_tokens_seen": 11853136, + "step": 20430 + }, + { + "epoch": 3.0436401549002086, + "grad_norm": 0.0201416015625, + "learning_rate": 0.029751919383359943, + "loss": 0.7928, + "num_input_tokens_seen": 11856304, + "step": 20435 + }, + { + "epoch": 3.0443848674411678, + "grad_norm": 0.01263427734375, + "learning_rate": 0.02975156614067873, + "loss": 0.8302, + "num_input_tokens_seen": 11859248, + "step": 20440 + }, + { + "epoch": 3.045129579982127, + "grad_norm": 0.0201416015625, + "learning_rate": 0.0297512126487848, + "loss": 0.7849, + "num_input_tokens_seen": 11862032, + "step": 20445 + }, + { + "epoch": 3.045874292523086, + "grad_norm": 0.01123046875, + "learning_rate": 0.029750858907684127, + "loss": 0.7891, + "num_input_tokens_seen": 11864912, + "step": 20450 + }, + { + "epoch": 3.0466190050640454, + "grad_norm": 0.029052734375, + "learning_rate": 0.029750504917382684, + "loss": 0.7955, + "num_input_tokens_seen": 11867856, + "step": 20455 + }, + { + "epoch": 3.0473637176050046, + "grad_norm": 0.0247802734375, + "learning_rate": 0.029750150677886455, + "loss": 0.802, + "num_input_tokens_seen": 11870832, + "step": 20460 + }, + { + "epoch": 3.0481084301459638, + "grad_norm": 0.0213623046875, + "learning_rate": 0.029749796189201413, + "loss": 0.7813, + "num_input_tokens_seen": 11874640, + "step": 20465 + }, + { + "epoch": 3.048853142686923, + "grad_norm": 0.0262451171875, + "learning_rate": 0.029749441451333566, + "loss": 0.8226, + "num_input_tokens_seen": 11877296, + "step": 20470 + }, + { + "epoch": 3.049597855227882, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02974908646428889, + "loss": 0.8329, + "num_input_tokens_seen": 11879792, + "step": 20475 + }, + { + "epoch": 3.0503425677688414, + "grad_norm": 0.0223388671875, + "learning_rate": 0.029748731228073387, + "loss": 0.8147, + "num_input_tokens_seen": 11882512, + "step": 20480 + }, + { + "epoch": 3.0510872803098006, + "grad_norm": 0.027587890625, + "learning_rate": 0.029748375742693067, + "loss": 0.7808, + "num_input_tokens_seen": 11885520, + "step": 20485 + }, + { + "epoch": 3.05183199285076, + "grad_norm": 0.019775390625, + "learning_rate": 0.029748020008153922, + "loss": 0.7823, + "num_input_tokens_seen": 11888304, + "step": 20490 + }, + { + "epoch": 3.052576705391719, + "grad_norm": 0.0274658203125, + "learning_rate": 0.02974766402446197, + "loss": 0.795, + "num_input_tokens_seen": 11891056, + "step": 20495 + }, + { + "epoch": 3.053321417932678, + "grad_norm": 0.02880859375, + "learning_rate": 0.029747307791623222, + "loss": 0.7908, + "num_input_tokens_seen": 11893776, + "step": 20500 + }, + { + "epoch": 3.054066130473637, + "grad_norm": 0.0189208984375, + "learning_rate": 0.0297469513096437, + "loss": 0.8207, + "num_input_tokens_seen": 11897008, + "step": 20505 + }, + { + "epoch": 3.054810843014596, + "grad_norm": 0.02099609375, + "learning_rate": 0.02974659457852942, + "loss": 0.797, + "num_input_tokens_seen": 11900080, + "step": 20510 + }, + { + "epoch": 3.0555555555555554, + "grad_norm": 0.027587890625, + "learning_rate": 0.02974623759828642, + "loss": 0.7796, + "num_input_tokens_seen": 11902992, + "step": 20515 + }, + { + "epoch": 3.0563002680965146, + "grad_norm": 0.0274658203125, + "learning_rate": 0.029745880368920716, + "loss": 0.7995, + "num_input_tokens_seen": 11905840, + "step": 20520 + }, + { + "epoch": 3.0570449806374738, + "grad_norm": 0.026611328125, + "learning_rate": 0.029745522890438348, + "loss": 0.8025, + "num_input_tokens_seen": 11908464, + "step": 20525 + }, + { + "epoch": 3.057789693178433, + "grad_norm": 0.020263671875, + "learning_rate": 0.029745165162845363, + "loss": 0.772, + "num_input_tokens_seen": 11911312, + "step": 20530 + }, + { + "epoch": 3.058534405719392, + "grad_norm": 0.0198974609375, + "learning_rate": 0.0297448071861478, + "loss": 0.7974, + "num_input_tokens_seen": 11914192, + "step": 20535 + }, + { + "epoch": 3.0592791182603514, + "grad_norm": 0.018798828125, + "learning_rate": 0.029744448960351704, + "loss": 0.7714, + "num_input_tokens_seen": 11917328, + "step": 20540 + }, + { + "epoch": 3.0600238308013106, + "grad_norm": 0.029052734375, + "learning_rate": 0.029744090485463125, + "loss": 0.7447, + "num_input_tokens_seen": 11920048, + "step": 20545 + }, + { + "epoch": 3.0607685433422698, + "grad_norm": 0.0177001953125, + "learning_rate": 0.029743731761488125, + "loss": 0.8059, + "num_input_tokens_seen": 11922800, + "step": 20550 + }, + { + "epoch": 3.061513255883229, + "grad_norm": 0.01611328125, + "learning_rate": 0.029743372788432764, + "loss": 0.8027, + "num_input_tokens_seen": 11926096, + "step": 20555 + }, + { + "epoch": 3.062257968424188, + "grad_norm": 0.033447265625, + "learning_rate": 0.0297430135663031, + "loss": 0.8007, + "num_input_tokens_seen": 11928944, + "step": 20560 + }, + { + "epoch": 3.0630026809651474, + "grad_norm": 0.025390625, + "learning_rate": 0.029742654095105205, + "loss": 0.7869, + "num_input_tokens_seen": 11932240, + "step": 20565 + }, + { + "epoch": 3.0637473935061066, + "grad_norm": 0.017578125, + "learning_rate": 0.02974229437484516, + "loss": 0.8257, + "num_input_tokens_seen": 11935216, + "step": 20570 + }, + { + "epoch": 3.064492106047066, + "grad_norm": 0.01300048828125, + "learning_rate": 0.029741934405529028, + "loss": 0.7903, + "num_input_tokens_seen": 11938288, + "step": 20575 + }, + { + "epoch": 3.065236818588025, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0297415741871629, + "loss": 0.8036, + "num_input_tokens_seen": 11940816, + "step": 20580 + }, + { + "epoch": 3.065981531128984, + "grad_norm": 0.02978515625, + "learning_rate": 0.02974121371975286, + "loss": 0.8341, + "num_input_tokens_seen": 11943728, + "step": 20585 + }, + { + "epoch": 3.0667262436699434, + "grad_norm": 0.0198974609375, + "learning_rate": 0.029740853003304993, + "loss": 0.7803, + "num_input_tokens_seen": 11946544, + "step": 20590 + }, + { + "epoch": 3.0674709562109026, + "grad_norm": 0.015625, + "learning_rate": 0.0297404920378254, + "loss": 0.8121, + "num_input_tokens_seen": 11949488, + "step": 20595 + }, + { + "epoch": 3.068215668751862, + "grad_norm": 0.0272216796875, + "learning_rate": 0.029740130823320176, + "loss": 0.8474, + "num_input_tokens_seen": 11952240, + "step": 20600 + }, + { + "epoch": 3.068960381292821, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02973976935979542, + "loss": 0.8286, + "num_input_tokens_seen": 11954896, + "step": 20605 + }, + { + "epoch": 3.06970509383378, + "grad_norm": 0.0184326171875, + "learning_rate": 0.029739407647257243, + "loss": 0.794, + "num_input_tokens_seen": 11957712, + "step": 20610 + }, + { + "epoch": 3.0704498063747394, + "grad_norm": 0.01904296875, + "learning_rate": 0.029739045685711753, + "loss": 0.7896, + "num_input_tokens_seen": 11960592, + "step": 20615 + }, + { + "epoch": 3.0711945189156986, + "grad_norm": 0.0125732421875, + "learning_rate": 0.029738683475165065, + "loss": 0.8068, + "num_input_tokens_seen": 11963664, + "step": 20620 + }, + { + "epoch": 3.071939231456658, + "grad_norm": 0.0218505859375, + "learning_rate": 0.029738321015623306, + "loss": 0.8096, + "num_input_tokens_seen": 11966928, + "step": 20625 + }, + { + "epoch": 3.072683943997617, + "grad_norm": 0.024169921875, + "learning_rate": 0.02973795830709259, + "loss": 0.8291, + "num_input_tokens_seen": 11970000, + "step": 20630 + }, + { + "epoch": 3.073428656538576, + "grad_norm": 0.0208740234375, + "learning_rate": 0.029737595349579045, + "loss": 0.82, + "num_input_tokens_seen": 11973040, + "step": 20635 + }, + { + "epoch": 3.0741733690795354, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0297372321430888, + "loss": 0.8124, + "num_input_tokens_seen": 11976016, + "step": 20640 + }, + { + "epoch": 3.0749180816204946, + "grad_norm": 0.0181884765625, + "learning_rate": 0.029736868687628006, + "loss": 0.8184, + "num_input_tokens_seen": 11979120, + "step": 20645 + }, + { + "epoch": 3.075662794161454, + "grad_norm": 0.0264892578125, + "learning_rate": 0.029736504983202787, + "loss": 0.835, + "num_input_tokens_seen": 11981776, + "step": 20650 + }, + { + "epoch": 3.076407506702413, + "grad_norm": 0.019775390625, + "learning_rate": 0.029736141029819297, + "loss": 0.8194, + "num_input_tokens_seen": 11984400, + "step": 20655 + }, + { + "epoch": 3.0771522192433722, + "grad_norm": 0.0380859375, + "learning_rate": 0.02973577682748368, + "loss": 0.7991, + "num_input_tokens_seen": 11987248, + "step": 20660 + }, + { + "epoch": 3.0778969317843314, + "grad_norm": 0.019775390625, + "learning_rate": 0.029735412376202093, + "loss": 0.8066, + "num_input_tokens_seen": 11990128, + "step": 20665 + }, + { + "epoch": 3.0786416443252906, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02973504767598069, + "loss": 0.8059, + "num_input_tokens_seen": 11993392, + "step": 20670 + }, + { + "epoch": 3.07938635686625, + "grad_norm": 0.02099609375, + "learning_rate": 0.02973468272682563, + "loss": 0.7976, + "num_input_tokens_seen": 11996464, + "step": 20675 + }, + { + "epoch": 3.0801310694072086, + "grad_norm": 0.036865234375, + "learning_rate": 0.02973431752874308, + "loss": 0.8056, + "num_input_tokens_seen": 11999472, + "step": 20680 + }, + { + "epoch": 3.080875781948168, + "grad_norm": 0.0186767578125, + "learning_rate": 0.029733952081739214, + "loss": 0.7924, + "num_input_tokens_seen": 12002416, + "step": 20685 + }, + { + "epoch": 3.081620494489127, + "grad_norm": 0.0174560546875, + "learning_rate": 0.0297335863858202, + "loss": 0.7864, + "num_input_tokens_seen": 12005456, + "step": 20690 + }, + { + "epoch": 3.082365207030086, + "grad_norm": 0.01446533203125, + "learning_rate": 0.02973322044099222, + "loss": 0.7857, + "num_input_tokens_seen": 12008464, + "step": 20695 + }, + { + "epoch": 3.0831099195710454, + "grad_norm": 0.01373291015625, + "learning_rate": 0.029732854247261454, + "loss": 0.8215, + "num_input_tokens_seen": 12011312, + "step": 20700 + }, + { + "epoch": 3.0838546321120046, + "grad_norm": 0.01904296875, + "learning_rate": 0.029732487804634087, + "loss": 0.7897, + "num_input_tokens_seen": 12014512, + "step": 20705 + }, + { + "epoch": 3.084599344652964, + "grad_norm": 0.019775390625, + "learning_rate": 0.02973212111311632, + "loss": 0.8045, + "num_input_tokens_seen": 12017360, + "step": 20710 + }, + { + "epoch": 3.085344057193923, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02973175417271433, + "loss": 0.8257, + "num_input_tokens_seen": 12020112, + "step": 20715 + }, + { + "epoch": 3.086088769734882, + "grad_norm": 0.023193359375, + "learning_rate": 0.029731386983434333, + "loss": 0.797, + "num_input_tokens_seen": 12022800, + "step": 20720 + }, + { + "epoch": 3.0868334822758414, + "grad_norm": 0.0279541015625, + "learning_rate": 0.02973101954528252, + "loss": 0.8076, + "num_input_tokens_seen": 12025840, + "step": 20725 + }, + { + "epoch": 3.0875781948168006, + "grad_norm": 0.013916015625, + "learning_rate": 0.02973065185826511, + "loss": 0.7742, + "num_input_tokens_seen": 12028784, + "step": 20730 + }, + { + "epoch": 3.08832290735776, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0297302839223883, + "loss": 0.7792, + "num_input_tokens_seen": 12031408, + "step": 20735 + }, + { + "epoch": 3.089067619898719, + "grad_norm": 0.0286865234375, + "learning_rate": 0.029729915737658326, + "loss": 0.8182, + "num_input_tokens_seen": 12034224, + "step": 20740 + }, + { + "epoch": 3.0898123324396782, + "grad_norm": 0.02978515625, + "learning_rate": 0.029729547304081384, + "loss": 0.797, + "num_input_tokens_seen": 12036880, + "step": 20745 + }, + { + "epoch": 3.0905570449806374, + "grad_norm": 0.027099609375, + "learning_rate": 0.02972917862166372, + "loss": 0.7901, + "num_input_tokens_seen": 12039920, + "step": 20750 + }, + { + "epoch": 3.0913017575215966, + "grad_norm": 0.023193359375, + "learning_rate": 0.029728809690411546, + "loss": 0.7774, + "num_input_tokens_seen": 12043088, + "step": 20755 + }, + { + "epoch": 3.092046470062556, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02972844051033111, + "loss": 0.7966, + "num_input_tokens_seen": 12045968, + "step": 20760 + }, + { + "epoch": 3.092791182603515, + "grad_norm": 0.021728515625, + "learning_rate": 0.029728071081428633, + "loss": 0.8242, + "num_input_tokens_seen": 12049008, + "step": 20765 + }, + { + "epoch": 3.0935358951444742, + "grad_norm": 0.0235595703125, + "learning_rate": 0.02972770140371037, + "loss": 0.7866, + "num_input_tokens_seen": 12051856, + "step": 20770 + }, + { + "epoch": 3.0942806076854334, + "grad_norm": 0.0247802734375, + "learning_rate": 0.029727331477182557, + "loss": 0.8008, + "num_input_tokens_seen": 12054608, + "step": 20775 + }, + { + "epoch": 3.0950253202263927, + "grad_norm": 0.0311279296875, + "learning_rate": 0.029726961301851446, + "loss": 0.8443, + "num_input_tokens_seen": 12057584, + "step": 20780 + }, + { + "epoch": 3.095770032767352, + "grad_norm": 0.01251220703125, + "learning_rate": 0.029726590877723294, + "loss": 0.8115, + "num_input_tokens_seen": 12060272, + "step": 20785 + }, + { + "epoch": 3.096514745308311, + "grad_norm": 0.0252685546875, + "learning_rate": 0.029726220204804355, + "loss": 0.8083, + "num_input_tokens_seen": 12063120, + "step": 20790 + }, + { + "epoch": 3.0972594578492703, + "grad_norm": 0.01171875, + "learning_rate": 0.029725849283100896, + "loss": 0.8124, + "num_input_tokens_seen": 12066032, + "step": 20795 + }, + { + "epoch": 3.0980041703902295, + "grad_norm": 0.019287109375, + "learning_rate": 0.029725478112619177, + "loss": 0.7886, + "num_input_tokens_seen": 12068816, + "step": 20800 + }, + { + "epoch": 3.0987488829311887, + "grad_norm": 0.0272216796875, + "learning_rate": 0.02972510669336547, + "loss": 0.8032, + "num_input_tokens_seen": 12071792, + "step": 20805 + }, + { + "epoch": 3.099493595472148, + "grad_norm": 0.0201416015625, + "learning_rate": 0.029724735025346057, + "loss": 0.8198, + "num_input_tokens_seen": 12074576, + "step": 20810 + }, + { + "epoch": 3.100238308013107, + "grad_norm": 0.03759765625, + "learning_rate": 0.029724363108567207, + "loss": 0.8145, + "num_input_tokens_seen": 12077200, + "step": 20815 + }, + { + "epoch": 3.1009830205540663, + "grad_norm": 0.022705078125, + "learning_rate": 0.029723990943035208, + "loss": 0.8042, + "num_input_tokens_seen": 12080048, + "step": 20820 + }, + { + "epoch": 3.1017277330950255, + "grad_norm": 0.0189208984375, + "learning_rate": 0.029723618528756345, + "loss": 0.7993, + "num_input_tokens_seen": 12082736, + "step": 20825 + }, + { + "epoch": 3.1024724456359847, + "grad_norm": 0.0146484375, + "learning_rate": 0.029723245865736915, + "loss": 0.8077, + "num_input_tokens_seen": 12085456, + "step": 20830 + }, + { + "epoch": 3.103217158176944, + "grad_norm": 0.019287109375, + "learning_rate": 0.02972287295398321, + "loss": 0.7949, + "num_input_tokens_seen": 12088464, + "step": 20835 + }, + { + "epoch": 3.103961870717903, + "grad_norm": 0.0186767578125, + "learning_rate": 0.029722499793501528, + "loss": 0.7977, + "num_input_tokens_seen": 12091312, + "step": 20840 + }, + { + "epoch": 3.1047065832588623, + "grad_norm": 0.0250244140625, + "learning_rate": 0.029722126384298178, + "loss": 0.8185, + "num_input_tokens_seen": 12094256, + "step": 20845 + }, + { + "epoch": 3.1054512957998215, + "grad_norm": 0.02294921875, + "learning_rate": 0.029721752726379464, + "loss": 0.8045, + "num_input_tokens_seen": 12097040, + "step": 20850 + }, + { + "epoch": 3.1061960083407802, + "grad_norm": 0.0186767578125, + "learning_rate": 0.029721378819751697, + "loss": 0.8058, + "num_input_tokens_seen": 12100080, + "step": 20855 + }, + { + "epoch": 3.1069407208817394, + "grad_norm": 0.0194091796875, + "learning_rate": 0.029721004664421197, + "loss": 0.8125, + "num_input_tokens_seen": 12102864, + "step": 20860 + }, + { + "epoch": 3.1076854334226987, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02972063026039429, + "loss": 0.7935, + "num_input_tokens_seen": 12106064, + "step": 20865 + }, + { + "epoch": 3.108430145963658, + "grad_norm": 0.02099609375, + "learning_rate": 0.029720255607677295, + "loss": 0.8117, + "num_input_tokens_seen": 12108784, + "step": 20870 + }, + { + "epoch": 3.109174858504617, + "grad_norm": 0.01239013671875, + "learning_rate": 0.029719880706276545, + "loss": 0.797, + "num_input_tokens_seen": 12111344, + "step": 20875 + }, + { + "epoch": 3.1099195710455763, + "grad_norm": 0.0169677734375, + "learning_rate": 0.029719505556198365, + "loss": 0.7946, + "num_input_tokens_seen": 12114224, + "step": 20880 + }, + { + "epoch": 3.1106642835865355, + "grad_norm": 0.0181884765625, + "learning_rate": 0.029719130157449102, + "loss": 0.7905, + "num_input_tokens_seen": 12117328, + "step": 20885 + }, + { + "epoch": 3.1114089961274947, + "grad_norm": 0.01434326171875, + "learning_rate": 0.029718754510035094, + "loss": 0.818, + "num_input_tokens_seen": 12120016, + "step": 20890 + }, + { + "epoch": 3.112153708668454, + "grad_norm": 0.0181884765625, + "learning_rate": 0.029718378613962686, + "loss": 0.8083, + "num_input_tokens_seen": 12122672, + "step": 20895 + }, + { + "epoch": 3.112898421209413, + "grad_norm": 0.018798828125, + "learning_rate": 0.029718002469238235, + "loss": 0.7911, + "num_input_tokens_seen": 12125648, + "step": 20900 + }, + { + "epoch": 3.1136431337503723, + "grad_norm": 0.0113525390625, + "learning_rate": 0.029717626075868087, + "loss": 0.8066, + "num_input_tokens_seen": 12128464, + "step": 20905 + }, + { + "epoch": 3.1143878462913315, + "grad_norm": 0.019287109375, + "learning_rate": 0.029717249433858603, + "loss": 0.7925, + "num_input_tokens_seen": 12131376, + "step": 20910 + }, + { + "epoch": 3.1151325588322907, + "grad_norm": 0.018798828125, + "learning_rate": 0.029716872543216158, + "loss": 0.8007, + "num_input_tokens_seen": 12134672, + "step": 20915 + }, + { + "epoch": 3.11587727137325, + "grad_norm": 0.021728515625, + "learning_rate": 0.029716495403947097, + "loss": 0.7978, + "num_input_tokens_seen": 12137616, + "step": 20920 + }, + { + "epoch": 3.116621983914209, + "grad_norm": 0.018798828125, + "learning_rate": 0.02971611801605781, + "loss": 0.7989, + "num_input_tokens_seen": 12140496, + "step": 20925 + }, + { + "epoch": 3.1173666964551683, + "grad_norm": 0.0196533203125, + "learning_rate": 0.029715740379554666, + "loss": 0.7975, + "num_input_tokens_seen": 12143632, + "step": 20930 + }, + { + "epoch": 3.1181114089961275, + "grad_norm": 0.026611328125, + "learning_rate": 0.029715362494444043, + "loss": 0.7946, + "num_input_tokens_seen": 12146448, + "step": 20935 + }, + { + "epoch": 3.1188561215370867, + "grad_norm": 0.0150146484375, + "learning_rate": 0.029714984360732326, + "loss": 0.7886, + "num_input_tokens_seen": 12149296, + "step": 20940 + }, + { + "epoch": 3.119600834078046, + "grad_norm": 0.028564453125, + "learning_rate": 0.029714605978425904, + "loss": 0.7921, + "num_input_tokens_seen": 12152144, + "step": 20945 + }, + { + "epoch": 3.120345546619005, + "grad_norm": 0.0174560546875, + "learning_rate": 0.02971422734753117, + "loss": 0.7932, + "num_input_tokens_seen": 12155248, + "step": 20950 + }, + { + "epoch": 3.1210902591599643, + "grad_norm": 0.0272216796875, + "learning_rate": 0.02971384846805452, + "loss": 0.8003, + "num_input_tokens_seen": 12158192, + "step": 20955 + }, + { + "epoch": 3.1218349717009235, + "grad_norm": 0.0218505859375, + "learning_rate": 0.029713469340002352, + "loss": 0.8074, + "num_input_tokens_seen": 12161232, + "step": 20960 + }, + { + "epoch": 3.1225796842418827, + "grad_norm": 0.026611328125, + "learning_rate": 0.029713089963381078, + "loss": 0.8083, + "num_input_tokens_seen": 12163984, + "step": 20965 + }, + { + "epoch": 3.123324396782842, + "grad_norm": 0.0206298828125, + "learning_rate": 0.0297127103381971, + "loss": 0.7936, + "num_input_tokens_seen": 12167152, + "step": 20970 + }, + { + "epoch": 3.124069109323801, + "grad_norm": 0.0196533203125, + "learning_rate": 0.029712330464456836, + "loss": 0.7777, + "num_input_tokens_seen": 12169968, + "step": 20975 + }, + { + "epoch": 3.1248138218647603, + "grad_norm": 0.01904296875, + "learning_rate": 0.0297119503421667, + "loss": 0.7893, + "num_input_tokens_seen": 12173072, + "step": 20980 + }, + { + "epoch": 3.1255585344057195, + "grad_norm": 0.0286865234375, + "learning_rate": 0.029711569971333115, + "loss": 0.8247, + "num_input_tokens_seen": 12176144, + "step": 20985 + }, + { + "epoch": 3.1263032469466787, + "grad_norm": 0.0286865234375, + "learning_rate": 0.02971118935196251, + "loss": 0.782, + "num_input_tokens_seen": 12178832, + "step": 20990 + }, + { + "epoch": 3.127047959487638, + "grad_norm": 0.0152587890625, + "learning_rate": 0.029710808484061314, + "loss": 0.7809, + "num_input_tokens_seen": 12181392, + "step": 20995 + }, + { + "epoch": 3.127792672028597, + "grad_norm": 0.042236328125, + "learning_rate": 0.029710427367635957, + "loss": 0.8127, + "num_input_tokens_seen": 12184272, + "step": 21000 + }, + { + "epoch": 3.1285373845695563, + "grad_norm": 0.033203125, + "learning_rate": 0.029710046002692878, + "loss": 0.8282, + "num_input_tokens_seen": 12187152, + "step": 21005 + }, + { + "epoch": 3.1292820971105155, + "grad_norm": 0.033447265625, + "learning_rate": 0.02970966438923853, + "loss": 0.8282, + "num_input_tokens_seen": 12190352, + "step": 21010 + }, + { + "epoch": 3.1300268096514747, + "grad_norm": 0.0218505859375, + "learning_rate": 0.029709282527279342, + "loss": 0.79, + "num_input_tokens_seen": 12193168, + "step": 21015 + }, + { + "epoch": 3.1307715221924335, + "grad_norm": 0.04248046875, + "learning_rate": 0.029708900416821785, + "loss": 0.8291, + "num_input_tokens_seen": 12195984, + "step": 21020 + }, + { + "epoch": 3.131516234733393, + "grad_norm": 0.0211181640625, + "learning_rate": 0.029708518057872297, + "loss": 0.7945, + "num_input_tokens_seen": 12198960, + "step": 21025 + }, + { + "epoch": 3.132260947274352, + "grad_norm": 0.034423828125, + "learning_rate": 0.029708135450437354, + "loss": 0.8169, + "num_input_tokens_seen": 12201712, + "step": 21030 + }, + { + "epoch": 3.133005659815311, + "grad_norm": 0.0308837890625, + "learning_rate": 0.029707752594523407, + "loss": 0.8383, + "num_input_tokens_seen": 12204528, + "step": 21035 + }, + { + "epoch": 3.1337503723562703, + "grad_norm": 0.023681640625, + "learning_rate": 0.02970736949013693, + "loss": 0.8214, + "num_input_tokens_seen": 12207376, + "step": 21040 + }, + { + "epoch": 3.1344950848972295, + "grad_norm": 0.02490234375, + "learning_rate": 0.02970698613728439, + "loss": 0.8142, + "num_input_tokens_seen": 12210256, + "step": 21045 + }, + { + "epoch": 3.1352397974381887, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02970660253597227, + "loss": 0.81, + "num_input_tokens_seen": 12213488, + "step": 21050 + }, + { + "epoch": 3.135984509979148, + "grad_norm": 0.0260009765625, + "learning_rate": 0.029706218686207046, + "loss": 0.8001, + "num_input_tokens_seen": 12216272, + "step": 21055 + }, + { + "epoch": 3.136729222520107, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0297058345879952, + "loss": 0.7967, + "num_input_tokens_seen": 12219120, + "step": 21060 + }, + { + "epoch": 3.1374739350610663, + "grad_norm": 0.01318359375, + "learning_rate": 0.02970545024134323, + "loss": 0.7949, + "num_input_tokens_seen": 12221840, + "step": 21065 + }, + { + "epoch": 3.1382186476020255, + "grad_norm": 0.02197265625, + "learning_rate": 0.02970506564625762, + "loss": 0.8028, + "num_input_tokens_seen": 12225136, + "step": 21070 + }, + { + "epoch": 3.1389633601429847, + "grad_norm": 0.0257568359375, + "learning_rate": 0.029704680802744878, + "loss": 0.8136, + "num_input_tokens_seen": 12227856, + "step": 21075 + }, + { + "epoch": 3.139708072683944, + "grad_norm": 0.0208740234375, + "learning_rate": 0.029704295710811495, + "loss": 0.7932, + "num_input_tokens_seen": 12230704, + "step": 21080 + }, + { + "epoch": 3.140452785224903, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02970391037046398, + "loss": 0.7958, + "num_input_tokens_seen": 12233872, + "step": 21085 + }, + { + "epoch": 3.1411974977658623, + "grad_norm": 0.029052734375, + "learning_rate": 0.029703524781708845, + "loss": 0.7905, + "num_input_tokens_seen": 12236496, + "step": 21090 + }, + { + "epoch": 3.1419422103068215, + "grad_norm": 0.0206298828125, + "learning_rate": 0.029703138944552603, + "loss": 0.7995, + "num_input_tokens_seen": 12239216, + "step": 21095 + }, + { + "epoch": 3.1426869228477807, + "grad_norm": 0.015625, + "learning_rate": 0.02970275285900177, + "loss": 0.8004, + "num_input_tokens_seen": 12242032, + "step": 21100 + }, + { + "epoch": 3.14343163538874, + "grad_norm": 0.0206298828125, + "learning_rate": 0.029702366525062873, + "loss": 0.8064, + "num_input_tokens_seen": 12245232, + "step": 21105 + }, + { + "epoch": 3.144176347929699, + "grad_norm": 0.035888671875, + "learning_rate": 0.029701979942742436, + "loss": 0.7988, + "num_input_tokens_seen": 12247856, + "step": 21110 + }, + { + "epoch": 3.1449210604706583, + "grad_norm": 0.01348876953125, + "learning_rate": 0.029701593112046988, + "loss": 0.8235, + "num_input_tokens_seen": 12250480, + "step": 21115 + }, + { + "epoch": 3.1456657730116175, + "grad_norm": 0.02587890625, + "learning_rate": 0.02970120603298307, + "loss": 0.8026, + "num_input_tokens_seen": 12253232, + "step": 21120 + }, + { + "epoch": 3.1464104855525767, + "grad_norm": 0.0216064453125, + "learning_rate": 0.029700818705557214, + "loss": 0.793, + "num_input_tokens_seen": 12256240, + "step": 21125 + }, + { + "epoch": 3.147155198093536, + "grad_norm": 0.044677734375, + "learning_rate": 0.02970043112977597, + "loss": 0.7954, + "num_input_tokens_seen": 12259344, + "step": 21130 + }, + { + "epoch": 3.147899910634495, + "grad_norm": 0.0191650390625, + "learning_rate": 0.029700043305645882, + "loss": 0.8081, + "num_input_tokens_seen": 12262192, + "step": 21135 + }, + { + "epoch": 3.1486446231754543, + "grad_norm": 0.028564453125, + "learning_rate": 0.029699655233173503, + "loss": 0.8197, + "num_input_tokens_seen": 12265136, + "step": 21140 + }, + { + "epoch": 3.1493893357164136, + "grad_norm": 0.019775390625, + "learning_rate": 0.02969926691236539, + "loss": 0.7956, + "num_input_tokens_seen": 12268240, + "step": 21145 + }, + { + "epoch": 3.1501340482573728, + "grad_norm": 0.0205078125, + "learning_rate": 0.029698878343228104, + "loss": 0.7959, + "num_input_tokens_seen": 12271120, + "step": 21150 + }, + { + "epoch": 3.150878760798332, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0296984895257682, + "loss": 0.7889, + "num_input_tokens_seen": 12273776, + "step": 21155 + }, + { + "epoch": 3.151623473339291, + "grad_norm": 0.026123046875, + "learning_rate": 0.02969810045999226, + "loss": 0.8093, + "num_input_tokens_seen": 12277008, + "step": 21160 + }, + { + "epoch": 3.1523681858802504, + "grad_norm": 0.040283203125, + "learning_rate": 0.02969771114590685, + "loss": 0.7849, + "num_input_tokens_seen": 12279568, + "step": 21165 + }, + { + "epoch": 3.1531128984212096, + "grad_norm": 0.026611328125, + "learning_rate": 0.02969732158351855, + "loss": 0.8253, + "num_input_tokens_seen": 12282352, + "step": 21170 + }, + { + "epoch": 3.1538576109621688, + "grad_norm": 0.01507568359375, + "learning_rate": 0.02969693177283394, + "loss": 0.8045, + "num_input_tokens_seen": 12285296, + "step": 21175 + }, + { + "epoch": 3.154602323503128, + "grad_norm": 0.0238037109375, + "learning_rate": 0.029696541713859603, + "loss": 0.7996, + "num_input_tokens_seen": 12287984, + "step": 21180 + }, + { + "epoch": 3.155347036044087, + "grad_norm": 0.0283203125, + "learning_rate": 0.029696151406602127, + "loss": 0.8004, + "num_input_tokens_seen": 12290928, + "step": 21185 + }, + { + "epoch": 3.1560917485850464, + "grad_norm": 0.02734375, + "learning_rate": 0.029695760851068113, + "loss": 0.7976, + "num_input_tokens_seen": 12294192, + "step": 21190 + }, + { + "epoch": 3.156836461126005, + "grad_norm": 0.0218505859375, + "learning_rate": 0.029695370047264157, + "loss": 0.7981, + "num_input_tokens_seen": 12296976, + "step": 21195 + }, + { + "epoch": 3.157581173666965, + "grad_norm": 0.032958984375, + "learning_rate": 0.02969497899519686, + "loss": 0.7681, + "num_input_tokens_seen": 12299984, + "step": 21200 + }, + { + "epoch": 3.1583258862079235, + "grad_norm": 0.021240234375, + "learning_rate": 0.029694587694872827, + "loss": 0.8084, + "num_input_tokens_seen": 12303152, + "step": 21205 + }, + { + "epoch": 3.1590705987488827, + "grad_norm": 0.029052734375, + "learning_rate": 0.02969419614629867, + "loss": 0.7661, + "num_input_tokens_seen": 12306032, + "step": 21210 + }, + { + "epoch": 3.159815311289842, + "grad_norm": 0.022705078125, + "learning_rate": 0.029693804349481, + "loss": 0.797, + "num_input_tokens_seen": 12308656, + "step": 21215 + }, + { + "epoch": 3.160560023830801, + "grad_norm": 0.029296875, + "learning_rate": 0.029693412304426445, + "loss": 0.7726, + "num_input_tokens_seen": 12311600, + "step": 21220 + }, + { + "epoch": 3.1613047363717603, + "grad_norm": 0.016845703125, + "learning_rate": 0.02969302001114162, + "loss": 0.8402, + "num_input_tokens_seen": 12314416, + "step": 21225 + }, + { + "epoch": 3.1620494489127196, + "grad_norm": 0.053955078125, + "learning_rate": 0.029692627469633158, + "loss": 0.8017, + "num_input_tokens_seen": 12317520, + "step": 21230 + }, + { + "epoch": 3.1627941614536788, + "grad_norm": 0.031005859375, + "learning_rate": 0.029692234679907684, + "loss": 0.8306, + "num_input_tokens_seen": 12320496, + "step": 21235 + }, + { + "epoch": 3.163538873994638, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02969184164197184, + "loss": 0.7907, + "num_input_tokens_seen": 12323856, + "step": 21240 + }, + { + "epoch": 3.164283586535597, + "grad_norm": 0.0277099609375, + "learning_rate": 0.02969144835583226, + "loss": 0.7879, + "num_input_tokens_seen": 12326736, + "step": 21245 + }, + { + "epoch": 3.1650282990765564, + "grad_norm": 0.02099609375, + "learning_rate": 0.029691054821495595, + "loss": 0.8249, + "num_input_tokens_seen": 12329456, + "step": 21250 + }, + { + "epoch": 3.1657730116175156, + "grad_norm": 0.0152587890625, + "learning_rate": 0.02969066103896849, + "loss": 0.8153, + "num_input_tokens_seen": 12332240, + "step": 21255 + }, + { + "epoch": 3.1665177241584748, + "grad_norm": 0.0152587890625, + "learning_rate": 0.029690267008257595, + "loss": 0.8133, + "num_input_tokens_seen": 12335120, + "step": 21260 + }, + { + "epoch": 3.167262436699434, + "grad_norm": 0.0341796875, + "learning_rate": 0.029689872729369574, + "loss": 0.7894, + "num_input_tokens_seen": 12338384, + "step": 21265 + }, + { + "epoch": 3.168007149240393, + "grad_norm": 0.02197265625, + "learning_rate": 0.029689478202311083, + "loss": 0.8123, + "num_input_tokens_seen": 12341200, + "step": 21270 + }, + { + "epoch": 3.1687518617813524, + "grad_norm": 0.0133056640625, + "learning_rate": 0.029689083427088783, + "loss": 0.7965, + "num_input_tokens_seen": 12344080, + "step": 21275 + }, + { + "epoch": 3.1694965743223116, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02968868840370935, + "loss": 0.7908, + "num_input_tokens_seen": 12347120, + "step": 21280 + }, + { + "epoch": 3.170241286863271, + "grad_norm": 0.032470703125, + "learning_rate": 0.029688293132179456, + "loss": 0.7806, + "num_input_tokens_seen": 12350128, + "step": 21285 + }, + { + "epoch": 3.17098599940423, + "grad_norm": 0.01263427734375, + "learning_rate": 0.02968789761250578, + "loss": 0.796, + "num_input_tokens_seen": 12352848, + "step": 21290 + }, + { + "epoch": 3.171730711945189, + "grad_norm": 0.015625, + "learning_rate": 0.029687501844695, + "loss": 0.7968, + "num_input_tokens_seen": 12355920, + "step": 21295 + }, + { + "epoch": 3.1724754244861484, + "grad_norm": 0.019775390625, + "learning_rate": 0.029687105828753804, + "loss": 0.8106, + "num_input_tokens_seen": 12359120, + "step": 21300 + }, + { + "epoch": 3.1732201370271076, + "grad_norm": 0.0205078125, + "learning_rate": 0.029686709564688878, + "loss": 0.8074, + "num_input_tokens_seen": 12361808, + "step": 21305 + }, + { + "epoch": 3.173964849568067, + "grad_norm": 0.01318359375, + "learning_rate": 0.029686313052506928, + "loss": 0.7908, + "num_input_tokens_seen": 12364816, + "step": 21310 + }, + { + "epoch": 3.174709562109026, + "grad_norm": 0.0260009765625, + "learning_rate": 0.029685916292214645, + "loss": 0.7775, + "num_input_tokens_seen": 12367696, + "step": 21315 + }, + { + "epoch": 3.175454274649985, + "grad_norm": 0.0302734375, + "learning_rate": 0.029685519283818726, + "loss": 0.8055, + "num_input_tokens_seen": 12370704, + "step": 21320 + }, + { + "epoch": 3.1761989871909444, + "grad_norm": 0.0172119140625, + "learning_rate": 0.02968512202732589, + "loss": 0.7918, + "num_input_tokens_seen": 12373584, + "step": 21325 + }, + { + "epoch": 3.1769436997319036, + "grad_norm": 0.038818359375, + "learning_rate": 0.02968472452274284, + "loss": 0.7896, + "num_input_tokens_seen": 12376912, + "step": 21330 + }, + { + "epoch": 3.177688412272863, + "grad_norm": 0.0224609375, + "learning_rate": 0.029684326770076295, + "loss": 0.8314, + "num_input_tokens_seen": 12379920, + "step": 21335 + }, + { + "epoch": 3.178433124813822, + "grad_norm": 0.0247802734375, + "learning_rate": 0.029683928769332974, + "loss": 0.8021, + "num_input_tokens_seen": 12382864, + "step": 21340 + }, + { + "epoch": 3.179177837354781, + "grad_norm": 0.02099609375, + "learning_rate": 0.029683530520519603, + "loss": 0.8319, + "num_input_tokens_seen": 12385744, + "step": 21345 + }, + { + "epoch": 3.1799225498957404, + "grad_norm": 0.027587890625, + "learning_rate": 0.0296831320236429, + "loss": 0.7937, + "num_input_tokens_seen": 12388624, + "step": 21350 + }, + { + "epoch": 3.1806672624366996, + "grad_norm": 0.0115966796875, + "learning_rate": 0.02968273327870961, + "loss": 0.7954, + "num_input_tokens_seen": 12391408, + "step": 21355 + }, + { + "epoch": 3.181411974977659, + "grad_norm": 0.031982421875, + "learning_rate": 0.029682334285726465, + "loss": 0.7958, + "num_input_tokens_seen": 12394320, + "step": 21360 + }, + { + "epoch": 3.182156687518618, + "grad_norm": 0.0130615234375, + "learning_rate": 0.0296819350447002, + "loss": 0.8357, + "num_input_tokens_seen": 12397264, + "step": 21365 + }, + { + "epoch": 3.182901400059577, + "grad_norm": 0.020751953125, + "learning_rate": 0.029681535555637573, + "loss": 0.8059, + "num_input_tokens_seen": 12399984, + "step": 21370 + }, + { + "epoch": 3.1836461126005364, + "grad_norm": 0.01507568359375, + "learning_rate": 0.02968113581854532, + "loss": 0.8141, + "num_input_tokens_seen": 12402832, + "step": 21375 + }, + { + "epoch": 3.184390825141495, + "grad_norm": 0.0238037109375, + "learning_rate": 0.029680735833430196, + "loss": 0.7999, + "num_input_tokens_seen": 12405712, + "step": 21380 + }, + { + "epoch": 3.1851355376824544, + "grad_norm": 0.023193359375, + "learning_rate": 0.02968033560029896, + "loss": 0.7978, + "num_input_tokens_seen": 12408752, + "step": 21385 + }, + { + "epoch": 3.1858802502234136, + "grad_norm": 0.0203857421875, + "learning_rate": 0.029679935119158375, + "loss": 0.8055, + "num_input_tokens_seen": 12411760, + "step": 21390 + }, + { + "epoch": 3.186624962764373, + "grad_norm": 0.0196533203125, + "learning_rate": 0.029679534390015208, + "loss": 0.8135, + "num_input_tokens_seen": 12414224, + "step": 21395 + }, + { + "epoch": 3.187369675305332, + "grad_norm": 0.025634765625, + "learning_rate": 0.029679133412876226, + "loss": 0.815, + "num_input_tokens_seen": 12417296, + "step": 21400 + }, + { + "epoch": 3.188114387846291, + "grad_norm": 0.0234375, + "learning_rate": 0.02967873218774821, + "loss": 0.7924, + "num_input_tokens_seen": 12420240, + "step": 21405 + }, + { + "epoch": 3.1888591003872504, + "grad_norm": 0.031494140625, + "learning_rate": 0.029678330714637928, + "loss": 0.7989, + "num_input_tokens_seen": 12422928, + "step": 21410 + }, + { + "epoch": 3.1896038129282096, + "grad_norm": 0.0281982421875, + "learning_rate": 0.029677928993552167, + "loss": 0.8001, + "num_input_tokens_seen": 12425936, + "step": 21415 + }, + { + "epoch": 3.190348525469169, + "grad_norm": 0.020263671875, + "learning_rate": 0.02967752702449771, + "loss": 0.8159, + "num_input_tokens_seen": 12429040, + "step": 21420 + }, + { + "epoch": 3.191093238010128, + "grad_norm": 0.023681640625, + "learning_rate": 0.02967712480748136, + "loss": 0.8129, + "num_input_tokens_seen": 12431952, + "step": 21425 + }, + { + "epoch": 3.191837950551087, + "grad_norm": 0.026611328125, + "learning_rate": 0.029676722342509895, + "loss": 0.8242, + "num_input_tokens_seen": 12434800, + "step": 21430 + }, + { + "epoch": 3.1925826630920464, + "grad_norm": 0.0257568359375, + "learning_rate": 0.02967631962959013, + "loss": 0.8067, + "num_input_tokens_seen": 12437744, + "step": 21435 + }, + { + "epoch": 3.1933273756330056, + "grad_norm": 0.022705078125, + "learning_rate": 0.029675916668728857, + "loss": 0.7893, + "num_input_tokens_seen": 12440496, + "step": 21440 + }, + { + "epoch": 3.194072088173965, + "grad_norm": 0.0220947265625, + "learning_rate": 0.029675513459932888, + "loss": 0.7854, + "num_input_tokens_seen": 12443504, + "step": 21445 + }, + { + "epoch": 3.194816800714924, + "grad_norm": 0.0233154296875, + "learning_rate": 0.029675110003209035, + "loss": 0.8123, + "num_input_tokens_seen": 12446352, + "step": 21450 + }, + { + "epoch": 3.1955615132558832, + "grad_norm": 0.02197265625, + "learning_rate": 0.029674706298564113, + "loss": 0.7985, + "num_input_tokens_seen": 12449392, + "step": 21455 + }, + { + "epoch": 3.1963062257968424, + "grad_norm": 0.0157470703125, + "learning_rate": 0.029674302346004943, + "loss": 0.8149, + "num_input_tokens_seen": 12452336, + "step": 21460 + }, + { + "epoch": 3.1970509383378016, + "grad_norm": 0.0279541015625, + "learning_rate": 0.02967389814553835, + "loss": 0.824, + "num_input_tokens_seen": 12455280, + "step": 21465 + }, + { + "epoch": 3.197795650878761, + "grad_norm": 0.0184326171875, + "learning_rate": 0.02967349369717116, + "loss": 0.7976, + "num_input_tokens_seen": 12458384, + "step": 21470 + }, + { + "epoch": 3.19854036341972, + "grad_norm": 0.018798828125, + "learning_rate": 0.02967308900091021, + "loss": 0.8134, + "num_input_tokens_seen": 12461136, + "step": 21475 + }, + { + "epoch": 3.1992850759606792, + "grad_norm": 0.01318359375, + "learning_rate": 0.029672684056762332, + "loss": 0.7986, + "num_input_tokens_seen": 12463856, + "step": 21480 + }, + { + "epoch": 3.2000297885016384, + "grad_norm": 0.01519775390625, + "learning_rate": 0.029672278864734374, + "loss": 0.801, + "num_input_tokens_seen": 12466704, + "step": 21485 + }, + { + "epoch": 3.2007745010425976, + "grad_norm": 0.02099609375, + "learning_rate": 0.02967187342483317, + "loss": 0.8019, + "num_input_tokens_seen": 12469744, + "step": 21490 + }, + { + "epoch": 3.201519213583557, + "grad_norm": 0.031494140625, + "learning_rate": 0.029671467737065582, + "loss": 0.8013, + "num_input_tokens_seen": 12472432, + "step": 21495 + }, + { + "epoch": 3.202263926124516, + "grad_norm": 0.023193359375, + "learning_rate": 0.029671061801438456, + "loss": 0.8106, + "num_input_tokens_seen": 12475312, + "step": 21500 + }, + { + "epoch": 3.2030086386654752, + "grad_norm": 0.0142822265625, + "learning_rate": 0.02967065561795865, + "loss": 0.8024, + "num_input_tokens_seen": 12478448, + "step": 21505 + }, + { + "epoch": 3.2037533512064345, + "grad_norm": 0.031494140625, + "learning_rate": 0.02967024918663303, + "loss": 0.7906, + "num_input_tokens_seen": 12481520, + "step": 21510 + }, + { + "epoch": 3.2044980637473937, + "grad_norm": 0.0218505859375, + "learning_rate": 0.029669842507468458, + "loss": 0.8216, + "num_input_tokens_seen": 12484656, + "step": 21515 + }, + { + "epoch": 3.205242776288353, + "grad_norm": 0.022216796875, + "learning_rate": 0.02966943558047181, + "loss": 0.8073, + "num_input_tokens_seen": 12487376, + "step": 21520 + }, + { + "epoch": 3.205987488829312, + "grad_norm": 0.020263671875, + "learning_rate": 0.029669028405649955, + "loss": 0.7942, + "num_input_tokens_seen": 12490480, + "step": 21525 + }, + { + "epoch": 3.2067322013702713, + "grad_norm": 0.01373291015625, + "learning_rate": 0.029668620983009777, + "loss": 0.8112, + "num_input_tokens_seen": 12493360, + "step": 21530 + }, + { + "epoch": 3.2074769139112305, + "grad_norm": 0.025390625, + "learning_rate": 0.029668213312558152, + "loss": 0.7892, + "num_input_tokens_seen": 12496272, + "step": 21535 + }, + { + "epoch": 3.2082216264521897, + "grad_norm": 0.019775390625, + "learning_rate": 0.02966780539430197, + "loss": 0.8089, + "num_input_tokens_seen": 12499152, + "step": 21540 + }, + { + "epoch": 3.2089663389931484, + "grad_norm": 0.0130615234375, + "learning_rate": 0.02966739722824813, + "loss": 0.8046, + "num_input_tokens_seen": 12502320, + "step": 21545 + }, + { + "epoch": 3.2097110515341076, + "grad_norm": 0.021484375, + "learning_rate": 0.029666988814403515, + "loss": 0.8022, + "num_input_tokens_seen": 12505008, + "step": 21550 + }, + { + "epoch": 3.210455764075067, + "grad_norm": 0.02001953125, + "learning_rate": 0.029666580152775037, + "loss": 0.7985, + "num_input_tokens_seen": 12508016, + "step": 21555 + }, + { + "epoch": 3.211200476616026, + "grad_norm": 0.01446533203125, + "learning_rate": 0.029666171243369586, + "loss": 0.7977, + "num_input_tokens_seen": 12510896, + "step": 21560 + }, + { + "epoch": 3.2119451891569852, + "grad_norm": 0.0211181640625, + "learning_rate": 0.029665762086194085, + "loss": 0.7737, + "num_input_tokens_seen": 12513872, + "step": 21565 + }, + { + "epoch": 3.2126899016979444, + "grad_norm": 0.0172119140625, + "learning_rate": 0.029665352681255436, + "loss": 0.7865, + "num_input_tokens_seen": 12516816, + "step": 21570 + }, + { + "epoch": 3.2134346142389036, + "grad_norm": 0.0201416015625, + "learning_rate": 0.029664943028560556, + "loss": 0.8033, + "num_input_tokens_seen": 12519632, + "step": 21575 + }, + { + "epoch": 3.214179326779863, + "grad_norm": 0.03271484375, + "learning_rate": 0.02966453312811637, + "loss": 0.8058, + "num_input_tokens_seen": 12522352, + "step": 21580 + }, + { + "epoch": 3.214924039320822, + "grad_norm": 0.0203857421875, + "learning_rate": 0.029664122979929805, + "loss": 0.793, + "num_input_tokens_seen": 12525104, + "step": 21585 + }, + { + "epoch": 3.2156687518617812, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02966371258400778, + "loss": 0.7711, + "num_input_tokens_seen": 12527920, + "step": 21590 + }, + { + "epoch": 3.2164134644027405, + "grad_norm": 0.02197265625, + "learning_rate": 0.02966330194035724, + "loss": 0.7951, + "num_input_tokens_seen": 12530800, + "step": 21595 + }, + { + "epoch": 3.2171581769436997, + "grad_norm": 0.022705078125, + "learning_rate": 0.029662891048985115, + "loss": 0.8237, + "num_input_tokens_seen": 12533936, + "step": 21600 + }, + { + "epoch": 3.217902889484659, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02966247990989835, + "loss": 0.815, + "num_input_tokens_seen": 12536656, + "step": 21605 + }, + { + "epoch": 3.218647602025618, + "grad_norm": 0.018798828125, + "learning_rate": 0.029662068523103886, + "loss": 0.7835, + "num_input_tokens_seen": 12539600, + "step": 21610 + }, + { + "epoch": 3.2193923145665773, + "grad_norm": 0.022705078125, + "learning_rate": 0.029661656888608678, + "loss": 0.8154, + "num_input_tokens_seen": 12542352, + "step": 21615 + }, + { + "epoch": 3.2201370271075365, + "grad_norm": 0.019775390625, + "learning_rate": 0.02966124500641968, + "loss": 0.8243, + "num_input_tokens_seen": 12545296, + "step": 21620 + }, + { + "epoch": 3.2208817396484957, + "grad_norm": 0.0213623046875, + "learning_rate": 0.029660832876543843, + "loss": 0.8201, + "num_input_tokens_seen": 12548240, + "step": 21625 + }, + { + "epoch": 3.221626452189455, + "grad_norm": 0.022216796875, + "learning_rate": 0.02966042049898814, + "loss": 0.8154, + "num_input_tokens_seen": 12551152, + "step": 21630 + }, + { + "epoch": 3.222371164730414, + "grad_norm": 0.0137939453125, + "learning_rate": 0.029660007873759536, + "loss": 0.7994, + "num_input_tokens_seen": 12553840, + "step": 21635 + }, + { + "epoch": 3.2231158772713733, + "grad_norm": 0.041015625, + "learning_rate": 0.02965959500086499, + "loss": 0.8069, + "num_input_tokens_seen": 12556976, + "step": 21640 + }, + { + "epoch": 3.2238605898123325, + "grad_norm": 0.03076171875, + "learning_rate": 0.029659181880311487, + "loss": 0.8012, + "num_input_tokens_seen": 12559664, + "step": 21645 + }, + { + "epoch": 3.2246053023532917, + "grad_norm": 0.019287109375, + "learning_rate": 0.02965876851210601, + "loss": 0.802, + "num_input_tokens_seen": 12562704, + "step": 21650 + }, + { + "epoch": 3.225350014894251, + "grad_norm": 0.019287109375, + "learning_rate": 0.029658354896255536, + "loss": 0.7952, + "num_input_tokens_seen": 12565392, + "step": 21655 + }, + { + "epoch": 3.22609472743521, + "grad_norm": 0.02001953125, + "learning_rate": 0.029657941032767053, + "loss": 0.8031, + "num_input_tokens_seen": 12568208, + "step": 21660 + }, + { + "epoch": 3.2268394399761693, + "grad_norm": 0.0203857421875, + "learning_rate": 0.029657526921647557, + "loss": 0.8142, + "num_input_tokens_seen": 12571088, + "step": 21665 + }, + { + "epoch": 3.2275841525171285, + "grad_norm": 0.01348876953125, + "learning_rate": 0.02965711256290404, + "loss": 0.7976, + "num_input_tokens_seen": 12573552, + "step": 21670 + }, + { + "epoch": 3.2283288650580877, + "grad_norm": 0.0224609375, + "learning_rate": 0.029656697956543496, + "loss": 0.8067, + "num_input_tokens_seen": 12576752, + "step": 21675 + }, + { + "epoch": 3.229073577599047, + "grad_norm": 0.023193359375, + "learning_rate": 0.029656283102572945, + "loss": 0.7962, + "num_input_tokens_seen": 12579536, + "step": 21680 + }, + { + "epoch": 3.229818290140006, + "grad_norm": 0.02587890625, + "learning_rate": 0.029655868000999383, + "loss": 0.8002, + "num_input_tokens_seen": 12582384, + "step": 21685 + }, + { + "epoch": 3.2305630026809653, + "grad_norm": 0.01409912109375, + "learning_rate": 0.02965545265182983, + "loss": 0.8069, + "num_input_tokens_seen": 12585616, + "step": 21690 + }, + { + "epoch": 3.2313077152219245, + "grad_norm": 0.01458740234375, + "learning_rate": 0.029655037055071293, + "loss": 0.7832, + "num_input_tokens_seen": 12588304, + "step": 21695 + }, + { + "epoch": 3.2320524277628837, + "grad_norm": 0.0303955078125, + "learning_rate": 0.029654621210730807, + "loss": 0.7994, + "num_input_tokens_seen": 12591120, + "step": 21700 + }, + { + "epoch": 3.232797140303843, + "grad_norm": 0.01361083984375, + "learning_rate": 0.029654205118815388, + "loss": 0.8325, + "num_input_tokens_seen": 12593648, + "step": 21705 + }, + { + "epoch": 3.233541852844802, + "grad_norm": 0.0233154296875, + "learning_rate": 0.029653788779332065, + "loss": 0.8066, + "num_input_tokens_seen": 12596496, + "step": 21710 + }, + { + "epoch": 3.2342865653857613, + "grad_norm": 0.0286865234375, + "learning_rate": 0.029653372192287874, + "loss": 0.8063, + "num_input_tokens_seen": 12599248, + "step": 21715 + }, + { + "epoch": 3.23503127792672, + "grad_norm": 0.0308837890625, + "learning_rate": 0.029652955357689855, + "loss": 0.8098, + "num_input_tokens_seen": 12601904, + "step": 21720 + }, + { + "epoch": 3.2357759904676793, + "grad_norm": 0.019775390625, + "learning_rate": 0.029652538275545048, + "loss": 0.798, + "num_input_tokens_seen": 12604880, + "step": 21725 + }, + { + "epoch": 3.2365207030086385, + "grad_norm": 0.02587890625, + "learning_rate": 0.029652120945860498, + "loss": 0.8068, + "num_input_tokens_seen": 12607824, + "step": 21730 + }, + { + "epoch": 3.2372654155495977, + "grad_norm": 0.0223388671875, + "learning_rate": 0.029651703368643253, + "loss": 0.8078, + "num_input_tokens_seen": 12610800, + "step": 21735 + }, + { + "epoch": 3.238010128090557, + "grad_norm": 0.0198974609375, + "learning_rate": 0.029651285543900376, + "loss": 0.7906, + "num_input_tokens_seen": 12613616, + "step": 21740 + }, + { + "epoch": 3.238754840631516, + "grad_norm": 0.020751953125, + "learning_rate": 0.02965086747163892, + "loss": 0.7907, + "num_input_tokens_seen": 12616880, + "step": 21745 + }, + { + "epoch": 3.2394995531724753, + "grad_norm": 0.0274658203125, + "learning_rate": 0.029650449151865946, + "loss": 0.8176, + "num_input_tokens_seen": 12619792, + "step": 21750 + }, + { + "epoch": 3.2402442657134345, + "grad_norm": 0.0279541015625, + "learning_rate": 0.029650030584588524, + "loss": 0.7776, + "num_input_tokens_seen": 12622576, + "step": 21755 + }, + { + "epoch": 3.2409889782543937, + "grad_norm": 0.0201416015625, + "learning_rate": 0.029649611769813726, + "loss": 0.7804, + "num_input_tokens_seen": 12625456, + "step": 21760 + }, + { + "epoch": 3.241733690795353, + "grad_norm": 0.0289306640625, + "learning_rate": 0.02964919270754863, + "loss": 0.7791, + "num_input_tokens_seen": 12628400, + "step": 21765 + }, + { + "epoch": 3.242478403336312, + "grad_norm": 0.0196533203125, + "learning_rate": 0.029648773397800306, + "loss": 0.7998, + "num_input_tokens_seen": 12631248, + "step": 21770 + }, + { + "epoch": 3.2432231158772713, + "grad_norm": 0.01904296875, + "learning_rate": 0.029648353840575844, + "loss": 0.7885, + "num_input_tokens_seen": 12634192, + "step": 21775 + }, + { + "epoch": 3.2439678284182305, + "grad_norm": 0.0291748046875, + "learning_rate": 0.029647934035882336, + "loss": 0.8044, + "num_input_tokens_seen": 12636880, + "step": 21780 + }, + { + "epoch": 3.2447125409591897, + "grad_norm": 0.019287109375, + "learning_rate": 0.029647513983726868, + "loss": 0.8009, + "num_input_tokens_seen": 12640016, + "step": 21785 + }, + { + "epoch": 3.245457253500149, + "grad_norm": 0.0181884765625, + "learning_rate": 0.029647093684116537, + "loss": 0.7945, + "num_input_tokens_seen": 12643120, + "step": 21790 + }, + { + "epoch": 3.246201966041108, + "grad_norm": 0.0203857421875, + "learning_rate": 0.029646673137058446, + "loss": 0.8067, + "num_input_tokens_seen": 12646000, + "step": 21795 + }, + { + "epoch": 3.2469466785820673, + "grad_norm": 0.018310546875, + "learning_rate": 0.0296462523425597, + "loss": 0.7817, + "num_input_tokens_seen": 12648784, + "step": 21800 + }, + { + "epoch": 3.2476913911230265, + "grad_norm": 0.0289306640625, + "learning_rate": 0.029645831300627402, + "loss": 0.817, + "num_input_tokens_seen": 12651632, + "step": 21805 + }, + { + "epoch": 3.2484361036639857, + "grad_norm": 0.0191650390625, + "learning_rate": 0.029645410011268674, + "loss": 0.7973, + "num_input_tokens_seen": 12654256, + "step": 21810 + }, + { + "epoch": 3.249180816204945, + "grad_norm": 0.027587890625, + "learning_rate": 0.029644988474490628, + "loss": 0.8217, + "num_input_tokens_seen": 12657104, + "step": 21815 + }, + { + "epoch": 3.249925528745904, + "grad_norm": 0.0213623046875, + "learning_rate": 0.029644566690300383, + "loss": 0.7899, + "num_input_tokens_seen": 12659728, + "step": 21820 + }, + { + "epoch": 3.2506702412868633, + "grad_norm": 0.0284423828125, + "learning_rate": 0.029644144658705074, + "loss": 0.7822, + "num_input_tokens_seen": 12662768, + "step": 21825 + }, + { + "epoch": 3.2514149538278225, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02964372237971182, + "loss": 0.8036, + "num_input_tokens_seen": 12665776, + "step": 21830 + }, + { + "epoch": 3.2521596663687817, + "grad_norm": 0.02099609375, + "learning_rate": 0.02964329985332776, + "loss": 0.7991, + "num_input_tokens_seen": 12668592, + "step": 21835 + }, + { + "epoch": 3.252904378909741, + "grad_norm": 0.0233154296875, + "learning_rate": 0.02964287707956003, + "loss": 0.82, + "num_input_tokens_seen": 12671664, + "step": 21840 + }, + { + "epoch": 3.2536490914507, + "grad_norm": 0.025146484375, + "learning_rate": 0.029642454058415778, + "loss": 0.7987, + "num_input_tokens_seen": 12674512, + "step": 21845 + }, + { + "epoch": 3.2543938039916593, + "grad_norm": 0.0166015625, + "learning_rate": 0.029642030789902147, + "loss": 0.7706, + "num_input_tokens_seen": 12677520, + "step": 21850 + }, + { + "epoch": 3.2551385165326185, + "grad_norm": 0.0206298828125, + "learning_rate": 0.029641607274026285, + "loss": 0.8081, + "num_input_tokens_seen": 12680304, + "step": 21855 + }, + { + "epoch": 3.2558832290735777, + "grad_norm": 0.02978515625, + "learning_rate": 0.02964118351079535, + "loss": 0.8143, + "num_input_tokens_seen": 12683280, + "step": 21860 + }, + { + "epoch": 3.256627941614537, + "grad_norm": 0.0274658203125, + "learning_rate": 0.029640759500216496, + "loss": 0.8283, + "num_input_tokens_seen": 12686352, + "step": 21865 + }, + { + "epoch": 3.257372654155496, + "grad_norm": 0.0213623046875, + "learning_rate": 0.029640335242296895, + "loss": 0.7921, + "num_input_tokens_seen": 12689424, + "step": 21870 + }, + { + "epoch": 3.2581173666964554, + "grad_norm": 0.0234375, + "learning_rate": 0.029639910737043713, + "loss": 0.8247, + "num_input_tokens_seen": 12692304, + "step": 21875 + }, + { + "epoch": 3.2588620792374146, + "grad_norm": 0.0299072265625, + "learning_rate": 0.02963948598446411, + "loss": 0.7873, + "num_input_tokens_seen": 12695184, + "step": 21880 + }, + { + "epoch": 3.2596067917783733, + "grad_norm": 0.020751953125, + "learning_rate": 0.029639060984565275, + "loss": 0.8113, + "num_input_tokens_seen": 12697808, + "step": 21885 + }, + { + "epoch": 3.260351504319333, + "grad_norm": 0.0211181640625, + "learning_rate": 0.029638635737354386, + "loss": 0.7801, + "num_input_tokens_seen": 12700720, + "step": 21890 + }, + { + "epoch": 3.2610962168602917, + "grad_norm": 0.01287841796875, + "learning_rate": 0.02963821024283862, + "loss": 0.8095, + "num_input_tokens_seen": 12703472, + "step": 21895 + }, + { + "epoch": 3.2618409294012514, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02963778450102517, + "loss": 0.8075, + "num_input_tokens_seen": 12707152, + "step": 21900 + }, + { + "epoch": 3.26258564194221, + "grad_norm": 0.0291748046875, + "learning_rate": 0.02963735851192123, + "loss": 0.8044, + "num_input_tokens_seen": 12710192, + "step": 21905 + }, + { + "epoch": 3.2633303544831693, + "grad_norm": 0.0185546875, + "learning_rate": 0.02963693227553399, + "loss": 0.814, + "num_input_tokens_seen": 12713264, + "step": 21910 + }, + { + "epoch": 3.2640750670241285, + "grad_norm": 0.0296630859375, + "learning_rate": 0.029636505791870663, + "loss": 0.7795, + "num_input_tokens_seen": 12716112, + "step": 21915 + }, + { + "epoch": 3.2648197795650877, + "grad_norm": 0.029296875, + "learning_rate": 0.02963607906093844, + "loss": 0.8099, + "num_input_tokens_seen": 12718736, + "step": 21920 + }, + { + "epoch": 3.265564492106047, + "grad_norm": 0.031982421875, + "learning_rate": 0.02963565208274454, + "loss": 0.8225, + "num_input_tokens_seen": 12721680, + "step": 21925 + }, + { + "epoch": 3.266309204647006, + "grad_norm": 0.021240234375, + "learning_rate": 0.029635224857296173, + "loss": 0.7968, + "num_input_tokens_seen": 12724496, + "step": 21930 + }, + { + "epoch": 3.2670539171879653, + "grad_norm": 0.01470947265625, + "learning_rate": 0.029634797384600554, + "loss": 0.8065, + "num_input_tokens_seen": 12727440, + "step": 21935 + }, + { + "epoch": 3.2677986297289245, + "grad_norm": 0.01611328125, + "learning_rate": 0.02963436966466491, + "loss": 0.8278, + "num_input_tokens_seen": 12730320, + "step": 21940 + }, + { + "epoch": 3.2685433422698837, + "grad_norm": 0.0191650390625, + "learning_rate": 0.029633941697496464, + "loss": 0.8235, + "num_input_tokens_seen": 12733264, + "step": 21945 + }, + { + "epoch": 3.269288054810843, + "grad_norm": 0.0234375, + "learning_rate": 0.029633513483102444, + "loss": 0.7802, + "num_input_tokens_seen": 12736144, + "step": 21950 + }, + { + "epoch": 3.270032767351802, + "grad_norm": 0.02099609375, + "learning_rate": 0.029633085021490087, + "loss": 0.7983, + "num_input_tokens_seen": 12739184, + "step": 21955 + }, + { + "epoch": 3.2707774798927614, + "grad_norm": 0.01513671875, + "learning_rate": 0.029632656312666632, + "loss": 0.7993, + "num_input_tokens_seen": 12742224, + "step": 21960 + }, + { + "epoch": 3.2715221924337206, + "grad_norm": 0.019775390625, + "learning_rate": 0.029632227356639323, + "loss": 0.8111, + "num_input_tokens_seen": 12745232, + "step": 21965 + }, + { + "epoch": 3.2722669049746798, + "grad_norm": 0.0115966796875, + "learning_rate": 0.029631798153415403, + "loss": 0.8163, + "num_input_tokens_seen": 12747984, + "step": 21970 + }, + { + "epoch": 3.273011617515639, + "grad_norm": 0.0211181640625, + "learning_rate": 0.029631368703002123, + "loss": 0.8033, + "num_input_tokens_seen": 12751184, + "step": 21975 + }, + { + "epoch": 3.273756330056598, + "grad_norm": 0.0272216796875, + "learning_rate": 0.02963093900540674, + "loss": 0.7936, + "num_input_tokens_seen": 12753744, + "step": 21980 + }, + { + "epoch": 3.2745010425975574, + "grad_norm": 0.01324462890625, + "learning_rate": 0.02963050906063651, + "loss": 0.8066, + "num_input_tokens_seen": 12756848, + "step": 21985 + }, + { + "epoch": 3.2752457551385166, + "grad_norm": 0.01190185546875, + "learning_rate": 0.029630078868698704, + "loss": 0.821, + "num_input_tokens_seen": 12759728, + "step": 21990 + }, + { + "epoch": 3.2759904676794758, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02962964842960058, + "loss": 0.8073, + "num_input_tokens_seen": 12762576, + "step": 21995 + }, + { + "epoch": 3.276735180220435, + "grad_norm": 0.02392578125, + "learning_rate": 0.029629217743349416, + "loss": 0.8035, + "num_input_tokens_seen": 12765520, + "step": 22000 + }, + { + "epoch": 3.277479892761394, + "grad_norm": 0.0269775390625, + "learning_rate": 0.02962878680995249, + "loss": 0.8135, + "num_input_tokens_seen": 12768336, + "step": 22005 + }, + { + "epoch": 3.2782246053023534, + "grad_norm": 0.020263671875, + "learning_rate": 0.02962835562941708, + "loss": 0.7898, + "num_input_tokens_seen": 12771184, + "step": 22010 + }, + { + "epoch": 3.2789693178433126, + "grad_norm": 0.020751953125, + "learning_rate": 0.029627924201750463, + "loss": 0.7988, + "num_input_tokens_seen": 12774320, + "step": 22015 + }, + { + "epoch": 3.279714030384272, + "grad_norm": 0.032470703125, + "learning_rate": 0.029627492526959936, + "loss": 0.796, + "num_input_tokens_seen": 12777008, + "step": 22020 + }, + { + "epoch": 3.280458742925231, + "grad_norm": 0.0306396484375, + "learning_rate": 0.029627060605052793, + "loss": 0.7879, + "num_input_tokens_seen": 12780240, + "step": 22025 + }, + { + "epoch": 3.28120345546619, + "grad_norm": 0.0189208984375, + "learning_rate": 0.029626628436036324, + "loss": 0.8076, + "num_input_tokens_seen": 12783056, + "step": 22030 + }, + { + "epoch": 3.2819481680071494, + "grad_norm": 0.019775390625, + "learning_rate": 0.029626196019917835, + "loss": 0.7818, + "num_input_tokens_seen": 12785712, + "step": 22035 + }, + { + "epoch": 3.2826928805481086, + "grad_norm": 0.0174560546875, + "learning_rate": 0.029625763356704627, + "loss": 0.8194, + "num_input_tokens_seen": 12788496, + "step": 22040 + }, + { + "epoch": 3.283437593089068, + "grad_norm": 0.0255126953125, + "learning_rate": 0.029625330446404013, + "loss": 0.8671, + "num_input_tokens_seen": 12791088, + "step": 22045 + }, + { + "epoch": 3.284182305630027, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02962489728902331, + "loss": 0.8054, + "num_input_tokens_seen": 12794064, + "step": 22050 + }, + { + "epoch": 3.284927018170986, + "grad_norm": 0.018310546875, + "learning_rate": 0.029624463884569824, + "loss": 0.821, + "num_input_tokens_seen": 12796848, + "step": 22055 + }, + { + "epoch": 3.285671730711945, + "grad_norm": 0.02197265625, + "learning_rate": 0.02962403023305089, + "loss": 0.8041, + "num_input_tokens_seen": 12799824, + "step": 22060 + }, + { + "epoch": 3.2864164432529046, + "grad_norm": 0.02294921875, + "learning_rate": 0.02962359633447383, + "loss": 0.8181, + "num_input_tokens_seen": 12802928, + "step": 22065 + }, + { + "epoch": 3.2871611557938634, + "grad_norm": 0.0174560546875, + "learning_rate": 0.029623162188845968, + "loss": 0.7962, + "num_input_tokens_seen": 12805456, + "step": 22070 + }, + { + "epoch": 3.2879058683348226, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02962272779617464, + "loss": 0.8073, + "num_input_tokens_seen": 12808112, + "step": 22075 + }, + { + "epoch": 3.2886505808757818, + "grad_norm": 0.0201416015625, + "learning_rate": 0.029622293156467194, + "loss": 0.8079, + "num_input_tokens_seen": 12811088, + "step": 22080 + }, + { + "epoch": 3.289395293416741, + "grad_norm": 0.041015625, + "learning_rate": 0.029621858269730968, + "loss": 0.812, + "num_input_tokens_seen": 12813872, + "step": 22085 + }, + { + "epoch": 3.2901400059577, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0296214231359733, + "loss": 0.8101, + "num_input_tokens_seen": 12816688, + "step": 22090 + }, + { + "epoch": 3.2908847184986594, + "grad_norm": 0.01239013671875, + "learning_rate": 0.029620987755201552, + "loss": 0.8147, + "num_input_tokens_seen": 12819760, + "step": 22095 + }, + { + "epoch": 3.2916294310396186, + "grad_norm": 0.020263671875, + "learning_rate": 0.02962055212742308, + "loss": 0.7993, + "num_input_tokens_seen": 12822736, + "step": 22100 + }, + { + "epoch": 3.292374143580578, + "grad_norm": 0.0198974609375, + "learning_rate": 0.029620116252645236, + "loss": 0.7916, + "num_input_tokens_seen": 12825360, + "step": 22105 + }, + { + "epoch": 3.293118856121537, + "grad_norm": 0.024658203125, + "learning_rate": 0.029619680130875386, + "loss": 0.805, + "num_input_tokens_seen": 12828400, + "step": 22110 + }, + { + "epoch": 3.293863568662496, + "grad_norm": 0.0130615234375, + "learning_rate": 0.0296192437621209, + "loss": 0.8006, + "num_input_tokens_seen": 12831312, + "step": 22115 + }, + { + "epoch": 3.2946082812034554, + "grad_norm": 0.0211181640625, + "learning_rate": 0.029618807146389155, + "loss": 0.7896, + "num_input_tokens_seen": 12834128, + "step": 22120 + }, + { + "epoch": 3.2953529937444146, + "grad_norm": 0.021240234375, + "learning_rate": 0.029618370283687512, + "loss": 0.8136, + "num_input_tokens_seen": 12837360, + "step": 22125 + }, + { + "epoch": 3.296097706285374, + "grad_norm": 0.01275634765625, + "learning_rate": 0.02961793317402336, + "loss": 0.8021, + "num_input_tokens_seen": 12840496, + "step": 22130 + }, + { + "epoch": 3.296842418826333, + "grad_norm": 0.0194091796875, + "learning_rate": 0.029617495817404093, + "loss": 0.8023, + "num_input_tokens_seen": 12843472, + "step": 22135 + }, + { + "epoch": 3.297587131367292, + "grad_norm": 0.01446533203125, + "learning_rate": 0.029617058213837085, + "loss": 0.7941, + "num_input_tokens_seen": 12846576, + "step": 22140 + }, + { + "epoch": 3.2983318439082514, + "grad_norm": 0.018798828125, + "learning_rate": 0.029616620363329734, + "loss": 0.8129, + "num_input_tokens_seen": 12849264, + "step": 22145 + }, + { + "epoch": 3.2990765564492106, + "grad_norm": 0.01324462890625, + "learning_rate": 0.029616182265889444, + "loss": 0.8095, + "num_input_tokens_seen": 12852176, + "step": 22150 + }, + { + "epoch": 3.29982126899017, + "grad_norm": 0.0118408203125, + "learning_rate": 0.0296157439215236, + "loss": 0.8113, + "num_input_tokens_seen": 12854992, + "step": 22155 + }, + { + "epoch": 3.300565981531129, + "grad_norm": 0.01397705078125, + "learning_rate": 0.029615305330239622, + "loss": 0.8022, + "num_input_tokens_seen": 12857872, + "step": 22160 + }, + { + "epoch": 3.301310694072088, + "grad_norm": 0.0213623046875, + "learning_rate": 0.029614866492044915, + "loss": 0.7964, + "num_input_tokens_seen": 12860752, + "step": 22165 + }, + { + "epoch": 3.3020554066130474, + "grad_norm": 0.013916015625, + "learning_rate": 0.02961442740694689, + "loss": 0.8053, + "num_input_tokens_seen": 12863440, + "step": 22170 + }, + { + "epoch": 3.3028001191540066, + "grad_norm": 0.021728515625, + "learning_rate": 0.02961398807495297, + "loss": 0.7871, + "num_input_tokens_seen": 12866224, + "step": 22175 + }, + { + "epoch": 3.303544831694966, + "grad_norm": 0.0208740234375, + "learning_rate": 0.029613548496070577, + "loss": 0.7953, + "num_input_tokens_seen": 12869040, + "step": 22180 + }, + { + "epoch": 3.304289544235925, + "grad_norm": 0.0223388671875, + "learning_rate": 0.029613108670307135, + "loss": 0.816, + "num_input_tokens_seen": 12871888, + "step": 22185 + }, + { + "epoch": 3.3050342567768842, + "grad_norm": 0.0267333984375, + "learning_rate": 0.02961266859767007, + "loss": 0.7864, + "num_input_tokens_seen": 12874576, + "step": 22190 + }, + { + "epoch": 3.3057789693178434, + "grad_norm": 0.032958984375, + "learning_rate": 0.029612228278166817, + "loss": 0.8195, + "num_input_tokens_seen": 12877456, + "step": 22195 + }, + { + "epoch": 3.3065236818588026, + "grad_norm": 0.01409912109375, + "learning_rate": 0.029611787711804824, + "loss": 0.8056, + "num_input_tokens_seen": 12880592, + "step": 22200 + }, + { + "epoch": 3.307268394399762, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02961134689859153, + "loss": 0.8096, + "num_input_tokens_seen": 12883472, + "step": 22205 + }, + { + "epoch": 3.308013106940721, + "grad_norm": 0.01226806640625, + "learning_rate": 0.029610905838534377, + "loss": 0.807, + "num_input_tokens_seen": 12886032, + "step": 22210 + }, + { + "epoch": 3.3087578194816802, + "grad_norm": 0.01611328125, + "learning_rate": 0.02961046453164082, + "loss": 0.7944, + "num_input_tokens_seen": 12889360, + "step": 22215 + }, + { + "epoch": 3.3095025320226394, + "grad_norm": 0.01953125, + "learning_rate": 0.029610022977918316, + "loss": 0.7991, + "num_input_tokens_seen": 12892048, + "step": 22220 + }, + { + "epoch": 3.310247244563598, + "grad_norm": 0.0133056640625, + "learning_rate": 0.02960958117737432, + "loss": 0.8006, + "num_input_tokens_seen": 12894896, + "step": 22225 + }, + { + "epoch": 3.310991957104558, + "grad_norm": 0.0177001953125, + "learning_rate": 0.029609139130016303, + "loss": 0.787, + "num_input_tokens_seen": 12897648, + "step": 22230 + }, + { + "epoch": 3.3117366696455166, + "grad_norm": 0.0224609375, + "learning_rate": 0.029608696835851725, + "loss": 0.7696, + "num_input_tokens_seen": 12900272, + "step": 22235 + }, + { + "epoch": 3.3124813821864763, + "grad_norm": 0.0196533203125, + "learning_rate": 0.029608254294888062, + "loss": 0.8079, + "num_input_tokens_seen": 12903344, + "step": 22240 + }, + { + "epoch": 3.313226094727435, + "grad_norm": 0.025146484375, + "learning_rate": 0.029607811507132792, + "loss": 0.7818, + "num_input_tokens_seen": 12905904, + "step": 22245 + }, + { + "epoch": 3.313970807268394, + "grad_norm": 0.026123046875, + "learning_rate": 0.02960736847259339, + "loss": 0.7968, + "num_input_tokens_seen": 12908752, + "step": 22250 + }, + { + "epoch": 3.3147155198093534, + "grad_norm": 0.0164794921875, + "learning_rate": 0.029606925191277347, + "loss": 0.8064, + "num_input_tokens_seen": 12911664, + "step": 22255 + }, + { + "epoch": 3.3154602323503126, + "grad_norm": 0.0322265625, + "learning_rate": 0.029606481663192147, + "loss": 0.7883, + "num_input_tokens_seen": 12914512, + "step": 22260 + }, + { + "epoch": 3.316204944891272, + "grad_norm": 0.0206298828125, + "learning_rate": 0.029606037888345284, + "loss": 0.8021, + "num_input_tokens_seen": 12917232, + "step": 22265 + }, + { + "epoch": 3.316949657432231, + "grad_norm": 0.0380859375, + "learning_rate": 0.029605593866744256, + "loss": 0.8132, + "num_input_tokens_seen": 12920016, + "step": 22270 + }, + { + "epoch": 3.3176943699731902, + "grad_norm": 0.0262451171875, + "learning_rate": 0.029605149598396566, + "loss": 0.7917, + "num_input_tokens_seen": 12923024, + "step": 22275 + }, + { + "epoch": 3.3184390825141494, + "grad_norm": 0.0235595703125, + "learning_rate": 0.029604705083309715, + "loss": 0.8085, + "num_input_tokens_seen": 12925872, + "step": 22280 + }, + { + "epoch": 3.3191837950551086, + "grad_norm": 0.021484375, + "learning_rate": 0.029604260321491216, + "loss": 0.8204, + "num_input_tokens_seen": 12928720, + "step": 22285 + }, + { + "epoch": 3.319928507596068, + "grad_norm": 0.0361328125, + "learning_rate": 0.029603815312948582, + "loss": 0.7887, + "num_input_tokens_seen": 12931600, + "step": 22290 + }, + { + "epoch": 3.320673220137027, + "grad_norm": 0.0283203125, + "learning_rate": 0.02960337005768933, + "loss": 0.793, + "num_input_tokens_seen": 12934224, + "step": 22295 + }, + { + "epoch": 3.3214179326779862, + "grad_norm": 0.031494140625, + "learning_rate": 0.029602924555720986, + "loss": 0.8032, + "num_input_tokens_seen": 12937296, + "step": 22300 + }, + { + "epoch": 3.3221626452189454, + "grad_norm": 0.07373046875, + "learning_rate": 0.02960247880705107, + "loss": 0.8323, + "num_input_tokens_seen": 12940112, + "step": 22305 + }, + { + "epoch": 3.3229073577599046, + "grad_norm": 0.041015625, + "learning_rate": 0.029602032811687117, + "loss": 0.7974, + "num_input_tokens_seen": 12943248, + "step": 22310 + }, + { + "epoch": 3.323652070300864, + "grad_norm": 0.035400390625, + "learning_rate": 0.029601586569636663, + "loss": 0.7942, + "num_input_tokens_seen": 12945968, + "step": 22315 + }, + { + "epoch": 3.324396782841823, + "grad_norm": 0.02099609375, + "learning_rate": 0.029601140080907244, + "loss": 0.7784, + "num_input_tokens_seen": 12949040, + "step": 22320 + }, + { + "epoch": 3.3251414953827823, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0296006933455064, + "loss": 0.789, + "num_input_tokens_seen": 12951504, + "step": 22325 + }, + { + "epoch": 3.3258862079237415, + "grad_norm": 0.03271484375, + "learning_rate": 0.029600246363441683, + "loss": 0.8215, + "num_input_tokens_seen": 12954608, + "step": 22330 + }, + { + "epoch": 3.3266309204647007, + "grad_norm": 0.04248046875, + "learning_rate": 0.02959979913472064, + "loss": 0.8048, + "num_input_tokens_seen": 12957232, + "step": 22335 + }, + { + "epoch": 3.32737563300566, + "grad_norm": 0.0228271484375, + "learning_rate": 0.029599351659350834, + "loss": 0.7949, + "num_input_tokens_seen": 12959856, + "step": 22340 + }, + { + "epoch": 3.328120345546619, + "grad_norm": 0.02685546875, + "learning_rate": 0.029598903937339816, + "loss": 0.8366, + "num_input_tokens_seen": 12962928, + "step": 22345 + }, + { + "epoch": 3.3288650580875783, + "grad_norm": 0.019775390625, + "learning_rate": 0.029598455968695158, + "loss": 0.8045, + "num_input_tokens_seen": 12965648, + "step": 22350 + }, + { + "epoch": 3.3296097706285375, + "grad_norm": 0.0242919921875, + "learning_rate": 0.02959800775342442, + "loss": 0.797, + "num_input_tokens_seen": 12968528, + "step": 22355 + }, + { + "epoch": 3.3303544831694967, + "grad_norm": 0.0191650390625, + "learning_rate": 0.029597559291535177, + "loss": 0.8071, + "num_input_tokens_seen": 12971184, + "step": 22360 + }, + { + "epoch": 3.331099195710456, + "grad_norm": 0.0281982421875, + "learning_rate": 0.02959711058303501, + "loss": 0.8141, + "num_input_tokens_seen": 12974128, + "step": 22365 + }, + { + "epoch": 3.331843908251415, + "grad_norm": 0.021240234375, + "learning_rate": 0.02959666162793149, + "loss": 0.7976, + "num_input_tokens_seen": 12976976, + "step": 22370 + }, + { + "epoch": 3.3325886207923743, + "grad_norm": 0.0135498046875, + "learning_rate": 0.029596212426232213, + "loss": 0.7949, + "num_input_tokens_seen": 12980112, + "step": 22375 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.02783203125, + "learning_rate": 0.02959576297794476, + "loss": 0.816, + "num_input_tokens_seen": 12983184, + "step": 22380 + }, + { + "epoch": 3.3340780458742927, + "grad_norm": 0.021240234375, + "learning_rate": 0.029595313283076723, + "loss": 0.8027, + "num_input_tokens_seen": 12985968, + "step": 22385 + }, + { + "epoch": 3.334822758415252, + "grad_norm": 0.03076171875, + "learning_rate": 0.029594863341635705, + "loss": 0.7961, + "num_input_tokens_seen": 12989200, + "step": 22390 + }, + { + "epoch": 3.335567470956211, + "grad_norm": 0.01214599609375, + "learning_rate": 0.029594413153629304, + "loss": 0.7953, + "num_input_tokens_seen": 12991952, + "step": 22395 + }, + { + "epoch": 3.33631218349717, + "grad_norm": 0.017578125, + "learning_rate": 0.02959396271906513, + "loss": 0.8035, + "num_input_tokens_seen": 12994832, + "step": 22400 + }, + { + "epoch": 3.3370568960381295, + "grad_norm": 0.01141357421875, + "learning_rate": 0.029593512037950784, + "loss": 0.7962, + "num_input_tokens_seen": 12997616, + "step": 22405 + }, + { + "epoch": 3.3378016085790883, + "grad_norm": 0.0205078125, + "learning_rate": 0.029593061110293882, + "loss": 0.807, + "num_input_tokens_seen": 13000304, + "step": 22410 + }, + { + "epoch": 3.338546321120048, + "grad_norm": 0.01165771484375, + "learning_rate": 0.02959260993610205, + "loss": 0.804, + "num_input_tokens_seen": 13003056, + "step": 22415 + }, + { + "epoch": 3.3392910336610067, + "grad_norm": 0.0196533203125, + "learning_rate": 0.029592158515382896, + "loss": 0.8368, + "num_input_tokens_seen": 13006160, + "step": 22420 + }, + { + "epoch": 3.340035746201966, + "grad_norm": 0.0252685546875, + "learning_rate": 0.029591706848144064, + "loss": 0.7916, + "num_input_tokens_seen": 13008848, + "step": 22425 + }, + { + "epoch": 3.340780458742925, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02959125493439317, + "loss": 0.8118, + "num_input_tokens_seen": 13011536, + "step": 22430 + }, + { + "epoch": 3.3415251712838843, + "grad_norm": 0.0283203125, + "learning_rate": 0.029590802774137854, + "loss": 0.7999, + "num_input_tokens_seen": 13014384, + "step": 22435 + }, + { + "epoch": 3.3422698838248435, + "grad_norm": 0.0198974609375, + "learning_rate": 0.029590350367385755, + "loss": 0.791, + "num_input_tokens_seen": 13017456, + "step": 22440 + }, + { + "epoch": 3.3430145963658027, + "grad_norm": 0.0296630859375, + "learning_rate": 0.029589897714144518, + "loss": 0.7944, + "num_input_tokens_seen": 13020432, + "step": 22445 + }, + { + "epoch": 3.343759308906762, + "grad_norm": 0.013427734375, + "learning_rate": 0.029589444814421786, + "loss": 0.8062, + "num_input_tokens_seen": 13023312, + "step": 22450 + }, + { + "epoch": 3.344504021447721, + "grad_norm": 0.01287841796875, + "learning_rate": 0.02958899166822521, + "loss": 0.7836, + "num_input_tokens_seen": 13026096, + "step": 22455 + }, + { + "epoch": 3.3452487339886803, + "grad_norm": 0.019775390625, + "learning_rate": 0.02958853827556245, + "loss": 0.8101, + "num_input_tokens_seen": 13028592, + "step": 22460 + }, + { + "epoch": 3.3459934465296395, + "grad_norm": 0.020263671875, + "learning_rate": 0.02958808463644116, + "loss": 0.7927, + "num_input_tokens_seen": 13031888, + "step": 22465 + }, + { + "epoch": 3.3467381590705987, + "grad_norm": 0.028076171875, + "learning_rate": 0.02958763075086901, + "loss": 0.8235, + "num_input_tokens_seen": 13034672, + "step": 22470 + }, + { + "epoch": 3.347482871611558, + "grad_norm": 0.0196533203125, + "learning_rate": 0.029587176618853663, + "loss": 0.8008, + "num_input_tokens_seen": 13037520, + "step": 22475 + }, + { + "epoch": 3.348227584152517, + "grad_norm": 0.0213623046875, + "learning_rate": 0.029586722240402794, + "loss": 0.7863, + "num_input_tokens_seen": 13040688, + "step": 22480 + }, + { + "epoch": 3.3489722966934763, + "grad_norm": 0.0245361328125, + "learning_rate": 0.029586267615524076, + "loss": 0.8116, + "num_input_tokens_seen": 13043504, + "step": 22485 + }, + { + "epoch": 3.3497170092344355, + "grad_norm": 0.0185546875, + "learning_rate": 0.02958581274422519, + "loss": 0.7874, + "num_input_tokens_seen": 13046512, + "step": 22490 + }, + { + "epoch": 3.3504617217753947, + "grad_norm": 0.0205078125, + "learning_rate": 0.029585357626513827, + "loss": 0.7647, + "num_input_tokens_seen": 13049392, + "step": 22495 + }, + { + "epoch": 3.351206434316354, + "grad_norm": 0.0201416015625, + "learning_rate": 0.029584902262397666, + "loss": 0.7753, + "num_input_tokens_seen": 13052496, + "step": 22500 + }, + { + "epoch": 3.351951146857313, + "grad_norm": 0.0264892578125, + "learning_rate": 0.029584446651884408, + "loss": 0.8109, + "num_input_tokens_seen": 13055312, + "step": 22505 + }, + { + "epoch": 3.3526958593982723, + "grad_norm": 0.0196533203125, + "learning_rate": 0.029583990794981748, + "loss": 0.7866, + "num_input_tokens_seen": 13058288, + "step": 22510 + }, + { + "epoch": 3.3534405719392315, + "grad_norm": 0.0228271484375, + "learning_rate": 0.02958353469169738, + "loss": 0.7781, + "num_input_tokens_seen": 13061168, + "step": 22515 + }, + { + "epoch": 3.3541852844801907, + "grad_norm": 0.0224609375, + "learning_rate": 0.02958307834203902, + "loss": 0.7994, + "num_input_tokens_seen": 13064112, + "step": 22520 + }, + { + "epoch": 3.35492999702115, + "grad_norm": 0.0198974609375, + "learning_rate": 0.029582621746014375, + "loss": 0.7889, + "num_input_tokens_seen": 13067088, + "step": 22525 + }, + { + "epoch": 3.355674709562109, + "grad_norm": 0.0299072265625, + "learning_rate": 0.029582164903631154, + "loss": 0.7681, + "num_input_tokens_seen": 13070096, + "step": 22530 + }, + { + "epoch": 3.3564194221030683, + "grad_norm": 0.021728515625, + "learning_rate": 0.02958170781489708, + "loss": 0.7951, + "num_input_tokens_seen": 13073040, + "step": 22535 + }, + { + "epoch": 3.3571641346440275, + "grad_norm": 0.029052734375, + "learning_rate": 0.029581250479819868, + "loss": 0.804, + "num_input_tokens_seen": 13075792, + "step": 22540 + }, + { + "epoch": 3.3579088471849867, + "grad_norm": 0.0179443359375, + "learning_rate": 0.02958079289840725, + "loss": 0.8117, + "num_input_tokens_seen": 13078736, + "step": 22545 + }, + { + "epoch": 3.358653559725946, + "grad_norm": 0.0294189453125, + "learning_rate": 0.029580335070666958, + "loss": 0.7838, + "num_input_tokens_seen": 13081488, + "step": 22550 + }, + { + "epoch": 3.359398272266905, + "grad_norm": 0.0162353515625, + "learning_rate": 0.029579876996606722, + "loss": 0.7909, + "num_input_tokens_seen": 13084208, + "step": 22555 + }, + { + "epoch": 3.3601429848078643, + "grad_norm": 0.030517578125, + "learning_rate": 0.02957941867623428, + "loss": 0.8233, + "num_input_tokens_seen": 13087152, + "step": 22560 + }, + { + "epoch": 3.3608876973488235, + "grad_norm": 0.025146484375, + "learning_rate": 0.029578960109557382, + "loss": 0.8348, + "num_input_tokens_seen": 13089872, + "step": 22565 + }, + { + "epoch": 3.3616324098897827, + "grad_norm": 0.0228271484375, + "learning_rate": 0.029578501296583768, + "loss": 0.8056, + "num_input_tokens_seen": 13092944, + "step": 22570 + }, + { + "epoch": 3.3623771224307415, + "grad_norm": 0.0189208984375, + "learning_rate": 0.029578042237321193, + "loss": 0.8062, + "num_input_tokens_seen": 13095760, + "step": 22575 + }, + { + "epoch": 3.363121834971701, + "grad_norm": 0.03857421875, + "learning_rate": 0.029577582931777407, + "loss": 0.756, + "num_input_tokens_seen": 13098864, + "step": 22580 + }, + { + "epoch": 3.36386654751266, + "grad_norm": 0.0179443359375, + "learning_rate": 0.029577123379960178, + "loss": 0.8215, + "num_input_tokens_seen": 13102096, + "step": 22585 + }, + { + "epoch": 3.3646112600536195, + "grad_norm": 0.01165771484375, + "learning_rate": 0.02957666358187726, + "loss": 0.8522, + "num_input_tokens_seen": 13104752, + "step": 22590 + }, + { + "epoch": 3.3653559725945783, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02957620353753643, + "loss": 0.7584, + "num_input_tokens_seen": 13108080, + "step": 22595 + }, + { + "epoch": 3.3661006851355375, + "grad_norm": 0.0262451171875, + "learning_rate": 0.029575743246945455, + "loss": 0.795, + "num_input_tokens_seen": 13111152, + "step": 22600 + }, + { + "epoch": 3.3668453976764967, + "grad_norm": 0.0198974609375, + "learning_rate": 0.029575282710112113, + "loss": 0.7891, + "num_input_tokens_seen": 13114160, + "step": 22605 + }, + { + "epoch": 3.367590110217456, + "grad_norm": 0.0233154296875, + "learning_rate": 0.029574821927044182, + "loss": 0.7958, + "num_input_tokens_seen": 13117520, + "step": 22610 + }, + { + "epoch": 3.368334822758415, + "grad_norm": 0.013427734375, + "learning_rate": 0.02957436089774945, + "loss": 0.8249, + "num_input_tokens_seen": 13120272, + "step": 22615 + }, + { + "epoch": 3.3690795352993743, + "grad_norm": 0.021728515625, + "learning_rate": 0.029573899622235698, + "loss": 0.7679, + "num_input_tokens_seen": 13123056, + "step": 22620 + }, + { + "epoch": 3.3698242478403335, + "grad_norm": 0.01251220703125, + "learning_rate": 0.02957343810051073, + "loss": 0.817, + "num_input_tokens_seen": 13126064, + "step": 22625 + }, + { + "epoch": 3.3705689603812927, + "grad_norm": 0.021728515625, + "learning_rate": 0.029572976332582334, + "loss": 0.8235, + "num_input_tokens_seen": 13128720, + "step": 22630 + }, + { + "epoch": 3.371313672922252, + "grad_norm": 0.02783203125, + "learning_rate": 0.029572514318458313, + "loss": 0.8063, + "num_input_tokens_seen": 13131600, + "step": 22635 + }, + { + "epoch": 3.372058385463211, + "grad_norm": 0.01141357421875, + "learning_rate": 0.02957205205814647, + "loss": 0.8033, + "num_input_tokens_seen": 13134576, + "step": 22640 + }, + { + "epoch": 3.3728030980041703, + "grad_norm": 0.026611328125, + "learning_rate": 0.029571589551654627, + "loss": 0.8345, + "num_input_tokens_seen": 13137488, + "step": 22645 + }, + { + "epoch": 3.3735478105451295, + "grad_norm": 0.01226806640625, + "learning_rate": 0.029571126798990585, + "loss": 0.7883, + "num_input_tokens_seen": 13140240, + "step": 22650 + }, + { + "epoch": 3.3742925230860887, + "grad_norm": 0.0155029296875, + "learning_rate": 0.029570663800162163, + "loss": 0.8144, + "num_input_tokens_seen": 13143344, + "step": 22655 + }, + { + "epoch": 3.375037235627048, + "grad_norm": 0.0247802734375, + "learning_rate": 0.029570200555177187, + "loss": 0.83, + "num_input_tokens_seen": 13146640, + "step": 22660 + }, + { + "epoch": 3.375781948168007, + "grad_norm": 0.024658203125, + "learning_rate": 0.029569737064043478, + "loss": 0.8, + "num_input_tokens_seen": 13149616, + "step": 22665 + }, + { + "epoch": 3.3765266607089663, + "grad_norm": 0.02978515625, + "learning_rate": 0.029569273326768872, + "loss": 0.8095, + "num_input_tokens_seen": 13152496, + "step": 22670 + }, + { + "epoch": 3.3772713732499255, + "grad_norm": 0.036865234375, + "learning_rate": 0.029568809343361202, + "loss": 0.8202, + "num_input_tokens_seen": 13155408, + "step": 22675 + }, + { + "epoch": 3.3780160857908847, + "grad_norm": 0.02294921875, + "learning_rate": 0.0295683451138283, + "loss": 0.8069, + "num_input_tokens_seen": 13158128, + "step": 22680 + }, + { + "epoch": 3.378760798331844, + "grad_norm": 0.021240234375, + "learning_rate": 0.02956788063817802, + "loss": 0.8077, + "num_input_tokens_seen": 13161296, + "step": 22685 + }, + { + "epoch": 3.379505510872803, + "grad_norm": 0.0311279296875, + "learning_rate": 0.029567415916418198, + "loss": 0.8171, + "num_input_tokens_seen": 13164176, + "step": 22690 + }, + { + "epoch": 3.3802502234137624, + "grad_norm": 0.0322265625, + "learning_rate": 0.029566950948556697, + "loss": 0.7923, + "num_input_tokens_seen": 13166768, + "step": 22695 + }, + { + "epoch": 3.3809949359547216, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02956648573460136, + "loss": 0.7982, + "num_input_tokens_seen": 13169648, + "step": 22700 + }, + { + "epoch": 3.3817396484956808, + "grad_norm": 0.03466796875, + "learning_rate": 0.02956602027456005, + "loss": 0.7888, + "num_input_tokens_seen": 13172240, + "step": 22705 + }, + { + "epoch": 3.38248436103664, + "grad_norm": 0.0216064453125, + "learning_rate": 0.029565554568440632, + "loss": 0.81, + "num_input_tokens_seen": 13175824, + "step": 22710 + }, + { + "epoch": 3.383229073577599, + "grad_norm": 0.0281982421875, + "learning_rate": 0.029565088616250974, + "loss": 0.8258, + "num_input_tokens_seen": 13178608, + "step": 22715 + }, + { + "epoch": 3.3839737861185584, + "grad_norm": 0.021728515625, + "learning_rate": 0.029564622417998947, + "loss": 0.8201, + "num_input_tokens_seen": 13181424, + "step": 22720 + }, + { + "epoch": 3.3847184986595176, + "grad_norm": 0.0238037109375, + "learning_rate": 0.029564155973692427, + "loss": 0.8369, + "num_input_tokens_seen": 13184048, + "step": 22725 + }, + { + "epoch": 3.3854632112004768, + "grad_norm": 0.0306396484375, + "learning_rate": 0.029563689283339297, + "loss": 0.809, + "num_input_tokens_seen": 13187120, + "step": 22730 + }, + { + "epoch": 3.386207923741436, + "grad_norm": 0.01495361328125, + "learning_rate": 0.029563222346947433, + "loss": 0.8129, + "num_input_tokens_seen": 13190256, + "step": 22735 + }, + { + "epoch": 3.386952636282395, + "grad_norm": 0.024658203125, + "learning_rate": 0.029562755164524734, + "loss": 0.8122, + "num_input_tokens_seen": 13193200, + "step": 22740 + }, + { + "epoch": 3.3876973488233544, + "grad_norm": 0.01324462890625, + "learning_rate": 0.029562287736079083, + "loss": 0.8064, + "num_input_tokens_seen": 13196016, + "step": 22745 + }, + { + "epoch": 3.388442061364313, + "grad_norm": 0.028076171875, + "learning_rate": 0.029561820061618387, + "loss": 0.7903, + "num_input_tokens_seen": 13198960, + "step": 22750 + }, + { + "epoch": 3.389186773905273, + "grad_norm": 0.0269775390625, + "learning_rate": 0.029561352141150535, + "loss": 0.8089, + "num_input_tokens_seen": 13201872, + "step": 22755 + }, + { + "epoch": 3.3899314864462315, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02956088397468344, + "loss": 0.8083, + "num_input_tokens_seen": 13205072, + "step": 22760 + }, + { + "epoch": 3.390676198987191, + "grad_norm": 0.055908203125, + "learning_rate": 0.029560415562225013, + "loss": 0.8108, + "num_input_tokens_seen": 13208048, + "step": 22765 + }, + { + "epoch": 3.39142091152815, + "grad_norm": 0.02587890625, + "learning_rate": 0.029559946903783156, + "loss": 0.7941, + "num_input_tokens_seen": 13211216, + "step": 22770 + }, + { + "epoch": 3.392165624069109, + "grad_norm": 0.0194091796875, + "learning_rate": 0.029559477999365803, + "loss": 0.7838, + "num_input_tokens_seen": 13213936, + "step": 22775 + }, + { + "epoch": 3.3929103366100684, + "grad_norm": 0.0128173828125, + "learning_rate": 0.029559008848980863, + "loss": 0.8247, + "num_input_tokens_seen": 13216752, + "step": 22780 + }, + { + "epoch": 3.3936550491510276, + "grad_norm": 0.0225830078125, + "learning_rate": 0.029558539452636264, + "loss": 0.8217, + "num_input_tokens_seen": 13219472, + "step": 22785 + }, + { + "epoch": 3.3943997616919868, + "grad_norm": 0.01953125, + "learning_rate": 0.029558069810339937, + "loss": 0.7955, + "num_input_tokens_seen": 13222416, + "step": 22790 + }, + { + "epoch": 3.395144474232946, + "grad_norm": 0.0257568359375, + "learning_rate": 0.029557599922099817, + "loss": 0.7981, + "num_input_tokens_seen": 13225072, + "step": 22795 + }, + { + "epoch": 3.395889186773905, + "grad_norm": 0.0177001953125, + "learning_rate": 0.029557129787923846, + "loss": 0.8091, + "num_input_tokens_seen": 13227984, + "step": 22800 + }, + { + "epoch": 3.3966338993148644, + "grad_norm": 0.0157470703125, + "learning_rate": 0.029556659407819957, + "loss": 0.8, + "num_input_tokens_seen": 13230992, + "step": 22805 + }, + { + "epoch": 3.3973786118558236, + "grad_norm": 0.020751953125, + "learning_rate": 0.029556188781796108, + "loss": 0.8199, + "num_input_tokens_seen": 13233968, + "step": 22810 + }, + { + "epoch": 3.3981233243967828, + "grad_norm": 0.0205078125, + "learning_rate": 0.02955571790986024, + "loss": 0.8128, + "num_input_tokens_seen": 13237200, + "step": 22815 + }, + { + "epoch": 3.398868036937742, + "grad_norm": 0.0120849609375, + "learning_rate": 0.02955524679202031, + "loss": 0.7985, + "num_input_tokens_seen": 13239984, + "step": 22820 + }, + { + "epoch": 3.399612749478701, + "grad_norm": 0.02734375, + "learning_rate": 0.029554775428284285, + "loss": 0.8113, + "num_input_tokens_seen": 13243024, + "step": 22825 + }, + { + "epoch": 3.4003574620196604, + "grad_norm": 0.0257568359375, + "learning_rate": 0.029554303818660116, + "loss": 0.7999, + "num_input_tokens_seen": 13246000, + "step": 22830 + }, + { + "epoch": 3.4011021745606196, + "grad_norm": 0.0303955078125, + "learning_rate": 0.029553831963155777, + "loss": 0.7952, + "num_input_tokens_seen": 13248784, + "step": 22835 + }, + { + "epoch": 3.401846887101579, + "grad_norm": 0.014404296875, + "learning_rate": 0.02955335986177924, + "loss": 0.7947, + "num_input_tokens_seen": 13251600, + "step": 22840 + }, + { + "epoch": 3.402591599642538, + "grad_norm": 0.02294921875, + "learning_rate": 0.02955288751453848, + "loss": 0.8185, + "num_input_tokens_seen": 13254512, + "step": 22845 + }, + { + "epoch": 3.403336312183497, + "grad_norm": 0.042724609375, + "learning_rate": 0.029552414921441478, + "loss": 0.7827, + "num_input_tokens_seen": 13257648, + "step": 22850 + }, + { + "epoch": 3.4040810247244564, + "grad_norm": 0.0194091796875, + "learning_rate": 0.029551942082496212, + "loss": 0.8041, + "num_input_tokens_seen": 13260624, + "step": 22855 + }, + { + "epoch": 3.4048257372654156, + "grad_norm": 0.0267333984375, + "learning_rate": 0.029551468997710675, + "loss": 0.7856, + "num_input_tokens_seen": 13263760, + "step": 22860 + }, + { + "epoch": 3.405570449806375, + "grad_norm": 0.029052734375, + "learning_rate": 0.029550995667092864, + "loss": 0.7945, + "num_input_tokens_seen": 13266384, + "step": 22865 + }, + { + "epoch": 3.406315162347334, + "grad_norm": 0.0133056640625, + "learning_rate": 0.029550522090650767, + "loss": 0.8081, + "num_input_tokens_seen": 13269040, + "step": 22870 + }, + { + "epoch": 3.407059874888293, + "grad_norm": 0.0181884765625, + "learning_rate": 0.029550048268392384, + "loss": 0.8072, + "num_input_tokens_seen": 13272208, + "step": 22875 + }, + { + "epoch": 3.4078045874292524, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02954957420032573, + "loss": 0.8031, + "num_input_tokens_seen": 13275088, + "step": 22880 + }, + { + "epoch": 3.4085492999702116, + "grad_norm": 0.0216064453125, + "learning_rate": 0.029549099886458804, + "loss": 0.8091, + "num_input_tokens_seen": 13278448, + "step": 22885 + }, + { + "epoch": 3.409294012511171, + "grad_norm": 0.023681640625, + "learning_rate": 0.029548625326799624, + "loss": 0.7737, + "num_input_tokens_seen": 13281136, + "step": 22890 + }, + { + "epoch": 3.41003872505213, + "grad_norm": 0.01300048828125, + "learning_rate": 0.0295481505213562, + "loss": 0.8017, + "num_input_tokens_seen": 13284176, + "step": 22895 + }, + { + "epoch": 3.410783437593089, + "grad_norm": 0.019287109375, + "learning_rate": 0.029547675470136568, + "loss": 0.7983, + "num_input_tokens_seen": 13287024, + "step": 22900 + }, + { + "epoch": 3.4115281501340484, + "grad_norm": 0.0186767578125, + "learning_rate": 0.029547200173148738, + "loss": 0.77, + "num_input_tokens_seen": 13290096, + "step": 22905 + }, + { + "epoch": 3.4122728626750076, + "grad_norm": 0.019775390625, + "learning_rate": 0.029546724630400745, + "loss": 0.7924, + "num_input_tokens_seen": 13292880, + "step": 22910 + }, + { + "epoch": 3.413017575215967, + "grad_norm": 0.0191650390625, + "learning_rate": 0.029546248841900626, + "loss": 0.7985, + "num_input_tokens_seen": 13295696, + "step": 22915 + }, + { + "epoch": 3.413762287756926, + "grad_norm": 0.033203125, + "learning_rate": 0.029545772807656422, + "loss": 0.8143, + "num_input_tokens_seen": 13298448, + "step": 22920 + }, + { + "epoch": 3.414507000297885, + "grad_norm": 0.021240234375, + "learning_rate": 0.029545296527676165, + "loss": 0.8076, + "num_input_tokens_seen": 13301040, + "step": 22925 + }, + { + "epoch": 3.4152517128388444, + "grad_norm": 0.0179443359375, + "learning_rate": 0.02954482000196791, + "loss": 0.7795, + "num_input_tokens_seen": 13304112, + "step": 22930 + }, + { + "epoch": 3.415996425379803, + "grad_norm": 0.02392578125, + "learning_rate": 0.0295443432305397, + "loss": 0.8015, + "num_input_tokens_seen": 13306960, + "step": 22935 + }, + { + "epoch": 3.4167411379207624, + "grad_norm": 0.0125732421875, + "learning_rate": 0.029543866213399595, + "loss": 0.7976, + "num_input_tokens_seen": 13309520, + "step": 22940 + }, + { + "epoch": 3.4174858504617216, + "grad_norm": 0.0244140625, + "learning_rate": 0.029543388950555652, + "loss": 0.7818, + "num_input_tokens_seen": 13312368, + "step": 22945 + }, + { + "epoch": 3.418230563002681, + "grad_norm": 0.0250244140625, + "learning_rate": 0.02954291144201593, + "loss": 0.7599, + "num_input_tokens_seen": 13315280, + "step": 22950 + }, + { + "epoch": 3.41897527554364, + "grad_norm": 0.03759765625, + "learning_rate": 0.029542433687788503, + "loss": 0.8185, + "num_input_tokens_seen": 13318000, + "step": 22955 + }, + { + "epoch": 3.419719988084599, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02954195568788144, + "loss": 0.8147, + "num_input_tokens_seen": 13320688, + "step": 22960 + }, + { + "epoch": 3.4204647006255584, + "grad_norm": 0.0400390625, + "learning_rate": 0.029541477442302816, + "loss": 0.7904, + "num_input_tokens_seen": 13323600, + "step": 22965 + }, + { + "epoch": 3.4212094131665176, + "grad_norm": 0.029052734375, + "learning_rate": 0.02954099895106071, + "loss": 0.8019, + "num_input_tokens_seen": 13326416, + "step": 22970 + }, + { + "epoch": 3.421954125707477, + "grad_norm": 0.021240234375, + "learning_rate": 0.0295405202141632, + "loss": 0.7719, + "num_input_tokens_seen": 13329712, + "step": 22975 + }, + { + "epoch": 3.422698838248436, + "grad_norm": 0.0228271484375, + "learning_rate": 0.02954004123161839, + "loss": 0.7813, + "num_input_tokens_seen": 13332528, + "step": 22980 + }, + { + "epoch": 3.423443550789395, + "grad_norm": 0.0224609375, + "learning_rate": 0.029539562003434352, + "loss": 0.7595, + "num_input_tokens_seen": 13335760, + "step": 22985 + }, + { + "epoch": 3.4241882633303544, + "grad_norm": 0.0242919921875, + "learning_rate": 0.029539082529619193, + "loss": 0.8266, + "num_input_tokens_seen": 13338992, + "step": 22990 + }, + { + "epoch": 3.4249329758713136, + "grad_norm": 0.024169921875, + "learning_rate": 0.029538602810181017, + "loss": 0.8005, + "num_input_tokens_seen": 13341712, + "step": 22995 + }, + { + "epoch": 3.425677688412273, + "grad_norm": 0.02392578125, + "learning_rate": 0.02953812284512792, + "loss": 0.8068, + "num_input_tokens_seen": 13344528, + "step": 23000 + }, + { + "epoch": 3.426422400953232, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02953764263446801, + "loss": 0.8225, + "num_input_tokens_seen": 13347280, + "step": 23005 + }, + { + "epoch": 3.4271671134941912, + "grad_norm": 0.020751953125, + "learning_rate": 0.029537162178209404, + "loss": 0.8082, + "num_input_tokens_seen": 13350352, + "step": 23010 + }, + { + "epoch": 3.4279118260351504, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02953668147636022, + "loss": 0.7978, + "num_input_tokens_seen": 13353456, + "step": 23015 + }, + { + "epoch": 3.4286565385761096, + "grad_norm": 0.0184326171875, + "learning_rate": 0.02953620052892858, + "loss": 0.7813, + "num_input_tokens_seen": 13356368, + "step": 23020 + }, + { + "epoch": 3.429401251117069, + "grad_norm": 0.0223388671875, + "learning_rate": 0.029535719335922597, + "loss": 0.8335, + "num_input_tokens_seen": 13359408, + "step": 23025 + }, + { + "epoch": 3.430145963658028, + "grad_norm": 0.0177001953125, + "learning_rate": 0.029535237897350415, + "loss": 0.7891, + "num_input_tokens_seen": 13362064, + "step": 23030 + }, + { + "epoch": 3.4308906761989872, + "grad_norm": 0.014404296875, + "learning_rate": 0.02953475621322016, + "loss": 0.7775, + "num_input_tokens_seen": 13365136, + "step": 23035 + }, + { + "epoch": 3.4316353887399464, + "grad_norm": 0.0150146484375, + "learning_rate": 0.029534274283539975, + "loss": 0.7957, + "num_input_tokens_seen": 13368112, + "step": 23040 + }, + { + "epoch": 3.4323801012809056, + "grad_norm": 0.023193359375, + "learning_rate": 0.029533792108317993, + "loss": 0.8227, + "num_input_tokens_seen": 13371152, + "step": 23045 + }, + { + "epoch": 3.433124813821865, + "grad_norm": 0.0211181640625, + "learning_rate": 0.029533309687562372, + "loss": 0.8059, + "num_input_tokens_seen": 13373840, + "step": 23050 + }, + { + "epoch": 3.433869526362824, + "grad_norm": 0.0458984375, + "learning_rate": 0.02953282702128125, + "loss": 0.8331, + "num_input_tokens_seen": 13376944, + "step": 23055 + }, + { + "epoch": 3.4346142389037833, + "grad_norm": 0.019775390625, + "learning_rate": 0.029532344109482783, + "loss": 0.8069, + "num_input_tokens_seen": 13379600, + "step": 23060 + }, + { + "epoch": 3.4353589514447425, + "grad_norm": 0.0111083984375, + "learning_rate": 0.029531860952175137, + "loss": 0.7877, + "num_input_tokens_seen": 13382256, + "step": 23065 + }, + { + "epoch": 3.4361036639857017, + "grad_norm": 0.02294921875, + "learning_rate": 0.029531377549366467, + "loss": 0.8228, + "num_input_tokens_seen": 13385168, + "step": 23070 + }, + { + "epoch": 3.436848376526661, + "grad_norm": 0.0108642578125, + "learning_rate": 0.029530893901064944, + "loss": 0.8113, + "num_input_tokens_seen": 13387984, + "step": 23075 + }, + { + "epoch": 3.43759308906762, + "grad_norm": 0.0247802734375, + "learning_rate": 0.029530410007278733, + "loss": 0.8138, + "num_input_tokens_seen": 13390832, + "step": 23080 + }, + { + "epoch": 3.4383378016085793, + "grad_norm": 0.017333984375, + "learning_rate": 0.02952992586801602, + "loss": 0.8221, + "num_input_tokens_seen": 13393872, + "step": 23085 + }, + { + "epoch": 3.4390825141495385, + "grad_norm": 0.026611328125, + "learning_rate": 0.02952944148328497, + "loss": 0.7972, + "num_input_tokens_seen": 13397008, + "step": 23090 + }, + { + "epoch": 3.4398272266904977, + "grad_norm": 0.02294921875, + "learning_rate": 0.029528956853093775, + "loss": 0.8216, + "num_input_tokens_seen": 13399824, + "step": 23095 + }, + { + "epoch": 3.4405719392314564, + "grad_norm": 0.03662109375, + "learning_rate": 0.029528471977450628, + "loss": 0.8148, + "num_input_tokens_seen": 13402896, + "step": 23100 + }, + { + "epoch": 3.441316651772416, + "grad_norm": 0.0135498046875, + "learning_rate": 0.029527986856363705, + "loss": 0.8011, + "num_input_tokens_seen": 13406096, + "step": 23105 + }, + { + "epoch": 3.442061364313375, + "grad_norm": 0.0306396484375, + "learning_rate": 0.02952750148984121, + "loss": 0.8044, + "num_input_tokens_seen": 13409200, + "step": 23110 + }, + { + "epoch": 3.442806076854334, + "grad_norm": 0.038818359375, + "learning_rate": 0.029527015877891347, + "loss": 0.7914, + "num_input_tokens_seen": 13412304, + "step": 23115 + }, + { + "epoch": 3.4435507893952932, + "grad_norm": 0.0294189453125, + "learning_rate": 0.02952653002052231, + "loss": 0.8243, + "num_input_tokens_seen": 13414960, + "step": 23120 + }, + { + "epoch": 3.4442955019362524, + "grad_norm": 0.0281982421875, + "learning_rate": 0.029526043917742315, + "loss": 0.7843, + "num_input_tokens_seen": 13417680, + "step": 23125 + }, + { + "epoch": 3.4450402144772116, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02952555756955957, + "loss": 0.7921, + "num_input_tokens_seen": 13420240, + "step": 23130 + }, + { + "epoch": 3.445784927018171, + "grad_norm": 0.02978515625, + "learning_rate": 0.029525070975982295, + "loss": 0.8118, + "num_input_tokens_seen": 13423216, + "step": 23135 + }, + { + "epoch": 3.44652963955913, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02952458413701871, + "loss": 0.8218, + "num_input_tokens_seen": 13426192, + "step": 23140 + }, + { + "epoch": 3.4472743521000893, + "grad_norm": 0.020263671875, + "learning_rate": 0.029524097052677032, + "loss": 0.806, + "num_input_tokens_seen": 13429232, + "step": 23145 + }, + { + "epoch": 3.4480190646410485, + "grad_norm": 0.0228271484375, + "learning_rate": 0.029523609722965503, + "loss": 0.809, + "num_input_tokens_seen": 13432048, + "step": 23150 + }, + { + "epoch": 3.4487637771820077, + "grad_norm": 0.0208740234375, + "learning_rate": 0.029523122147892347, + "loss": 0.7781, + "num_input_tokens_seen": 13434960, + "step": 23155 + }, + { + "epoch": 3.449508489722967, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0295226343274658, + "loss": 0.8071, + "num_input_tokens_seen": 13437712, + "step": 23160 + }, + { + "epoch": 3.450253202263926, + "grad_norm": 0.038818359375, + "learning_rate": 0.029522146261694106, + "loss": 0.8179, + "num_input_tokens_seen": 13440496, + "step": 23165 + }, + { + "epoch": 3.4509979148048853, + "grad_norm": 0.027587890625, + "learning_rate": 0.029521657950585516, + "loss": 0.8272, + "num_input_tokens_seen": 13443632, + "step": 23170 + }, + { + "epoch": 3.4517426273458445, + "grad_norm": 0.04541015625, + "learning_rate": 0.02952116939414827, + "loss": 0.8068, + "num_input_tokens_seen": 13446608, + "step": 23175 + }, + { + "epoch": 3.4524873398868037, + "grad_norm": 0.0142822265625, + "learning_rate": 0.029520680592390627, + "loss": 0.8175, + "num_input_tokens_seen": 13449520, + "step": 23180 + }, + { + "epoch": 3.453232052427763, + "grad_norm": 0.03515625, + "learning_rate": 0.029520191545320842, + "loss": 0.803, + "num_input_tokens_seen": 13452240, + "step": 23185 + }, + { + "epoch": 3.453976764968722, + "grad_norm": 0.03369140625, + "learning_rate": 0.02951970225294718, + "loss": 0.8026, + "num_input_tokens_seen": 13455248, + "step": 23190 + }, + { + "epoch": 3.4547214775096813, + "grad_norm": 0.026123046875, + "learning_rate": 0.029519212715277902, + "loss": 0.8175, + "num_input_tokens_seen": 13458320, + "step": 23195 + }, + { + "epoch": 3.4554661900506405, + "grad_norm": 0.0245361328125, + "learning_rate": 0.029518722932321287, + "loss": 0.8023, + "num_input_tokens_seen": 13460912, + "step": 23200 + }, + { + "epoch": 3.4562109025915997, + "grad_norm": 0.0255126953125, + "learning_rate": 0.029518232904085603, + "loss": 0.7939, + "num_input_tokens_seen": 13463568, + "step": 23205 + }, + { + "epoch": 3.456955615132559, + "grad_norm": 0.01470947265625, + "learning_rate": 0.02951774263057913, + "loss": 0.8063, + "num_input_tokens_seen": 13466448, + "step": 23210 + }, + { + "epoch": 3.457700327673518, + "grad_norm": 0.029296875, + "learning_rate": 0.02951725211181015, + "loss": 0.7946, + "num_input_tokens_seen": 13469392, + "step": 23215 + }, + { + "epoch": 3.4584450402144773, + "grad_norm": 0.0133056640625, + "learning_rate": 0.029516761347786948, + "loss": 0.8062, + "num_input_tokens_seen": 13472464, + "step": 23220 + }, + { + "epoch": 3.4591897527554365, + "grad_norm": 0.0157470703125, + "learning_rate": 0.02951627033851782, + "loss": 0.8118, + "num_input_tokens_seen": 13475408, + "step": 23225 + }, + { + "epoch": 3.4599344652963957, + "grad_norm": 0.023681640625, + "learning_rate": 0.029515779084011057, + "loss": 0.8063, + "num_input_tokens_seen": 13478288, + "step": 23230 + }, + { + "epoch": 3.460679177837355, + "grad_norm": 0.028564453125, + "learning_rate": 0.02951528758427496, + "loss": 0.8039, + "num_input_tokens_seen": 13481232, + "step": 23235 + }, + { + "epoch": 3.461423890378314, + "grad_norm": 0.019775390625, + "learning_rate": 0.029514795839317834, + "loss": 0.8143, + "num_input_tokens_seen": 13484048, + "step": 23240 + }, + { + "epoch": 3.4621686029192733, + "grad_norm": 0.01336669921875, + "learning_rate": 0.02951430384914798, + "loss": 0.8242, + "num_input_tokens_seen": 13486832, + "step": 23245 + }, + { + "epoch": 3.4629133154602325, + "grad_norm": 0.021484375, + "learning_rate": 0.029513811613773713, + "loss": 0.7942, + "num_input_tokens_seen": 13489456, + "step": 23250 + }, + { + "epoch": 3.4636580280011917, + "grad_norm": 0.01300048828125, + "learning_rate": 0.029513319133203358, + "loss": 0.8106, + "num_input_tokens_seen": 13492272, + "step": 23255 + }, + { + "epoch": 3.464402740542151, + "grad_norm": 0.031494140625, + "learning_rate": 0.02951282640744522, + "loss": 0.7922, + "num_input_tokens_seen": 13495088, + "step": 23260 + }, + { + "epoch": 3.4651474530831097, + "grad_norm": 0.0184326171875, + "learning_rate": 0.02951233343650763, + "loss": 0.7903, + "num_input_tokens_seen": 13497904, + "step": 23265 + }, + { + "epoch": 3.4658921656240693, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02951184022039892, + "loss": 0.7952, + "num_input_tokens_seen": 13500880, + "step": 23270 + }, + { + "epoch": 3.466636878165028, + "grad_norm": 0.0179443359375, + "learning_rate": 0.029511346759127418, + "loss": 0.824, + "num_input_tokens_seen": 13503952, + "step": 23275 + }, + { + "epoch": 3.4673815907059877, + "grad_norm": 0.0206298828125, + "learning_rate": 0.029510853052701458, + "loss": 0.7916, + "num_input_tokens_seen": 13507056, + "step": 23280 + }, + { + "epoch": 3.4681263032469465, + "grad_norm": 0.0130615234375, + "learning_rate": 0.029510359101129386, + "loss": 0.8238, + "num_input_tokens_seen": 13509776, + "step": 23285 + }, + { + "epoch": 3.4688710157879057, + "grad_norm": 0.022705078125, + "learning_rate": 0.029509864904419543, + "loss": 0.8029, + "num_input_tokens_seen": 13512848, + "step": 23290 + }, + { + "epoch": 3.469615728328865, + "grad_norm": 0.026611328125, + "learning_rate": 0.029509370462580287, + "loss": 0.7914, + "num_input_tokens_seen": 13515888, + "step": 23295 + }, + { + "epoch": 3.470360440869824, + "grad_norm": 0.0233154296875, + "learning_rate": 0.029508875775619954, + "loss": 0.8048, + "num_input_tokens_seen": 13518352, + "step": 23300 + }, + { + "epoch": 3.4711051534107833, + "grad_norm": 0.0205078125, + "learning_rate": 0.029508380843546914, + "loss": 0.7968, + "num_input_tokens_seen": 13521200, + "step": 23305 + }, + { + "epoch": 3.4718498659517425, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02950788566636953, + "loss": 0.8093, + "num_input_tokens_seen": 13523984, + "step": 23310 + }, + { + "epoch": 3.4725945784927017, + "grad_norm": 0.015625, + "learning_rate": 0.029507390244096158, + "loss": 0.8007, + "num_input_tokens_seen": 13526608, + "step": 23315 + }, + { + "epoch": 3.473339291033661, + "grad_norm": 0.015380859375, + "learning_rate": 0.029506894576735172, + "loss": 0.8007, + "num_input_tokens_seen": 13529392, + "step": 23320 + }, + { + "epoch": 3.47408400357462, + "grad_norm": 0.01611328125, + "learning_rate": 0.02950639866429495, + "loss": 0.8191, + "num_input_tokens_seen": 13532208, + "step": 23325 + }, + { + "epoch": 3.4748287161155793, + "grad_norm": 0.0150146484375, + "learning_rate": 0.02950590250678386, + "loss": 0.7958, + "num_input_tokens_seen": 13535024, + "step": 23330 + }, + { + "epoch": 3.4755734286565385, + "grad_norm": 0.026123046875, + "learning_rate": 0.029505406104210298, + "loss": 0.8139, + "num_input_tokens_seen": 13538160, + "step": 23335 + }, + { + "epoch": 3.4763181411974977, + "grad_norm": 0.012939453125, + "learning_rate": 0.029504909456582636, + "loss": 0.806, + "num_input_tokens_seen": 13541360, + "step": 23340 + }, + { + "epoch": 3.477062853738457, + "grad_norm": 0.025146484375, + "learning_rate": 0.029504412563909274, + "loss": 0.8108, + "num_input_tokens_seen": 13544208, + "step": 23345 + }, + { + "epoch": 3.477807566279416, + "grad_norm": 0.0228271484375, + "learning_rate": 0.029503915426198606, + "loss": 0.7945, + "num_input_tokens_seen": 13547216, + "step": 23350 + }, + { + "epoch": 3.4785522788203753, + "grad_norm": 0.02734375, + "learning_rate": 0.029503418043459025, + "loss": 0.8204, + "num_input_tokens_seen": 13549840, + "step": 23355 + }, + { + "epoch": 3.4792969913613345, + "grad_norm": 0.01251220703125, + "learning_rate": 0.029502920415698936, + "loss": 0.8234, + "num_input_tokens_seen": 13552592, + "step": 23360 + }, + { + "epoch": 3.4800417039022937, + "grad_norm": 0.01275634765625, + "learning_rate": 0.02950242254292675, + "loss": 0.8144, + "num_input_tokens_seen": 13555408, + "step": 23365 + }, + { + "epoch": 3.480786416443253, + "grad_norm": 0.0238037109375, + "learning_rate": 0.029501924425150868, + "loss": 0.809, + "num_input_tokens_seen": 13558384, + "step": 23370 + }, + { + "epoch": 3.481531128984212, + "grad_norm": 0.024658203125, + "learning_rate": 0.029501426062379717, + "loss": 0.8031, + "num_input_tokens_seen": 13561488, + "step": 23375 + }, + { + "epoch": 3.4822758415251713, + "grad_norm": 0.0264892578125, + "learning_rate": 0.029500927454621708, + "loss": 0.8048, + "num_input_tokens_seen": 13564432, + "step": 23380 + }, + { + "epoch": 3.4830205540661305, + "grad_norm": 0.0281982421875, + "learning_rate": 0.029500428601885273, + "loss": 0.8047, + "num_input_tokens_seen": 13567472, + "step": 23385 + }, + { + "epoch": 3.4837652666070897, + "grad_norm": 0.0245361328125, + "learning_rate": 0.029499929504178832, + "loss": 0.7967, + "num_input_tokens_seen": 13570448, + "step": 23390 + }, + { + "epoch": 3.484509979148049, + "grad_norm": 0.02587890625, + "learning_rate": 0.02949943016151082, + "loss": 0.8016, + "num_input_tokens_seen": 13573584, + "step": 23395 + }, + { + "epoch": 3.485254691689008, + "grad_norm": 0.018310546875, + "learning_rate": 0.029498930573889668, + "loss": 0.798, + "num_input_tokens_seen": 13576624, + "step": 23400 + }, + { + "epoch": 3.4859994042299673, + "grad_norm": 0.01531982421875, + "learning_rate": 0.029498430741323824, + "loss": 0.8136, + "num_input_tokens_seen": 13579664, + "step": 23405 + }, + { + "epoch": 3.4867441167709265, + "grad_norm": 0.0218505859375, + "learning_rate": 0.029497930663821725, + "loss": 0.8126, + "num_input_tokens_seen": 13582736, + "step": 23410 + }, + { + "epoch": 3.4874888293118858, + "grad_norm": 0.0206298828125, + "learning_rate": 0.029497430341391826, + "loss": 0.7942, + "num_input_tokens_seen": 13585328, + "step": 23415 + }, + { + "epoch": 3.488233541852845, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02949692977404257, + "loss": 0.8122, + "num_input_tokens_seen": 13588112, + "step": 23420 + }, + { + "epoch": 3.488978254393804, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02949642896178243, + "loss": 0.7991, + "num_input_tokens_seen": 13590960, + "step": 23425 + }, + { + "epoch": 3.4897229669347634, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02949592790461985, + "loss": 0.798, + "num_input_tokens_seen": 13593776, + "step": 23430 + }, + { + "epoch": 3.4904676794757226, + "grad_norm": 0.0311279296875, + "learning_rate": 0.029495426602563296, + "loss": 0.8036, + "num_input_tokens_seen": 13596688, + "step": 23435 + }, + { + "epoch": 3.4912123920166813, + "grad_norm": 0.0211181640625, + "learning_rate": 0.029494925055621252, + "loss": 0.8137, + "num_input_tokens_seen": 13599792, + "step": 23440 + }, + { + "epoch": 3.491957104557641, + "grad_norm": 0.0264892578125, + "learning_rate": 0.029494423263802177, + "loss": 0.8135, + "num_input_tokens_seen": 13602864, + "step": 23445 + }, + { + "epoch": 3.4927018170985997, + "grad_norm": 0.017822265625, + "learning_rate": 0.029493921227114545, + "loss": 0.8013, + "num_input_tokens_seen": 13605584, + "step": 23450 + }, + { + "epoch": 3.4934465296395594, + "grad_norm": 0.01239013671875, + "learning_rate": 0.029493418945566854, + "loss": 0.7996, + "num_input_tokens_seen": 13608464, + "step": 23455 + }, + { + "epoch": 3.494191242180518, + "grad_norm": 0.013916015625, + "learning_rate": 0.02949291641916758, + "loss": 0.7974, + "num_input_tokens_seen": 13611472, + "step": 23460 + }, + { + "epoch": 3.4949359547214773, + "grad_norm": 0.0250244140625, + "learning_rate": 0.02949241364792521, + "loss": 0.7939, + "num_input_tokens_seen": 13614608, + "step": 23465 + }, + { + "epoch": 3.4956806672624365, + "grad_norm": 0.0274658203125, + "learning_rate": 0.029491910631848244, + "loss": 0.7909, + "num_input_tokens_seen": 13617520, + "step": 23470 + }, + { + "epoch": 3.4964253798033957, + "grad_norm": 0.01953125, + "learning_rate": 0.02949140737094517, + "loss": 0.7937, + "num_input_tokens_seen": 13620784, + "step": 23475 + }, + { + "epoch": 3.497170092344355, + "grad_norm": 0.023681640625, + "learning_rate": 0.029490903865224502, + "loss": 0.8057, + "num_input_tokens_seen": 13623696, + "step": 23480 + }, + { + "epoch": 3.497914804885314, + "grad_norm": 0.03271484375, + "learning_rate": 0.02949040011469474, + "loss": 0.7919, + "num_input_tokens_seen": 13626640, + "step": 23485 + }, + { + "epoch": 3.4986595174262733, + "grad_norm": 0.022216796875, + "learning_rate": 0.029489896119364395, + "loss": 0.7987, + "num_input_tokens_seen": 13629584, + "step": 23490 + }, + { + "epoch": 3.4994042299672325, + "grad_norm": 0.023681640625, + "learning_rate": 0.029489391879241985, + "loss": 0.8026, + "num_input_tokens_seen": 13632560, + "step": 23495 + }, + { + "epoch": 3.5001489425081918, + "grad_norm": 0.011962890625, + "learning_rate": 0.02948888739433602, + "loss": 0.8093, + "num_input_tokens_seen": 13635376, + "step": 23500 + }, + { + "epoch": 3.500893655049151, + "grad_norm": 0.0118408203125, + "learning_rate": 0.029488382664655036, + "loss": 0.7888, + "num_input_tokens_seen": 13638128, + "step": 23505 + }, + { + "epoch": 3.50163836759011, + "grad_norm": 0.02783203125, + "learning_rate": 0.029487877690207547, + "loss": 0.814, + "num_input_tokens_seen": 13640848, + "step": 23510 + }, + { + "epoch": 3.5023830801310694, + "grad_norm": 0.021484375, + "learning_rate": 0.02948737247100209, + "loss": 0.7856, + "num_input_tokens_seen": 13643888, + "step": 23515 + }, + { + "epoch": 3.5031277926720286, + "grad_norm": 0.02099609375, + "learning_rate": 0.0294868670070472, + "loss": 0.8023, + "num_input_tokens_seen": 13646864, + "step": 23520 + }, + { + "epoch": 3.5038725052129878, + "grad_norm": 0.0157470703125, + "learning_rate": 0.029486361298351416, + "loss": 0.8072, + "num_input_tokens_seen": 13649872, + "step": 23525 + }, + { + "epoch": 3.504617217753947, + "grad_norm": 0.013427734375, + "learning_rate": 0.02948585534492328, + "loss": 0.8098, + "num_input_tokens_seen": 13652560, + "step": 23530 + }, + { + "epoch": 3.505361930294906, + "grad_norm": 0.0286865234375, + "learning_rate": 0.02948534914677134, + "loss": 0.7986, + "num_input_tokens_seen": 13656048, + "step": 23535 + }, + { + "epoch": 3.5061066428358654, + "grad_norm": 0.01226806640625, + "learning_rate": 0.02948484270390415, + "loss": 0.786, + "num_input_tokens_seen": 13658800, + "step": 23540 + }, + { + "epoch": 3.5068513553768246, + "grad_norm": 0.0194091796875, + "learning_rate": 0.029484336016330263, + "loss": 0.7893, + "num_input_tokens_seen": 13661584, + "step": 23545 + }, + { + "epoch": 3.5075960679177838, + "grad_norm": 0.0167236328125, + "learning_rate": 0.029483829084058243, + "loss": 0.8172, + "num_input_tokens_seen": 13664464, + "step": 23550 + }, + { + "epoch": 3.508340780458743, + "grad_norm": 0.0179443359375, + "learning_rate": 0.029483321907096648, + "loss": 0.8267, + "num_input_tokens_seen": 13667312, + "step": 23555 + }, + { + "epoch": 3.509085492999702, + "grad_norm": 0.0311279296875, + "learning_rate": 0.02948281448545405, + "loss": 0.8078, + "num_input_tokens_seen": 13670224, + "step": 23560 + }, + { + "epoch": 3.5098302055406614, + "grad_norm": 0.0252685546875, + "learning_rate": 0.029482306819139024, + "loss": 0.7798, + "num_input_tokens_seen": 13673008, + "step": 23565 + }, + { + "epoch": 3.5105749180816206, + "grad_norm": 0.01300048828125, + "learning_rate": 0.029481798908160144, + "loss": 0.7956, + "num_input_tokens_seen": 13676048, + "step": 23570 + }, + { + "epoch": 3.51131963062258, + "grad_norm": 0.0184326171875, + "learning_rate": 0.029481290752525986, + "loss": 0.8062, + "num_input_tokens_seen": 13678960, + "step": 23575 + }, + { + "epoch": 3.512064343163539, + "grad_norm": 0.0224609375, + "learning_rate": 0.029480782352245136, + "loss": 0.8229, + "num_input_tokens_seen": 13682160, + "step": 23580 + }, + { + "epoch": 3.512809055704498, + "grad_norm": 0.01251220703125, + "learning_rate": 0.029480273707326192, + "loss": 0.7987, + "num_input_tokens_seen": 13685264, + "step": 23585 + }, + { + "epoch": 3.5135537682454574, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02947976481777774, + "loss": 0.7894, + "num_input_tokens_seen": 13688176, + "step": 23590 + }, + { + "epoch": 3.5142984807864166, + "grad_norm": 0.03564453125, + "learning_rate": 0.029479255683608374, + "loss": 0.787, + "num_input_tokens_seen": 13691088, + "step": 23595 + }, + { + "epoch": 3.515043193327376, + "grad_norm": 0.0244140625, + "learning_rate": 0.029478746304826706, + "loss": 0.8082, + "num_input_tokens_seen": 13693936, + "step": 23600 + }, + { + "epoch": 3.5157879058683346, + "grad_norm": 0.0289306640625, + "learning_rate": 0.02947823668144133, + "loss": 0.7978, + "num_input_tokens_seen": 13697008, + "step": 23605 + }, + { + "epoch": 3.516532618409294, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02947772681346086, + "loss": 0.7897, + "num_input_tokens_seen": 13699984, + "step": 23610 + }, + { + "epoch": 3.517277330950253, + "grad_norm": 0.033935546875, + "learning_rate": 0.029477216700893913, + "loss": 0.7993, + "num_input_tokens_seen": 13702768, + "step": 23615 + }, + { + "epoch": 3.5180220434912126, + "grad_norm": 0.0380859375, + "learning_rate": 0.0294767063437491, + "loss": 0.7944, + "num_input_tokens_seen": 13705744, + "step": 23620 + }, + { + "epoch": 3.5187667560321714, + "grad_norm": 0.02099609375, + "learning_rate": 0.02947619574203505, + "loss": 0.7679, + "num_input_tokens_seen": 13708464, + "step": 23625 + }, + { + "epoch": 3.519511468573131, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02947568489576038, + "loss": 0.7847, + "num_input_tokens_seen": 13711536, + "step": 23630 + }, + { + "epoch": 3.5202561811140898, + "grad_norm": 0.016357421875, + "learning_rate": 0.029475173804933735, + "loss": 0.7996, + "num_input_tokens_seen": 13714512, + "step": 23635 + }, + { + "epoch": 3.5210008936550494, + "grad_norm": 0.01336669921875, + "learning_rate": 0.029474662469563737, + "loss": 0.826, + "num_input_tokens_seen": 13717360, + "step": 23640 + }, + { + "epoch": 3.521745606196008, + "grad_norm": 0.028076171875, + "learning_rate": 0.029474150889659025, + "loss": 0.8373, + "num_input_tokens_seen": 13720432, + "step": 23645 + }, + { + "epoch": 3.5224903187369674, + "grad_norm": 0.0205078125, + "learning_rate": 0.029473639065228246, + "loss": 0.8327, + "num_input_tokens_seen": 13723536, + "step": 23650 + }, + { + "epoch": 3.5232350312779266, + "grad_norm": 0.0390625, + "learning_rate": 0.02947312699628005, + "loss": 0.7899, + "num_input_tokens_seen": 13726320, + "step": 23655 + }, + { + "epoch": 3.523979743818886, + "grad_norm": 0.03564453125, + "learning_rate": 0.029472614682823077, + "loss": 0.8161, + "num_input_tokens_seen": 13728944, + "step": 23660 + }, + { + "epoch": 3.524724456359845, + "grad_norm": 0.015625, + "learning_rate": 0.029472102124865993, + "loss": 0.8015, + "num_input_tokens_seen": 13731888, + "step": 23665 + }, + { + "epoch": 3.525469168900804, + "grad_norm": 0.01409912109375, + "learning_rate": 0.02947158932241745, + "loss": 0.8035, + "num_input_tokens_seen": 13735024, + "step": 23670 + }, + { + "epoch": 3.5262138814417634, + "grad_norm": 0.021728515625, + "learning_rate": 0.029471076275486112, + "loss": 0.7996, + "num_input_tokens_seen": 13737840, + "step": 23675 + }, + { + "epoch": 3.5269585939827226, + "grad_norm": 0.0230712890625, + "learning_rate": 0.029470562984080653, + "loss": 0.777, + "num_input_tokens_seen": 13740368, + "step": 23680 + }, + { + "epoch": 3.527703306523682, + "grad_norm": 0.04638671875, + "learning_rate": 0.02947004944820973, + "loss": 0.8198, + "num_input_tokens_seen": 13743536, + "step": 23685 + }, + { + "epoch": 3.528448019064641, + "grad_norm": 0.023681640625, + "learning_rate": 0.029469535667882036, + "loss": 0.798, + "num_input_tokens_seen": 13746448, + "step": 23690 + }, + { + "epoch": 3.5291927316056, + "grad_norm": 0.0267333984375, + "learning_rate": 0.029469021643106243, + "loss": 0.7996, + "num_input_tokens_seen": 13749232, + "step": 23695 + }, + { + "epoch": 3.5299374441465594, + "grad_norm": 0.02490234375, + "learning_rate": 0.02946850737389103, + "loss": 0.8135, + "num_input_tokens_seen": 13752048, + "step": 23700 + }, + { + "epoch": 3.5306821566875186, + "grad_norm": 0.0211181640625, + "learning_rate": 0.029467992860245096, + "loss": 0.7957, + "num_input_tokens_seen": 13754832, + "step": 23705 + }, + { + "epoch": 3.531426869228478, + "grad_norm": 0.0196533203125, + "learning_rate": 0.029467478102177126, + "loss": 0.8092, + "num_input_tokens_seen": 13757744, + "step": 23710 + }, + { + "epoch": 3.532171581769437, + "grad_norm": 0.014404296875, + "learning_rate": 0.02946696309969582, + "loss": 0.7948, + "num_input_tokens_seen": 13760976, + "step": 23715 + }, + { + "epoch": 3.532916294310396, + "grad_norm": 0.02685546875, + "learning_rate": 0.029466447852809868, + "loss": 0.7879, + "num_input_tokens_seen": 13763920, + "step": 23720 + }, + { + "epoch": 3.5336610068513554, + "grad_norm": 0.02587890625, + "learning_rate": 0.02946593236152799, + "loss": 0.8024, + "num_input_tokens_seen": 13766448, + "step": 23725 + }, + { + "epoch": 3.5344057193923146, + "grad_norm": 0.031982421875, + "learning_rate": 0.029465416625858883, + "loss": 0.7988, + "num_input_tokens_seen": 13769808, + "step": 23730 + }, + { + "epoch": 3.535150431933274, + "grad_norm": 0.0155029296875, + "learning_rate": 0.029464900645811264, + "loss": 0.809, + "num_input_tokens_seen": 13772752, + "step": 23735 + }, + { + "epoch": 3.535895144474233, + "grad_norm": 0.03173828125, + "learning_rate": 0.029464384421393854, + "loss": 0.7914, + "num_input_tokens_seen": 13775728, + "step": 23740 + }, + { + "epoch": 3.5366398570151922, + "grad_norm": 0.0245361328125, + "learning_rate": 0.029463867952615366, + "loss": 0.7891, + "num_input_tokens_seen": 13778992, + "step": 23745 + }, + { + "epoch": 3.5373845695561514, + "grad_norm": 0.033935546875, + "learning_rate": 0.029463351239484528, + "loss": 0.8167, + "num_input_tokens_seen": 13782128, + "step": 23750 + }, + { + "epoch": 3.5381292820971106, + "grad_norm": 0.028076171875, + "learning_rate": 0.02946283428201008, + "loss": 0.8199, + "num_input_tokens_seen": 13784976, + "step": 23755 + }, + { + "epoch": 3.53887399463807, + "grad_norm": 0.032470703125, + "learning_rate": 0.029462317080200737, + "loss": 0.8208, + "num_input_tokens_seen": 13787824, + "step": 23760 + }, + { + "epoch": 3.539618707179029, + "grad_norm": 0.02880859375, + "learning_rate": 0.02946179963406525, + "loss": 0.8106, + "num_input_tokens_seen": 13790704, + "step": 23765 + }, + { + "epoch": 3.5403634197199882, + "grad_norm": 0.0196533203125, + "learning_rate": 0.029461281943612354, + "loss": 0.7919, + "num_input_tokens_seen": 13793744, + "step": 23770 + }, + { + "epoch": 3.5411081322609474, + "grad_norm": 0.0218505859375, + "learning_rate": 0.029460764008850795, + "loss": 0.7937, + "num_input_tokens_seen": 13796688, + "step": 23775 + }, + { + "epoch": 3.541852844801906, + "grad_norm": 0.0419921875, + "learning_rate": 0.02946024582978933, + "loss": 0.8211, + "num_input_tokens_seen": 13799664, + "step": 23780 + }, + { + "epoch": 3.542597557342866, + "grad_norm": 0.02197265625, + "learning_rate": 0.02945972740643671, + "loss": 0.7926, + "num_input_tokens_seen": 13802352, + "step": 23785 + }, + { + "epoch": 3.5433422698838246, + "grad_norm": 0.024169921875, + "learning_rate": 0.029459208738801687, + "loss": 0.7913, + "num_input_tokens_seen": 13804976, + "step": 23790 + }, + { + "epoch": 3.5440869824247843, + "grad_norm": 0.026123046875, + "learning_rate": 0.029458689826893033, + "loss": 0.7813, + "num_input_tokens_seen": 13808048, + "step": 23795 + }, + { + "epoch": 3.544831694965743, + "grad_norm": 0.0260009765625, + "learning_rate": 0.029458170670719503, + "loss": 0.7858, + "num_input_tokens_seen": 13810928, + "step": 23800 + }, + { + "epoch": 3.5455764075067027, + "grad_norm": 0.029541015625, + "learning_rate": 0.02945765127028988, + "loss": 0.8008, + "num_input_tokens_seen": 13813808, + "step": 23805 + }, + { + "epoch": 3.5463211200476614, + "grad_norm": 0.021728515625, + "learning_rate": 0.029457131625612927, + "loss": 0.7885, + "num_input_tokens_seen": 13816720, + "step": 23810 + }, + { + "epoch": 3.5470658325886206, + "grad_norm": 0.0228271484375, + "learning_rate": 0.029456611736697436, + "loss": 0.8072, + "num_input_tokens_seen": 13819504, + "step": 23815 + }, + { + "epoch": 3.54781054512958, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02945609160355218, + "loss": 0.8057, + "num_input_tokens_seen": 13822672, + "step": 23820 + }, + { + "epoch": 3.548555257670539, + "grad_norm": 0.0147705078125, + "learning_rate": 0.029455571226185945, + "loss": 0.7748, + "num_input_tokens_seen": 13825520, + "step": 23825 + }, + { + "epoch": 3.5492999702114982, + "grad_norm": 0.02978515625, + "learning_rate": 0.02945505060460753, + "loss": 0.777, + "num_input_tokens_seen": 13828432, + "step": 23830 + }, + { + "epoch": 3.5500446827524574, + "grad_norm": 0.0263671875, + "learning_rate": 0.029454529738825723, + "loss": 0.7931, + "num_input_tokens_seen": 13831696, + "step": 23835 + }, + { + "epoch": 3.5507893952934166, + "grad_norm": 0.0267333984375, + "learning_rate": 0.029454008628849334, + "loss": 0.7935, + "num_input_tokens_seen": 13834640, + "step": 23840 + }, + { + "epoch": 3.551534107834376, + "grad_norm": 0.0255126953125, + "learning_rate": 0.029453487274687152, + "loss": 0.7839, + "num_input_tokens_seen": 13837808, + "step": 23845 + }, + { + "epoch": 3.552278820375335, + "grad_norm": 0.0284423828125, + "learning_rate": 0.029452965676348, + "loss": 0.8098, + "num_input_tokens_seen": 13840816, + "step": 23850 + }, + { + "epoch": 3.5530235329162942, + "grad_norm": 0.030029296875, + "learning_rate": 0.029452443833840675, + "loss": 0.8027, + "num_input_tokens_seen": 13843632, + "step": 23855 + }, + { + "epoch": 3.5537682454572534, + "grad_norm": 0.046630859375, + "learning_rate": 0.029451921747174002, + "loss": 0.8315, + "num_input_tokens_seen": 13846640, + "step": 23860 + }, + { + "epoch": 3.5545129579982127, + "grad_norm": 0.015380859375, + "learning_rate": 0.029451399416356803, + "loss": 0.767, + "num_input_tokens_seen": 13849424, + "step": 23865 + }, + { + "epoch": 3.555257670539172, + "grad_norm": 0.0184326171875, + "learning_rate": 0.029450876841397897, + "loss": 0.7894, + "num_input_tokens_seen": 13852176, + "step": 23870 + }, + { + "epoch": 3.556002383080131, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02945035402230611, + "loss": 0.8129, + "num_input_tokens_seen": 13855216, + "step": 23875 + }, + { + "epoch": 3.5567470956210903, + "grad_norm": 0.0230712890625, + "learning_rate": 0.029449830959090283, + "loss": 0.8197, + "num_input_tokens_seen": 13858096, + "step": 23880 + }, + { + "epoch": 3.5574918081620495, + "grad_norm": 0.0201416015625, + "learning_rate": 0.029449307651759245, + "loss": 0.7664, + "num_input_tokens_seen": 13861328, + "step": 23885 + }, + { + "epoch": 3.5582365207030087, + "grad_norm": 0.03759765625, + "learning_rate": 0.02944878410032184, + "loss": 0.8244, + "num_input_tokens_seen": 13864240, + "step": 23890 + }, + { + "epoch": 3.558981233243968, + "grad_norm": 0.011474609375, + "learning_rate": 0.029448260304786915, + "loss": 0.7934, + "num_input_tokens_seen": 13866992, + "step": 23895 + }, + { + "epoch": 3.559725945784927, + "grad_norm": 0.026123046875, + "learning_rate": 0.029447736265163317, + "loss": 0.7876, + "num_input_tokens_seen": 13869712, + "step": 23900 + }, + { + "epoch": 3.5604706583258863, + "grad_norm": 0.021484375, + "learning_rate": 0.0294472119814599, + "loss": 0.7935, + "num_input_tokens_seen": 13872528, + "step": 23905 + }, + { + "epoch": 3.5612153708668455, + "grad_norm": 0.03857421875, + "learning_rate": 0.02944668745368552, + "loss": 0.8275, + "num_input_tokens_seen": 13875184, + "step": 23910 + }, + { + "epoch": 3.5619600834078047, + "grad_norm": 0.0228271484375, + "learning_rate": 0.029446162681849034, + "loss": 0.7768, + "num_input_tokens_seen": 13878096, + "step": 23915 + }, + { + "epoch": 3.562704795948764, + "grad_norm": 0.0301513671875, + "learning_rate": 0.02944563766595931, + "loss": 0.8169, + "num_input_tokens_seen": 13881008, + "step": 23920 + }, + { + "epoch": 3.563449508489723, + "grad_norm": 0.03125, + "learning_rate": 0.029445112406025223, + "loss": 0.8075, + "num_input_tokens_seen": 13883824, + "step": 23925 + }, + { + "epoch": 3.5641942210306823, + "grad_norm": 0.048095703125, + "learning_rate": 0.029444586902055644, + "loss": 0.8, + "num_input_tokens_seen": 13886384, + "step": 23930 + }, + { + "epoch": 3.5649389335716415, + "grad_norm": 0.01953125, + "learning_rate": 0.02944406115405945, + "loss": 0.787, + "num_input_tokens_seen": 13889360, + "step": 23935 + }, + { + "epoch": 3.5656836461126007, + "grad_norm": 0.0250244140625, + "learning_rate": 0.029443535162045526, + "loss": 0.8098, + "num_input_tokens_seen": 13892592, + "step": 23940 + }, + { + "epoch": 3.5664283586535594, + "grad_norm": 0.02001953125, + "learning_rate": 0.029443008926022748, + "loss": 0.8104, + "num_input_tokens_seen": 13895632, + "step": 23945 + }, + { + "epoch": 3.567173071194519, + "grad_norm": 0.01806640625, + "learning_rate": 0.02944248244600002, + "loss": 0.814, + "num_input_tokens_seen": 13898800, + "step": 23950 + }, + { + "epoch": 3.567917783735478, + "grad_norm": 0.0296630859375, + "learning_rate": 0.029441955721986227, + "loss": 0.8105, + "num_input_tokens_seen": 13901680, + "step": 23955 + }, + { + "epoch": 3.5686624962764375, + "grad_norm": 0.0299072265625, + "learning_rate": 0.02944142875399027, + "loss": 0.818, + "num_input_tokens_seen": 13904464, + "step": 23960 + }, + { + "epoch": 3.5694072088173963, + "grad_norm": 0.0159912109375, + "learning_rate": 0.02944090154202105, + "loss": 0.801, + "num_input_tokens_seen": 13907280, + "step": 23965 + }, + { + "epoch": 3.570151921358356, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02944037408608748, + "loss": 0.7817, + "num_input_tokens_seen": 13910064, + "step": 23970 + }, + { + "epoch": 3.5708966338993147, + "grad_norm": 0.039794921875, + "learning_rate": 0.029439846386198464, + "loss": 0.8205, + "num_input_tokens_seen": 13913072, + "step": 23975 + }, + { + "epoch": 3.5716413464402743, + "grad_norm": 0.020751953125, + "learning_rate": 0.02943931844236292, + "loss": 0.8033, + "num_input_tokens_seen": 13915824, + "step": 23980 + }, + { + "epoch": 3.572386058981233, + "grad_norm": 0.0233154296875, + "learning_rate": 0.02943879025458977, + "loss": 0.7958, + "num_input_tokens_seen": 13919152, + "step": 23985 + }, + { + "epoch": 3.5731307715221923, + "grad_norm": 0.0203857421875, + "learning_rate": 0.029438261822887925, + "loss": 0.8069, + "num_input_tokens_seen": 13921840, + "step": 23990 + }, + { + "epoch": 3.5738754840631515, + "grad_norm": 0.031494140625, + "learning_rate": 0.029437733147266328, + "loss": 0.8043, + "num_input_tokens_seen": 13924496, + "step": 23995 + }, + { + "epoch": 3.5746201966041107, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0294372042277339, + "loss": 0.8132, + "num_input_tokens_seen": 13927408, + "step": 24000 + }, + { + "epoch": 3.57536490914507, + "grad_norm": 0.041015625, + "learning_rate": 0.029436675064299582, + "loss": 0.8039, + "num_input_tokens_seen": 13930224, + "step": 24005 + }, + { + "epoch": 3.576109621686029, + "grad_norm": 0.021484375, + "learning_rate": 0.029436145656972307, + "loss": 0.7995, + "num_input_tokens_seen": 13933072, + "step": 24010 + }, + { + "epoch": 3.5768543342269883, + "grad_norm": 0.03125, + "learning_rate": 0.02943561600576103, + "loss": 0.7889, + "num_input_tokens_seen": 13936208, + "step": 24015 + }, + { + "epoch": 3.5775990467679475, + "grad_norm": 0.0264892578125, + "learning_rate": 0.029435086110674684, + "loss": 0.8021, + "num_input_tokens_seen": 13939216, + "step": 24020 + }, + { + "epoch": 3.5783437593089067, + "grad_norm": 0.0233154296875, + "learning_rate": 0.029434555971722235, + "loss": 0.8154, + "num_input_tokens_seen": 13942384, + "step": 24025 + }, + { + "epoch": 3.579088471849866, + "grad_norm": 0.0234375, + "learning_rate": 0.029434025588912636, + "loss": 0.7915, + "num_input_tokens_seen": 13945136, + "step": 24030 + }, + { + "epoch": 3.579833184390825, + "grad_norm": 0.0308837890625, + "learning_rate": 0.02943349496225484, + "loss": 0.7851, + "num_input_tokens_seen": 13947728, + "step": 24035 + }, + { + "epoch": 3.5805778969317843, + "grad_norm": 0.0272216796875, + "learning_rate": 0.02943296409175782, + "loss": 0.7973, + "num_input_tokens_seen": 13950672, + "step": 24040 + }, + { + "epoch": 3.5813226094727435, + "grad_norm": 0.0228271484375, + "learning_rate": 0.029432432977430537, + "loss": 0.7948, + "num_input_tokens_seen": 13953648, + "step": 24045 + }, + { + "epoch": 3.5820673220137027, + "grad_norm": 0.031494140625, + "learning_rate": 0.02943190161928197, + "loss": 0.8014, + "num_input_tokens_seen": 13956560, + "step": 24050 + }, + { + "epoch": 3.582812034554662, + "grad_norm": 0.028076171875, + "learning_rate": 0.02943137001732109, + "loss": 0.8029, + "num_input_tokens_seen": 13959504, + "step": 24055 + }, + { + "epoch": 3.583556747095621, + "grad_norm": 0.033447265625, + "learning_rate": 0.02943083817155688, + "loss": 0.8136, + "num_input_tokens_seen": 13962384, + "step": 24060 + }, + { + "epoch": 3.5843014596365803, + "grad_norm": 0.029296875, + "learning_rate": 0.029430306081998328, + "loss": 0.7951, + "num_input_tokens_seen": 13965200, + "step": 24065 + }, + { + "epoch": 3.5850461721775395, + "grad_norm": 0.0234375, + "learning_rate": 0.029429773748654423, + "loss": 0.7869, + "num_input_tokens_seen": 13967952, + "step": 24070 + }, + { + "epoch": 3.5857908847184987, + "grad_norm": 0.03369140625, + "learning_rate": 0.029429241171534157, + "loss": 0.8303, + "num_input_tokens_seen": 13970928, + "step": 24075 + }, + { + "epoch": 3.586535597259458, + "grad_norm": 0.0302734375, + "learning_rate": 0.029428708350646524, + "loss": 0.792, + "num_input_tokens_seen": 13973872, + "step": 24080 + }, + { + "epoch": 3.587280309800417, + "grad_norm": 0.01361083984375, + "learning_rate": 0.02942817528600053, + "loss": 0.7695, + "num_input_tokens_seen": 13976592, + "step": 24085 + }, + { + "epoch": 3.5880250223413763, + "grad_norm": 0.023193359375, + "learning_rate": 0.02942764197760518, + "loss": 0.8002, + "num_input_tokens_seen": 13979472, + "step": 24090 + }, + { + "epoch": 3.5887697348823355, + "grad_norm": 0.02197265625, + "learning_rate": 0.02942710842546948, + "loss": 0.8137, + "num_input_tokens_seen": 13982288, + "step": 24095 + }, + { + "epoch": 3.5895144474232947, + "grad_norm": 0.0250244140625, + "learning_rate": 0.029426574629602444, + "loss": 0.7903, + "num_input_tokens_seen": 13985136, + "step": 24100 + }, + { + "epoch": 3.590259159964254, + "grad_norm": 0.01434326171875, + "learning_rate": 0.029426040590013094, + "loss": 0.7903, + "num_input_tokens_seen": 13988016, + "step": 24105 + }, + { + "epoch": 3.591003872505213, + "grad_norm": 0.052978515625, + "learning_rate": 0.029425506306710446, + "loss": 2.8683, + "num_input_tokens_seen": 13991120, + "step": 24110 + }, + { + "epoch": 3.5917485850461723, + "grad_norm": 0.057373046875, + "learning_rate": 0.029424971779703536, + "loss": 0.7865, + "num_input_tokens_seen": 13993968, + "step": 24115 + }, + { + "epoch": 3.592493297587131, + "grad_norm": 0.032470703125, + "learning_rate": 0.029424437009001388, + "loss": 0.8088, + "num_input_tokens_seen": 13996752, + "step": 24120 + }, + { + "epoch": 3.5932380101280907, + "grad_norm": 0.04345703125, + "learning_rate": 0.029423901994613037, + "loss": 0.7727, + "num_input_tokens_seen": 13999696, + "step": 24125 + }, + { + "epoch": 3.5939827226690495, + "grad_norm": 0.08642578125, + "learning_rate": 0.02942336673654752, + "loss": 0.7795, + "num_input_tokens_seen": 14002832, + "step": 24130 + }, + { + "epoch": 3.594727435210009, + "grad_norm": 0.050048828125, + "learning_rate": 0.029422831234813884, + "loss": 0.8163, + "num_input_tokens_seen": 14005712, + "step": 24135 + }, + { + "epoch": 3.595472147750968, + "grad_norm": 0.032470703125, + "learning_rate": 0.02942229548942117, + "loss": 1.2877, + "num_input_tokens_seen": 14009040, + "step": 24140 + }, + { + "epoch": 3.5962168602919276, + "grad_norm": 0.038818359375, + "learning_rate": 0.029421759500378432, + "loss": 0.7919, + "num_input_tokens_seen": 14011600, + "step": 24145 + }, + { + "epoch": 3.5969615728328863, + "grad_norm": 0.06591796875, + "learning_rate": 0.029421223267694727, + "loss": 0.8241, + "num_input_tokens_seen": 14014416, + "step": 24150 + }, + { + "epoch": 3.597706285373846, + "grad_norm": 0.04638671875, + "learning_rate": 0.02942068679137911, + "loss": 0.8132, + "num_input_tokens_seen": 14017104, + "step": 24155 + }, + { + "epoch": 3.5984509979148047, + "grad_norm": 0.053955078125, + "learning_rate": 0.02942015007144065, + "loss": 0.8336, + "num_input_tokens_seen": 14019856, + "step": 24160 + }, + { + "epoch": 3.599195710455764, + "grad_norm": 0.048583984375, + "learning_rate": 0.029419613107888407, + "loss": 0.8183, + "num_input_tokens_seen": 14022608, + "step": 24165 + }, + { + "epoch": 3.599940422996723, + "grad_norm": 0.033447265625, + "learning_rate": 0.029419075900731458, + "loss": 0.7901, + "num_input_tokens_seen": 14025392, + "step": 24170 + }, + { + "epoch": 3.6006851355376823, + "grad_norm": 0.023193359375, + "learning_rate": 0.029418538449978875, + "loss": 0.8087, + "num_input_tokens_seen": 14028208, + "step": 24175 + }, + { + "epoch": 3.6014298480786415, + "grad_norm": 0.046142578125, + "learning_rate": 0.029418000755639738, + "loss": 0.8157, + "num_input_tokens_seen": 14031024, + "step": 24180 + }, + { + "epoch": 3.6021745606196007, + "grad_norm": 0.02978515625, + "learning_rate": 0.02941746281772313, + "loss": 0.7826, + "num_input_tokens_seen": 14033936, + "step": 24185 + }, + { + "epoch": 3.60291927316056, + "grad_norm": 0.0223388671875, + "learning_rate": 0.029416924636238146, + "loss": 0.7982, + "num_input_tokens_seen": 14036976, + "step": 24190 + }, + { + "epoch": 3.603663985701519, + "grad_norm": 0.0286865234375, + "learning_rate": 0.02941638621119387, + "loss": 0.7597, + "num_input_tokens_seen": 14039920, + "step": 24195 + }, + { + "epoch": 3.6044086982424783, + "grad_norm": 0.0341796875, + "learning_rate": 0.029415847542599405, + "loss": 0.821, + "num_input_tokens_seen": 14042896, + "step": 24200 + }, + { + "epoch": 3.6051534107834375, + "grad_norm": 0.042724609375, + "learning_rate": 0.029415308630463845, + "loss": 0.7829, + "num_input_tokens_seen": 14045616, + "step": 24205 + }, + { + "epoch": 3.6058981233243967, + "grad_norm": 0.015869140625, + "learning_rate": 0.029414769474796296, + "loss": 0.79, + "num_input_tokens_seen": 14048592, + "step": 24210 + }, + { + "epoch": 3.606642835865356, + "grad_norm": 0.028076171875, + "learning_rate": 0.029414230075605872, + "loss": 0.8429, + "num_input_tokens_seen": 14051568, + "step": 24215 + }, + { + "epoch": 3.607387548406315, + "grad_norm": 0.04296875, + "learning_rate": 0.029413690432901676, + "loss": 0.8213, + "num_input_tokens_seen": 14054448, + "step": 24220 + }, + { + "epoch": 3.6081322609472744, + "grad_norm": 0.039794921875, + "learning_rate": 0.02941315054669283, + "loss": 0.8068, + "num_input_tokens_seen": 14057488, + "step": 24225 + }, + { + "epoch": 3.6088769734882336, + "grad_norm": 0.03076171875, + "learning_rate": 0.029412610416988453, + "loss": 0.8098, + "num_input_tokens_seen": 14060400, + "step": 24230 + }, + { + "epoch": 3.6096216860291928, + "grad_norm": 0.028564453125, + "learning_rate": 0.029412070043797672, + "loss": 0.8084, + "num_input_tokens_seen": 14063152, + "step": 24235 + }, + { + "epoch": 3.610366398570152, + "grad_norm": 0.043212890625, + "learning_rate": 0.029411529427129615, + "loss": 0.7852, + "num_input_tokens_seen": 14066288, + "step": 24240 + }, + { + "epoch": 3.611111111111111, + "grad_norm": 0.0294189453125, + "learning_rate": 0.02941098856699342, + "loss": 0.7801, + "num_input_tokens_seen": 14069168, + "step": 24245 + }, + { + "epoch": 3.6118558236520704, + "grad_norm": 0.037841796875, + "learning_rate": 0.029410447463398215, + "loss": 0.806, + "num_input_tokens_seen": 14072176, + "step": 24250 + }, + { + "epoch": 3.6126005361930296, + "grad_norm": 0.0322265625, + "learning_rate": 0.02940990611635315, + "loss": 0.8097, + "num_input_tokens_seen": 14075056, + "step": 24255 + }, + { + "epoch": 3.6133452487339888, + "grad_norm": 0.04248046875, + "learning_rate": 0.02940936452586736, + "loss": 0.7913, + "num_input_tokens_seen": 14078224, + "step": 24260 + }, + { + "epoch": 3.614089961274948, + "grad_norm": 0.037109375, + "learning_rate": 0.02940882269195001, + "loss": 0.8181, + "num_input_tokens_seen": 14081360, + "step": 24265 + }, + { + "epoch": 3.614834673815907, + "grad_norm": 0.0390625, + "learning_rate": 0.02940828061461024, + "loss": 0.7879, + "num_input_tokens_seen": 14084336, + "step": 24270 + }, + { + "epoch": 3.6155793863568664, + "grad_norm": 0.04150390625, + "learning_rate": 0.029407738293857216, + "loss": 0.8022, + "num_input_tokens_seen": 14087312, + "step": 24275 + }, + { + "epoch": 3.6163240988978256, + "grad_norm": 0.01519775390625, + "learning_rate": 0.029407195729700093, + "loss": 0.809, + "num_input_tokens_seen": 14090128, + "step": 24280 + }, + { + "epoch": 3.617068811438785, + "grad_norm": 0.0206298828125, + "learning_rate": 0.029406652922148044, + "loss": 0.8297, + "num_input_tokens_seen": 14092656, + "step": 24285 + }, + { + "epoch": 3.617813523979744, + "grad_norm": 0.0262451171875, + "learning_rate": 0.029406109871210236, + "loss": 0.7836, + "num_input_tokens_seen": 14095664, + "step": 24290 + }, + { + "epoch": 3.6185582365207027, + "grad_norm": 0.019775390625, + "learning_rate": 0.029405566576895845, + "loss": 0.8031, + "num_input_tokens_seen": 14098800, + "step": 24295 + }, + { + "epoch": 3.6193029490616624, + "grad_norm": 0.026611328125, + "learning_rate": 0.02940502303921404, + "loss": 0.8316, + "num_input_tokens_seen": 14101584, + "step": 24300 + }, + { + "epoch": 3.620047661602621, + "grad_norm": 0.03271484375, + "learning_rate": 0.029404479258174018, + "loss": 0.7847, + "num_input_tokens_seen": 14104528, + "step": 24305 + }, + { + "epoch": 3.620792374143581, + "grad_norm": 0.033447265625, + "learning_rate": 0.02940393523378496, + "loss": 0.8128, + "num_input_tokens_seen": 14107184, + "step": 24310 + }, + { + "epoch": 3.6215370866845396, + "grad_norm": 0.033203125, + "learning_rate": 0.029403390966056056, + "loss": 0.8133, + "num_input_tokens_seen": 14110128, + "step": 24315 + }, + { + "epoch": 3.622281799225499, + "grad_norm": 0.0255126953125, + "learning_rate": 0.02940284645499649, + "loss": 0.7957, + "num_input_tokens_seen": 14112688, + "step": 24320 + }, + { + "epoch": 3.623026511766458, + "grad_norm": 0.052978515625, + "learning_rate": 0.02940230170061548, + "loss": 0.8281, + "num_input_tokens_seen": 14115536, + "step": 24325 + }, + { + "epoch": 3.6237712243074176, + "grad_norm": 0.016845703125, + "learning_rate": 0.029401756702922225, + "loss": 0.8039, + "num_input_tokens_seen": 14118256, + "step": 24330 + }, + { + "epoch": 3.6245159368483764, + "grad_norm": 0.032958984375, + "learning_rate": 0.02940121146192592, + "loss": 0.8159, + "num_input_tokens_seen": 14121200, + "step": 24335 + }, + { + "epoch": 3.6252606493893356, + "grad_norm": 0.031982421875, + "learning_rate": 0.02940066597763579, + "loss": 0.8135, + "num_input_tokens_seen": 14124144, + "step": 24340 + }, + { + "epoch": 3.6260053619302948, + "grad_norm": 0.02685546875, + "learning_rate": 0.02940012025006104, + "loss": 0.8175, + "num_input_tokens_seen": 14127152, + "step": 24345 + }, + { + "epoch": 3.626750074471254, + "grad_norm": 0.037353515625, + "learning_rate": 0.0293995742792109, + "loss": 0.7894, + "num_input_tokens_seen": 14129872, + "step": 24350 + }, + { + "epoch": 3.627494787012213, + "grad_norm": 0.0252685546875, + "learning_rate": 0.02939902806509458, + "loss": 0.795, + "num_input_tokens_seen": 14132752, + "step": 24355 + }, + { + "epoch": 3.6282394995531724, + "grad_norm": 0.028076171875, + "learning_rate": 0.02939848160772132, + "loss": 0.8116, + "num_input_tokens_seen": 14135696, + "step": 24360 + }, + { + "epoch": 3.6289842120941316, + "grad_norm": 0.0211181640625, + "learning_rate": 0.029397934907100343, + "loss": 0.7977, + "num_input_tokens_seen": 14138512, + "step": 24365 + }, + { + "epoch": 3.629728924635091, + "grad_norm": 0.015380859375, + "learning_rate": 0.02939738796324089, + "loss": 0.8125, + "num_input_tokens_seen": 14141104, + "step": 24370 + }, + { + "epoch": 3.63047363717605, + "grad_norm": 0.035400390625, + "learning_rate": 0.029396840776152205, + "loss": 0.8196, + "num_input_tokens_seen": 14143952, + "step": 24375 + }, + { + "epoch": 3.631218349717009, + "grad_norm": 0.033935546875, + "learning_rate": 0.029396293345843524, + "loss": 0.8136, + "num_input_tokens_seen": 14146576, + "step": 24380 + }, + { + "epoch": 3.6319630622579684, + "grad_norm": 0.01495361328125, + "learning_rate": 0.0293957456723241, + "loss": 0.8027, + "num_input_tokens_seen": 14149392, + "step": 24385 + }, + { + "epoch": 3.6327077747989276, + "grad_norm": 0.04296875, + "learning_rate": 0.029395197755603186, + "loss": 0.7935, + "num_input_tokens_seen": 14152496, + "step": 24390 + }, + { + "epoch": 3.633452487339887, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02939464959569003, + "loss": 0.8309, + "num_input_tokens_seen": 14155312, + "step": 24395 + }, + { + "epoch": 3.634197199880846, + "grad_norm": 0.03662109375, + "learning_rate": 0.029394101192593905, + "loss": 0.7977, + "num_input_tokens_seen": 14158288, + "step": 24400 + }, + { + "epoch": 3.634941912421805, + "grad_norm": 0.020263671875, + "learning_rate": 0.02939355254632407, + "loss": 0.8069, + "num_input_tokens_seen": 14161296, + "step": 24405 + }, + { + "epoch": 3.6356866249627644, + "grad_norm": 0.0234375, + "learning_rate": 0.02939300365688979, + "loss": 0.7985, + "num_input_tokens_seen": 14163952, + "step": 24410 + }, + { + "epoch": 3.6364313375037236, + "grad_norm": 0.0234375, + "learning_rate": 0.029392454524300345, + "loss": 0.8161, + "num_input_tokens_seen": 14166768, + "step": 24415 + }, + { + "epoch": 3.637176050044683, + "grad_norm": 0.0203857421875, + "learning_rate": 0.029391905148565007, + "loss": 0.7965, + "num_input_tokens_seen": 14169648, + "step": 24420 + }, + { + "epoch": 3.637920762585642, + "grad_norm": 0.03662109375, + "learning_rate": 0.029391355529693054, + "loss": 0.8321, + "num_input_tokens_seen": 14172656, + "step": 24425 + }, + { + "epoch": 3.638665475126601, + "grad_norm": 0.0255126953125, + "learning_rate": 0.02939080566769378, + "loss": 0.8106, + "num_input_tokens_seen": 14175632, + "step": 24430 + }, + { + "epoch": 3.6394101876675604, + "grad_norm": 0.0238037109375, + "learning_rate": 0.029390255562576476, + "loss": 0.8082, + "num_input_tokens_seen": 14178416, + "step": 24435 + }, + { + "epoch": 3.6401549002085196, + "grad_norm": 0.029541015625, + "learning_rate": 0.02938970521435042, + "loss": 0.8075, + "num_input_tokens_seen": 14181424, + "step": 24440 + }, + { + "epoch": 3.640899612749479, + "grad_norm": 0.0250244140625, + "learning_rate": 0.029389154623024927, + "loss": 0.8236, + "num_input_tokens_seen": 14184272, + "step": 24445 + }, + { + "epoch": 3.641644325290438, + "grad_norm": 0.02880859375, + "learning_rate": 0.029388603788609288, + "loss": 0.7974, + "num_input_tokens_seen": 14187216, + "step": 24450 + }, + { + "epoch": 3.6423890378313972, + "grad_norm": 0.020751953125, + "learning_rate": 0.029388052711112816, + "loss": 0.8041, + "num_input_tokens_seen": 14190256, + "step": 24455 + }, + { + "epoch": 3.6431337503723564, + "grad_norm": 0.0238037109375, + "learning_rate": 0.029387501390544808, + "loss": 0.8029, + "num_input_tokens_seen": 14193360, + "step": 24460 + }, + { + "epoch": 3.6438784629133156, + "grad_norm": 0.013916015625, + "learning_rate": 0.029386949826914595, + "loss": 0.7892, + "num_input_tokens_seen": 14196240, + "step": 24465 + }, + { + "epoch": 3.6446231754542744, + "grad_norm": 0.0234375, + "learning_rate": 0.029386398020231484, + "loss": 0.8063, + "num_input_tokens_seen": 14199152, + "step": 24470 + }, + { + "epoch": 3.645367887995234, + "grad_norm": 0.01495361328125, + "learning_rate": 0.0293858459705048, + "loss": 0.8214, + "num_input_tokens_seen": 14202032, + "step": 24475 + }, + { + "epoch": 3.646112600536193, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02938529367774387, + "loss": 0.8182, + "num_input_tokens_seen": 14204816, + "step": 24480 + }, + { + "epoch": 3.6468573130771524, + "grad_norm": 0.0390625, + "learning_rate": 0.02938474114195802, + "loss": 0.8116, + "num_input_tokens_seen": 14207792, + "step": 24485 + }, + { + "epoch": 3.647602025618111, + "grad_norm": 0.021728515625, + "learning_rate": 0.029384188363156594, + "loss": 0.7932, + "num_input_tokens_seen": 14210992, + "step": 24490 + }, + { + "epoch": 3.648346738159071, + "grad_norm": 0.013427734375, + "learning_rate": 0.029383635341348923, + "loss": 0.8059, + "num_input_tokens_seen": 14213552, + "step": 24495 + }, + { + "epoch": 3.6490914507000296, + "grad_norm": 0.0235595703125, + "learning_rate": 0.029383082076544352, + "loss": 0.7995, + "num_input_tokens_seen": 14216368, + "step": 24500 + }, + { + "epoch": 3.6498361632409893, + "grad_norm": 0.040771484375, + "learning_rate": 0.029382528568752227, + "loss": 0.814, + "num_input_tokens_seen": 14219216, + "step": 24505 + }, + { + "epoch": 3.650580875781948, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0293819748179819, + "loss": 0.799, + "num_input_tokens_seen": 14222416, + "step": 24510 + }, + { + "epoch": 3.651325588322907, + "grad_norm": 0.0234375, + "learning_rate": 0.029381420824242728, + "loss": 0.8011, + "num_input_tokens_seen": 14225104, + "step": 24515 + }, + { + "epoch": 3.6520703008638664, + "grad_norm": 0.01556396484375, + "learning_rate": 0.02938086658754406, + "loss": 0.8126, + "num_input_tokens_seen": 14228144, + "step": 24520 + }, + { + "epoch": 3.6528150134048256, + "grad_norm": 0.024658203125, + "learning_rate": 0.029380312107895275, + "loss": 0.8047, + "num_input_tokens_seen": 14231056, + "step": 24525 + }, + { + "epoch": 3.653559725945785, + "grad_norm": 0.017333984375, + "learning_rate": 0.029379757385305728, + "loss": 0.7957, + "num_input_tokens_seen": 14234000, + "step": 24530 + }, + { + "epoch": 3.654304438486744, + "grad_norm": 0.034912109375, + "learning_rate": 0.029379202419784797, + "loss": 0.8119, + "num_input_tokens_seen": 14236912, + "step": 24535 + }, + { + "epoch": 3.6550491510277032, + "grad_norm": 0.0185546875, + "learning_rate": 0.029378647211341855, + "loss": 0.7997, + "num_input_tokens_seen": 14239696, + "step": 24540 + }, + { + "epoch": 3.6557938635686624, + "grad_norm": 0.03955078125, + "learning_rate": 0.029378091759986284, + "loss": 0.8038, + "num_input_tokens_seen": 14242576, + "step": 24545 + }, + { + "epoch": 3.6565385761096216, + "grad_norm": 0.0186767578125, + "learning_rate": 0.02937753606572746, + "loss": 0.7944, + "num_input_tokens_seen": 14245648, + "step": 24550 + }, + { + "epoch": 3.657283288650581, + "grad_norm": 0.038330078125, + "learning_rate": 0.029376980128574783, + "loss": 0.7948, + "num_input_tokens_seen": 14248464, + "step": 24555 + }, + { + "epoch": 3.65802800119154, + "grad_norm": 0.0159912109375, + "learning_rate": 0.029376423948537636, + "loss": 0.8018, + "num_input_tokens_seen": 14251568, + "step": 24560 + }, + { + "epoch": 3.6587727137324992, + "grad_norm": 0.036376953125, + "learning_rate": 0.02937586752562542, + "loss": 0.7938, + "num_input_tokens_seen": 14254544, + "step": 24565 + }, + { + "epoch": 3.6595174262734584, + "grad_norm": 0.02685546875, + "learning_rate": 0.02937531085984753, + "loss": 0.796, + "num_input_tokens_seen": 14257296, + "step": 24570 + }, + { + "epoch": 3.6602621388144176, + "grad_norm": 0.04248046875, + "learning_rate": 0.02937475395121338, + "loss": 0.7928, + "num_input_tokens_seen": 14260368, + "step": 24575 + }, + { + "epoch": 3.661006851355377, + "grad_norm": 0.07470703125, + "learning_rate": 0.029374196799732365, + "loss": 0.8218, + "num_input_tokens_seen": 14263216, + "step": 24580 + }, + { + "epoch": 3.661751563896336, + "grad_norm": 0.036376953125, + "learning_rate": 0.029373639405413907, + "loss": 0.7896, + "num_input_tokens_seen": 14266192, + "step": 24585 + }, + { + "epoch": 3.6624962764372953, + "grad_norm": 0.0283203125, + "learning_rate": 0.029373081768267417, + "loss": 0.8124, + "num_input_tokens_seen": 14268976, + "step": 24590 + }, + { + "epoch": 3.6632409889782545, + "grad_norm": 0.060302734375, + "learning_rate": 0.029372523888302325, + "loss": 0.7919, + "num_input_tokens_seen": 14271664, + "step": 24595 + }, + { + "epoch": 3.6639857015192137, + "grad_norm": 0.024658203125, + "learning_rate": 0.029371965765528044, + "loss": 0.7994, + "num_input_tokens_seen": 14274416, + "step": 24600 + }, + { + "epoch": 3.664730414060173, + "grad_norm": 0.02587890625, + "learning_rate": 0.029371407399954013, + "loss": 0.7906, + "num_input_tokens_seen": 14277456, + "step": 24605 + }, + { + "epoch": 3.665475126601132, + "grad_norm": 0.03955078125, + "learning_rate": 0.029370848791589656, + "loss": 0.8118, + "num_input_tokens_seen": 14280624, + "step": 24610 + }, + { + "epoch": 3.6662198391420913, + "grad_norm": 0.0277099609375, + "learning_rate": 0.029370289940444414, + "loss": 0.8419, + "num_input_tokens_seen": 14283536, + "step": 24615 + }, + { + "epoch": 3.6669645516830505, + "grad_norm": 0.0286865234375, + "learning_rate": 0.029369730846527733, + "loss": 0.795, + "num_input_tokens_seen": 14286640, + "step": 24620 + }, + { + "epoch": 3.6677092642240097, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02936917150984905, + "loss": 0.8172, + "num_input_tokens_seen": 14289584, + "step": 24625 + }, + { + "epoch": 3.668453976764969, + "grad_norm": 0.0291748046875, + "learning_rate": 0.029368611930417822, + "loss": 0.8006, + "num_input_tokens_seen": 14292528, + "step": 24630 + }, + { + "epoch": 3.669198689305928, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0293680521082435, + "loss": 0.8143, + "num_input_tokens_seen": 14295440, + "step": 24635 + }, + { + "epoch": 3.6699434018468873, + "grad_norm": 0.0244140625, + "learning_rate": 0.02936749204333554, + "loss": 0.807, + "num_input_tokens_seen": 14298672, + "step": 24640 + }, + { + "epoch": 3.670688114387846, + "grad_norm": 0.026611328125, + "learning_rate": 0.0293669317357034, + "loss": 0.8048, + "num_input_tokens_seen": 14301296, + "step": 24645 + }, + { + "epoch": 3.6714328269288057, + "grad_norm": 0.03466796875, + "learning_rate": 0.029366371185356552, + "loss": 0.7927, + "num_input_tokens_seen": 14304208, + "step": 24650 + }, + { + "epoch": 3.6721775394697644, + "grad_norm": 0.032958984375, + "learning_rate": 0.029365810392304466, + "loss": 0.7917, + "num_input_tokens_seen": 14307504, + "step": 24655 + }, + { + "epoch": 3.672922252010724, + "grad_norm": 0.0234375, + "learning_rate": 0.029365249356556614, + "loss": 0.8006, + "num_input_tokens_seen": 14310736, + "step": 24660 + }, + { + "epoch": 3.673666964551683, + "grad_norm": 0.0283203125, + "learning_rate": 0.02936468807812247, + "loss": 0.7823, + "num_input_tokens_seen": 14313776, + "step": 24665 + }, + { + "epoch": 3.6744116770926425, + "grad_norm": 0.038818359375, + "learning_rate": 0.029364126557011525, + "loss": 0.8135, + "num_input_tokens_seen": 14316784, + "step": 24670 + }, + { + "epoch": 3.6751563896336013, + "grad_norm": 0.031005859375, + "learning_rate": 0.029363564793233262, + "loss": 0.8027, + "num_input_tokens_seen": 14319440, + "step": 24675 + }, + { + "epoch": 3.675901102174561, + "grad_norm": 0.0308837890625, + "learning_rate": 0.029363002786797168, + "loss": 0.8099, + "num_input_tokens_seen": 14322384, + "step": 24680 + }, + { + "epoch": 3.6766458147155197, + "grad_norm": 0.032958984375, + "learning_rate": 0.02936244053771274, + "loss": 0.7721, + "num_input_tokens_seen": 14325200, + "step": 24685 + }, + { + "epoch": 3.677390527256479, + "grad_norm": 0.0235595703125, + "learning_rate": 0.029361878045989477, + "loss": 0.8283, + "num_input_tokens_seen": 14328144, + "step": 24690 + }, + { + "epoch": 3.678135239797438, + "grad_norm": 0.03564453125, + "learning_rate": 0.029361315311636875, + "loss": 0.8138, + "num_input_tokens_seen": 14330864, + "step": 24695 + }, + { + "epoch": 3.6788799523383973, + "grad_norm": 0.0238037109375, + "learning_rate": 0.029360752334664456, + "loss": 0.8094, + "num_input_tokens_seen": 14333808, + "step": 24700 + }, + { + "epoch": 3.6796246648793565, + "grad_norm": 0.0235595703125, + "learning_rate": 0.029360189115081717, + "loss": 0.8027, + "num_input_tokens_seen": 14336816, + "step": 24705 + }, + { + "epoch": 3.6803693774203157, + "grad_norm": 0.031494140625, + "learning_rate": 0.02935962565289818, + "loss": 0.7653, + "num_input_tokens_seen": 14340016, + "step": 24710 + }, + { + "epoch": 3.681114089961275, + "grad_norm": 0.0196533203125, + "learning_rate": 0.029359061948123363, + "loss": 0.8016, + "num_input_tokens_seen": 14343664, + "step": 24715 + }, + { + "epoch": 3.681858802502234, + "grad_norm": 0.02099609375, + "learning_rate": 0.029358498000766786, + "loss": 0.7921, + "num_input_tokens_seen": 14346480, + "step": 24720 + }, + { + "epoch": 3.6826035150431933, + "grad_norm": 0.014892578125, + "learning_rate": 0.02935793381083798, + "loss": 0.8104, + "num_input_tokens_seen": 14349136, + "step": 24725 + }, + { + "epoch": 3.6833482275841525, + "grad_norm": 0.02734375, + "learning_rate": 0.029357369378346473, + "loss": 0.8109, + "num_input_tokens_seen": 14351760, + "step": 24730 + }, + { + "epoch": 3.6840929401251117, + "grad_norm": 0.0218505859375, + "learning_rate": 0.029356804703301806, + "loss": 0.8025, + "num_input_tokens_seen": 14354640, + "step": 24735 + }, + { + "epoch": 3.684837652666071, + "grad_norm": 0.01556396484375, + "learning_rate": 0.029356239785713514, + "loss": 0.8076, + "num_input_tokens_seen": 14357680, + "step": 24740 + }, + { + "epoch": 3.68558236520703, + "grad_norm": 0.032958984375, + "learning_rate": 0.02935567462559114, + "loss": 0.8182, + "num_input_tokens_seen": 14360464, + "step": 24745 + }, + { + "epoch": 3.6863270777479893, + "grad_norm": 0.0283203125, + "learning_rate": 0.029355109222944233, + "loss": 0.8228, + "num_input_tokens_seen": 14363408, + "step": 24750 + }, + { + "epoch": 3.6870717902889485, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02935454357778235, + "loss": 0.8234, + "num_input_tokens_seen": 14366512, + "step": 24755 + }, + { + "epoch": 3.6878165028299077, + "grad_norm": 0.0189208984375, + "learning_rate": 0.029353977690115038, + "loss": 0.799, + "num_input_tokens_seen": 14369424, + "step": 24760 + }, + { + "epoch": 3.688561215370867, + "grad_norm": 0.0240478515625, + "learning_rate": 0.029353411559951862, + "loss": 0.8044, + "num_input_tokens_seen": 14372400, + "step": 24765 + }, + { + "epoch": 3.689305927911826, + "grad_norm": 0.0308837890625, + "learning_rate": 0.029352845187302388, + "loss": 0.8018, + "num_input_tokens_seen": 14375280, + "step": 24770 + }, + { + "epoch": 3.6900506404527853, + "grad_norm": 0.02099609375, + "learning_rate": 0.029352278572176184, + "loss": 0.7956, + "num_input_tokens_seen": 14378288, + "step": 24775 + }, + { + "epoch": 3.6907953529937445, + "grad_norm": 0.02099609375, + "learning_rate": 0.02935171171458282, + "loss": 0.803, + "num_input_tokens_seen": 14381200, + "step": 24780 + }, + { + "epoch": 3.6915400655347037, + "grad_norm": 0.014404296875, + "learning_rate": 0.029351144614531866, + "loss": 0.8204, + "num_input_tokens_seen": 14384560, + "step": 24785 + }, + { + "epoch": 3.692284778075663, + "grad_norm": 0.02099609375, + "learning_rate": 0.029350577272032916, + "loss": 0.7817, + "num_input_tokens_seen": 14387408, + "step": 24790 + }, + { + "epoch": 3.693029490616622, + "grad_norm": 0.017333984375, + "learning_rate": 0.029350009687095547, + "loss": 0.8058, + "num_input_tokens_seen": 14390512, + "step": 24795 + }, + { + "epoch": 3.6937742031575813, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02934944185972935, + "loss": 0.7783, + "num_input_tokens_seen": 14393296, + "step": 24800 + }, + { + "epoch": 3.6945189156985405, + "grad_norm": 0.019287109375, + "learning_rate": 0.029348873789943913, + "loss": 0.7783, + "num_input_tokens_seen": 14395984, + "step": 24805 + }, + { + "epoch": 3.6952636282394993, + "grad_norm": 0.0198974609375, + "learning_rate": 0.029348305477748842, + "loss": 0.8154, + "num_input_tokens_seen": 14399152, + "step": 24810 + }, + { + "epoch": 3.696008340780459, + "grad_norm": 0.019775390625, + "learning_rate": 0.02934773692315373, + "loss": 0.7934, + "num_input_tokens_seen": 14402064, + "step": 24815 + }, + { + "epoch": 3.6967530533214177, + "grad_norm": 0.0208740234375, + "learning_rate": 0.029347168126168183, + "loss": 0.7985, + "num_input_tokens_seen": 14404688, + "step": 24820 + }, + { + "epoch": 3.6974977658623773, + "grad_norm": 0.0341796875, + "learning_rate": 0.029346599086801817, + "loss": 0.7933, + "num_input_tokens_seen": 14407600, + "step": 24825 + }, + { + "epoch": 3.698242478403336, + "grad_norm": 0.0301513671875, + "learning_rate": 0.02934602980506424, + "loss": 0.7922, + "num_input_tokens_seen": 14410416, + "step": 24830 + }, + { + "epoch": 3.6989871909442957, + "grad_norm": 0.02197265625, + "learning_rate": 0.029345460280965067, + "loss": 0.811, + "num_input_tokens_seen": 14413296, + "step": 24835 + }, + { + "epoch": 3.6997319034852545, + "grad_norm": 0.0223388671875, + "learning_rate": 0.029344890514513926, + "loss": 0.7902, + "num_input_tokens_seen": 14415792, + "step": 24840 + }, + { + "epoch": 3.700476616026214, + "grad_norm": 0.0234375, + "learning_rate": 0.029344320505720434, + "loss": 0.7971, + "num_input_tokens_seen": 14418896, + "step": 24845 + }, + { + "epoch": 3.701221328567173, + "grad_norm": 0.029052734375, + "learning_rate": 0.029343750254594233, + "loss": 0.7795, + "num_input_tokens_seen": 14421584, + "step": 24850 + }, + { + "epoch": 3.701966041108132, + "grad_norm": 0.01904296875, + "learning_rate": 0.029343179761144946, + "loss": 0.8055, + "num_input_tokens_seen": 14424528, + "step": 24855 + }, + { + "epoch": 3.7027107536490913, + "grad_norm": 0.024169921875, + "learning_rate": 0.029342609025382212, + "loss": 0.7826, + "num_input_tokens_seen": 14427888, + "step": 24860 + }, + { + "epoch": 3.7034554661900505, + "grad_norm": 0.03564453125, + "learning_rate": 0.029342038047315683, + "loss": 0.7858, + "num_input_tokens_seen": 14430768, + "step": 24865 + }, + { + "epoch": 3.7042001787310097, + "grad_norm": 0.025634765625, + "learning_rate": 0.02934146682695499, + "loss": 0.7985, + "num_input_tokens_seen": 14433872, + "step": 24870 + }, + { + "epoch": 3.704944891271969, + "grad_norm": 0.0230712890625, + "learning_rate": 0.029340895364309796, + "loss": 0.8313, + "num_input_tokens_seen": 14436624, + "step": 24875 + }, + { + "epoch": 3.705689603812928, + "grad_norm": 0.023681640625, + "learning_rate": 0.02934032365938975, + "loss": 0.7622, + "num_input_tokens_seen": 14439952, + "step": 24880 + }, + { + "epoch": 3.7064343163538873, + "grad_norm": 0.03955078125, + "learning_rate": 0.029339751712204508, + "loss": 0.8433, + "num_input_tokens_seen": 14443056, + "step": 24885 + }, + { + "epoch": 3.7071790288948465, + "grad_norm": 0.0211181640625, + "learning_rate": 0.029339179522763735, + "loss": 0.8395, + "num_input_tokens_seen": 14445808, + "step": 24890 + }, + { + "epoch": 3.7079237414358057, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0293386070910771, + "loss": 0.7872, + "num_input_tokens_seen": 14448624, + "step": 24895 + }, + { + "epoch": 3.708668453976765, + "grad_norm": 0.01434326171875, + "learning_rate": 0.029338034417154272, + "loss": 0.8167, + "num_input_tokens_seen": 14451472, + "step": 24900 + }, + { + "epoch": 3.709413166517724, + "grad_norm": 0.0216064453125, + "learning_rate": 0.029337461501004924, + "loss": 0.7728, + "num_input_tokens_seen": 14454320, + "step": 24905 + }, + { + "epoch": 3.7101578790586833, + "grad_norm": 0.0242919921875, + "learning_rate": 0.029336888342638735, + "loss": 0.7932, + "num_input_tokens_seen": 14457424, + "step": 24910 + }, + { + "epoch": 3.7109025915996425, + "grad_norm": 0.0206298828125, + "learning_rate": 0.029336314942065386, + "loss": 0.8194, + "num_input_tokens_seen": 14460336, + "step": 24915 + }, + { + "epoch": 3.7116473041406017, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02933574129929457, + "loss": 0.8157, + "num_input_tokens_seen": 14463280, + "step": 24920 + }, + { + "epoch": 3.712392016681561, + "grad_norm": 0.022705078125, + "learning_rate": 0.029335167414335978, + "loss": 0.8148, + "num_input_tokens_seen": 14466160, + "step": 24925 + }, + { + "epoch": 3.71313672922252, + "grad_norm": 0.0208740234375, + "learning_rate": 0.029334593287199298, + "loss": 0.7947, + "num_input_tokens_seen": 14468880, + "step": 24930 + }, + { + "epoch": 3.7138814417634793, + "grad_norm": 0.0203857421875, + "learning_rate": 0.029334018917894233, + "loss": 0.7957, + "num_input_tokens_seen": 14471632, + "step": 24935 + }, + { + "epoch": 3.7146261543044385, + "grad_norm": 0.03173828125, + "learning_rate": 0.02933344430643049, + "loss": 0.7904, + "num_input_tokens_seen": 14474608, + "step": 24940 + }, + { + "epoch": 3.7153708668453977, + "grad_norm": 0.0166015625, + "learning_rate": 0.02933286945281777, + "loss": 0.8125, + "num_input_tokens_seen": 14477680, + "step": 24945 + }, + { + "epoch": 3.716115579386357, + "grad_norm": 0.0211181640625, + "learning_rate": 0.02933229435706579, + "loss": 0.7782, + "num_input_tokens_seen": 14480496, + "step": 24950 + }, + { + "epoch": 3.716860291927316, + "grad_norm": 0.0159912109375, + "learning_rate": 0.029331719019184264, + "loss": 0.8203, + "num_input_tokens_seen": 14483472, + "step": 24955 + }, + { + "epoch": 3.7176050044682754, + "grad_norm": 0.03759765625, + "learning_rate": 0.02933114343918291, + "loss": 0.814, + "num_input_tokens_seen": 14486224, + "step": 24960 + }, + { + "epoch": 3.7183497170092346, + "grad_norm": 0.032470703125, + "learning_rate": 0.02933056761707145, + "loss": 0.8495, + "num_input_tokens_seen": 14489456, + "step": 24965 + }, + { + "epoch": 3.7190944295501938, + "grad_norm": 0.0341796875, + "learning_rate": 0.029329991552859622, + "loss": 0.7973, + "num_input_tokens_seen": 14492784, + "step": 24970 + }, + { + "epoch": 3.719839142091153, + "grad_norm": 0.0198974609375, + "learning_rate": 0.029329415246557147, + "loss": 0.7837, + "num_input_tokens_seen": 14495664, + "step": 24975 + }, + { + "epoch": 3.720583854632112, + "grad_norm": 0.027099609375, + "learning_rate": 0.029328838698173765, + "loss": 0.8257, + "num_input_tokens_seen": 14498640, + "step": 24980 + }, + { + "epoch": 3.721328567173071, + "grad_norm": 0.0218505859375, + "learning_rate": 0.029328261907719217, + "loss": 0.8173, + "num_input_tokens_seen": 14501488, + "step": 24985 + }, + { + "epoch": 3.7220732797140306, + "grad_norm": 0.02001953125, + "learning_rate": 0.02932768487520325, + "loss": 0.7944, + "num_input_tokens_seen": 14504528, + "step": 24990 + }, + { + "epoch": 3.7228179922549893, + "grad_norm": 0.0211181640625, + "learning_rate": 0.029327107600635602, + "loss": 0.8043, + "num_input_tokens_seen": 14507472, + "step": 24995 + }, + { + "epoch": 3.723562704795949, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02932653008402604, + "loss": 0.8169, + "num_input_tokens_seen": 14510256, + "step": 25000 + }, + { + "epoch": 3.7243074173369077, + "grad_norm": 0.020751953125, + "learning_rate": 0.02932595232538431, + "loss": 0.7947, + "num_input_tokens_seen": 14513200, + "step": 25005 + }, + { + "epoch": 3.7250521298778674, + "grad_norm": 0.0299072265625, + "learning_rate": 0.029325374324720172, + "loss": 0.8228, + "num_input_tokens_seen": 14515792, + "step": 25010 + }, + { + "epoch": 3.725796842418826, + "grad_norm": 0.01287841796875, + "learning_rate": 0.0293247960820434, + "loss": 0.8204, + "num_input_tokens_seen": 14518640, + "step": 25015 + }, + { + "epoch": 3.726541554959786, + "grad_norm": 0.01519775390625, + "learning_rate": 0.029324217597363755, + "loss": 0.8022, + "num_input_tokens_seen": 14521904, + "step": 25020 + }, + { + "epoch": 3.7272862675007445, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02932363887069101, + "loss": 0.7905, + "num_input_tokens_seen": 14524848, + "step": 25025 + }, + { + "epoch": 3.7280309800417037, + "grad_norm": 0.031005859375, + "learning_rate": 0.029323059902034945, + "loss": 0.7833, + "num_input_tokens_seen": 14527600, + "step": 25030 + }, + { + "epoch": 3.728775692582663, + "grad_norm": 0.02099609375, + "learning_rate": 0.02932248069140534, + "loss": 0.8158, + "num_input_tokens_seen": 14530352, + "step": 25035 + }, + { + "epoch": 3.729520405123622, + "grad_norm": 0.0120849609375, + "learning_rate": 0.029321901238811985, + "loss": 0.7883, + "num_input_tokens_seen": 14533328, + "step": 25040 + }, + { + "epoch": 3.7302651176645814, + "grad_norm": 0.021240234375, + "learning_rate": 0.029321321544264662, + "loss": 0.797, + "num_input_tokens_seen": 14536624, + "step": 25045 + }, + { + "epoch": 3.7310098302055406, + "grad_norm": 0.020751953125, + "learning_rate": 0.029320741607773166, + "loss": 0.8052, + "num_input_tokens_seen": 14539280, + "step": 25050 + }, + { + "epoch": 3.7317545427464998, + "grad_norm": 0.028564453125, + "learning_rate": 0.029320161429347298, + "loss": 0.8096, + "num_input_tokens_seen": 14542000, + "step": 25055 + }, + { + "epoch": 3.732499255287459, + "grad_norm": 0.01287841796875, + "learning_rate": 0.029319581008996856, + "loss": 0.7849, + "num_input_tokens_seen": 14545072, + "step": 25060 + }, + { + "epoch": 3.733243967828418, + "grad_norm": 0.02294921875, + "learning_rate": 0.029319000346731645, + "loss": 0.8053, + "num_input_tokens_seen": 14547920, + "step": 25065 + }, + { + "epoch": 3.7339886803693774, + "grad_norm": 0.0225830078125, + "learning_rate": 0.029318419442561475, + "loss": 0.7988, + "num_input_tokens_seen": 14550800, + "step": 25070 + }, + { + "epoch": 3.7347333929103366, + "grad_norm": 0.02490234375, + "learning_rate": 0.02931783829649616, + "loss": 0.7987, + "num_input_tokens_seen": 14553680, + "step": 25075 + }, + { + "epoch": 3.7354781054512958, + "grad_norm": 0.03173828125, + "learning_rate": 0.029317256908545523, + "loss": 0.7823, + "num_input_tokens_seen": 14556624, + "step": 25080 + }, + { + "epoch": 3.736222817992255, + "grad_norm": 0.046142578125, + "learning_rate": 0.02931667527871938, + "loss": 0.816, + "num_input_tokens_seen": 14559568, + "step": 25085 + }, + { + "epoch": 3.736967530533214, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02931609340702756, + "loss": 0.8132, + "num_input_tokens_seen": 14562608, + "step": 25090 + }, + { + "epoch": 3.7377122430741734, + "grad_norm": 0.031982421875, + "learning_rate": 0.029315511293479896, + "loss": 0.8232, + "num_input_tokens_seen": 14565456, + "step": 25095 + }, + { + "epoch": 3.7384569556151326, + "grad_norm": 0.0235595703125, + "learning_rate": 0.029314928938086213, + "loss": 0.8044, + "num_input_tokens_seen": 14568304, + "step": 25100 + }, + { + "epoch": 3.739201668156092, + "grad_norm": 0.034423828125, + "learning_rate": 0.029314346340856355, + "loss": 0.7972, + "num_input_tokens_seen": 14571248, + "step": 25105 + }, + { + "epoch": 3.739946380697051, + "grad_norm": 0.0294189453125, + "learning_rate": 0.029313763501800167, + "loss": 0.805, + "num_input_tokens_seen": 14574096, + "step": 25110 + }, + { + "epoch": 3.74069109323801, + "grad_norm": 0.0203857421875, + "learning_rate": 0.029313180420927492, + "loss": 0.7784, + "num_input_tokens_seen": 14576688, + "step": 25115 + }, + { + "epoch": 3.7414358057789694, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02931259709824818, + "loss": 0.8001, + "num_input_tokens_seen": 14579504, + "step": 25120 + }, + { + "epoch": 3.7421805183199286, + "grad_norm": 0.0205078125, + "learning_rate": 0.029312013533772083, + "loss": 0.8088, + "num_input_tokens_seen": 14582320, + "step": 25125 + }, + { + "epoch": 3.742925230860888, + "grad_norm": 0.0380859375, + "learning_rate": 0.029311429727509067, + "loss": 0.822, + "num_input_tokens_seen": 14585072, + "step": 25130 + }, + { + "epoch": 3.743669943401847, + "grad_norm": 0.03173828125, + "learning_rate": 0.029310845679468994, + "loss": 0.822, + "num_input_tokens_seen": 14588016, + "step": 25135 + }, + { + "epoch": 3.744414655942806, + "grad_norm": 0.01416015625, + "learning_rate": 0.029310261389661724, + "loss": 0.8003, + "num_input_tokens_seen": 14591088, + "step": 25140 + }, + { + "epoch": 3.7451593684837654, + "grad_norm": 0.03271484375, + "learning_rate": 0.02930967685809713, + "loss": 0.8001, + "num_input_tokens_seen": 14593712, + "step": 25145 + }, + { + "epoch": 3.7459040810247246, + "grad_norm": 0.0223388671875, + "learning_rate": 0.029309092084785094, + "loss": 0.7849, + "num_input_tokens_seen": 14596528, + "step": 25150 + }, + { + "epoch": 3.746648793565684, + "grad_norm": 0.0224609375, + "learning_rate": 0.02930850706973549, + "loss": 0.8066, + "num_input_tokens_seen": 14599280, + "step": 25155 + }, + { + "epoch": 3.7473935061066426, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0293079218129582, + "loss": 0.8139, + "num_input_tokens_seen": 14602352, + "step": 25160 + }, + { + "epoch": 3.748138218647602, + "grad_norm": 0.030517578125, + "learning_rate": 0.029307336314463115, + "loss": 0.802, + "num_input_tokens_seen": 14605264, + "step": 25165 + }, + { + "epoch": 3.748882931188561, + "grad_norm": 0.0216064453125, + "learning_rate": 0.029306750574260118, + "loss": 0.7929, + "num_input_tokens_seen": 14608208, + "step": 25170 + }, + { + "epoch": 3.7496276437295206, + "grad_norm": 0.021484375, + "learning_rate": 0.029306164592359116, + "loss": 0.7878, + "num_input_tokens_seen": 14611376, + "step": 25175 + }, + { + "epoch": 3.7503723562704794, + "grad_norm": 0.0159912109375, + "learning_rate": 0.029305578368770006, + "loss": 0.7914, + "num_input_tokens_seen": 14614512, + "step": 25180 + }, + { + "epoch": 3.751117068811439, + "grad_norm": 0.018310546875, + "learning_rate": 0.029304991903502685, + "loss": 0.776, + "num_input_tokens_seen": 14617424, + "step": 25185 + }, + { + "epoch": 3.751861781352398, + "grad_norm": 0.0247802734375, + "learning_rate": 0.029304405196567067, + "loss": 0.7739, + "num_input_tokens_seen": 14620400, + "step": 25190 + }, + { + "epoch": 3.7526064938933574, + "grad_norm": 0.0302734375, + "learning_rate": 0.02930381824797306, + "loss": 0.8325, + "num_input_tokens_seen": 14623728, + "step": 25195 + }, + { + "epoch": 3.753351206434316, + "grad_norm": 0.01806640625, + "learning_rate": 0.02930323105773058, + "loss": 0.7735, + "num_input_tokens_seen": 14626864, + "step": 25200 + }, + { + "epoch": 3.7540959189752754, + "grad_norm": 0.022216796875, + "learning_rate": 0.029302643625849552, + "loss": 0.8028, + "num_input_tokens_seen": 14629712, + "step": 25205 + }, + { + "epoch": 3.7548406315162346, + "grad_norm": 0.0247802734375, + "learning_rate": 0.029302055952339895, + "loss": 0.8089, + "num_input_tokens_seen": 14632432, + "step": 25210 + }, + { + "epoch": 3.755585344057194, + "grad_norm": 0.034423828125, + "learning_rate": 0.029301468037211537, + "loss": 0.7719, + "num_input_tokens_seen": 14635216, + "step": 25215 + }, + { + "epoch": 3.756330056598153, + "grad_norm": 0.02685546875, + "learning_rate": 0.029300879880474414, + "loss": 0.8385, + "num_input_tokens_seen": 14638160, + "step": 25220 + }, + { + "epoch": 3.757074769139112, + "grad_norm": 0.02490234375, + "learning_rate": 0.02930029148213846, + "loss": 0.8246, + "num_input_tokens_seen": 14641360, + "step": 25225 + }, + { + "epoch": 3.7578194816800714, + "grad_norm": 0.03125, + "learning_rate": 0.029299702842213612, + "loss": 0.766, + "num_input_tokens_seen": 14644432, + "step": 25230 + }, + { + "epoch": 3.7585641942210306, + "grad_norm": 0.0218505859375, + "learning_rate": 0.029299113960709822, + "loss": 0.7909, + "num_input_tokens_seen": 14647856, + "step": 25235 + }, + { + "epoch": 3.75930890676199, + "grad_norm": 0.03369140625, + "learning_rate": 0.029298524837637032, + "loss": 0.7946, + "num_input_tokens_seen": 14650704, + "step": 25240 + }, + { + "epoch": 3.760053619302949, + "grad_norm": 0.0218505859375, + "learning_rate": 0.029297935473005198, + "loss": 0.7919, + "num_input_tokens_seen": 14653552, + "step": 25245 + }, + { + "epoch": 3.760798331843908, + "grad_norm": 0.019775390625, + "learning_rate": 0.029297345866824278, + "loss": 0.8039, + "num_input_tokens_seen": 14656688, + "step": 25250 + }, + { + "epoch": 3.7615430443848674, + "grad_norm": 0.021240234375, + "learning_rate": 0.02929675601910423, + "loss": 0.8435, + "num_input_tokens_seen": 14659664, + "step": 25255 + }, + { + "epoch": 3.7622877569258266, + "grad_norm": 0.02294921875, + "learning_rate": 0.02929616592985502, + "loss": 0.7957, + "num_input_tokens_seen": 14662608, + "step": 25260 + }, + { + "epoch": 3.763032469466786, + "grad_norm": 0.01220703125, + "learning_rate": 0.029295575599086614, + "loss": 0.794, + "num_input_tokens_seen": 14665584, + "step": 25265 + }, + { + "epoch": 3.763777182007745, + "grad_norm": 0.0135498046875, + "learning_rate": 0.029294985026808988, + "loss": 0.7909, + "num_input_tokens_seen": 14668560, + "step": 25270 + }, + { + "epoch": 3.7645218945487042, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02929439421303212, + "loss": 0.8085, + "num_input_tokens_seen": 14671184, + "step": 25275 + }, + { + "epoch": 3.7652666070896634, + "grad_norm": 0.01513671875, + "learning_rate": 0.02929380315776599, + "loss": 0.8132, + "num_input_tokens_seen": 14674064, + "step": 25280 + }, + { + "epoch": 3.7660113196306226, + "grad_norm": 0.03173828125, + "learning_rate": 0.029293211861020584, + "loss": 0.8026, + "num_input_tokens_seen": 14676848, + "step": 25285 + }, + { + "epoch": 3.766756032171582, + "grad_norm": 0.021728515625, + "learning_rate": 0.029292620322805888, + "loss": 0.8001, + "num_input_tokens_seen": 14679664, + "step": 25290 + }, + { + "epoch": 3.767500744712541, + "grad_norm": 0.0247802734375, + "learning_rate": 0.029292028543131902, + "loss": 0.7777, + "num_input_tokens_seen": 14682704, + "step": 25295 + }, + { + "epoch": 3.7682454572535002, + "grad_norm": 0.0294189453125, + "learning_rate": 0.029291436522008617, + "loss": 0.806, + "num_input_tokens_seen": 14685456, + "step": 25300 + }, + { + "epoch": 3.7689901697944594, + "grad_norm": 0.020751953125, + "learning_rate": 0.029290844259446037, + "loss": 0.8172, + "num_input_tokens_seen": 14688592, + "step": 25305 + }, + { + "epoch": 3.7697348823354186, + "grad_norm": 0.030517578125, + "learning_rate": 0.029290251755454168, + "loss": 0.7665, + "num_input_tokens_seen": 14691760, + "step": 25310 + }, + { + "epoch": 3.770479594876378, + "grad_norm": 0.01165771484375, + "learning_rate": 0.02928965901004302, + "loss": 0.8023, + "num_input_tokens_seen": 14694768, + "step": 25315 + }, + { + "epoch": 3.771224307417337, + "grad_norm": 0.03564453125, + "learning_rate": 0.029289066023222607, + "loss": 0.8167, + "num_input_tokens_seen": 14698096, + "step": 25320 + }, + { + "epoch": 3.7719690199582963, + "grad_norm": 0.0322265625, + "learning_rate": 0.029288472795002945, + "loss": 0.8009, + "num_input_tokens_seen": 14700816, + "step": 25325 + }, + { + "epoch": 3.7727137324992555, + "grad_norm": 0.0205078125, + "learning_rate": 0.029287879325394055, + "loss": 0.7631, + "num_input_tokens_seen": 14703664, + "step": 25330 + }, + { + "epoch": 3.773458445040214, + "grad_norm": 0.0301513671875, + "learning_rate": 0.02928728561440597, + "loss": 0.8174, + "num_input_tokens_seen": 14706672, + "step": 25335 + }, + { + "epoch": 3.774203157581174, + "grad_norm": 0.01348876953125, + "learning_rate": 0.02928669166204871, + "loss": 0.8257, + "num_input_tokens_seen": 14709552, + "step": 25340 + }, + { + "epoch": 3.7749478701221326, + "grad_norm": 0.023193359375, + "learning_rate": 0.02928609746833232, + "loss": 0.7775, + "num_input_tokens_seen": 14712720, + "step": 25345 + }, + { + "epoch": 3.7756925826630923, + "grad_norm": 0.021728515625, + "learning_rate": 0.02928550303326683, + "loss": 0.7973, + "num_input_tokens_seen": 14715280, + "step": 25350 + }, + { + "epoch": 3.776437295204051, + "grad_norm": 0.0262451171875, + "learning_rate": 0.029284908356862288, + "loss": 0.8129, + "num_input_tokens_seen": 14717968, + "step": 25355 + }, + { + "epoch": 3.7771820077450107, + "grad_norm": 0.0244140625, + "learning_rate": 0.029284313439128736, + "loss": 0.7804, + "num_input_tokens_seen": 14720560, + "step": 25360 + }, + { + "epoch": 3.7779267202859694, + "grad_norm": 0.025634765625, + "learning_rate": 0.02928371828007623, + "loss": 0.8067, + "num_input_tokens_seen": 14723472, + "step": 25365 + }, + { + "epoch": 3.778671432826929, + "grad_norm": 0.026611328125, + "learning_rate": 0.029283122879714813, + "loss": 0.8064, + "num_input_tokens_seen": 14726256, + "step": 25370 + }, + { + "epoch": 3.779416145367888, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02928252723805456, + "loss": 0.8434, + "num_input_tokens_seen": 14728944, + "step": 25375 + }, + { + "epoch": 3.780160857908847, + "grad_norm": 0.0228271484375, + "learning_rate": 0.029281931355105522, + "loss": 0.7805, + "num_input_tokens_seen": 14731664, + "step": 25380 + }, + { + "epoch": 3.7809055704498062, + "grad_norm": 0.02978515625, + "learning_rate": 0.029281335230877767, + "loss": 0.837, + "num_input_tokens_seen": 14734736, + "step": 25385 + }, + { + "epoch": 3.7816502829907654, + "grad_norm": 0.01336669921875, + "learning_rate": 0.02928073886538137, + "loss": 0.8258, + "num_input_tokens_seen": 14737808, + "step": 25390 + }, + { + "epoch": 3.7823949955317246, + "grad_norm": 0.024169921875, + "learning_rate": 0.029280142258626407, + "loss": 0.8225, + "num_input_tokens_seen": 14740688, + "step": 25395 + }, + { + "epoch": 3.783139708072684, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02927954541062295, + "loss": 0.804, + "num_input_tokens_seen": 14743344, + "step": 25400 + }, + { + "epoch": 3.783884420613643, + "grad_norm": 0.028076171875, + "learning_rate": 0.02927894832138109, + "loss": 0.7913, + "num_input_tokens_seen": 14746096, + "step": 25405 + }, + { + "epoch": 3.7846291331546023, + "grad_norm": 0.02685546875, + "learning_rate": 0.029278350990910907, + "loss": 0.8062, + "num_input_tokens_seen": 14749328, + "step": 25410 + }, + { + "epoch": 3.7853738456955615, + "grad_norm": 0.029296875, + "learning_rate": 0.029277753419222497, + "loss": 0.8166, + "num_input_tokens_seen": 14752144, + "step": 25415 + }, + { + "epoch": 3.7861185582365207, + "grad_norm": 0.0224609375, + "learning_rate": 0.029277155606325957, + "loss": 0.8119, + "num_input_tokens_seen": 14754992, + "step": 25420 + }, + { + "epoch": 3.78686327077748, + "grad_norm": 0.01287841796875, + "learning_rate": 0.029276557552231384, + "loss": 0.7998, + "num_input_tokens_seen": 14757712, + "step": 25425 + }, + { + "epoch": 3.787607983318439, + "grad_norm": 0.0206298828125, + "learning_rate": 0.029275959256948876, + "loss": 0.7938, + "num_input_tokens_seen": 14760720, + "step": 25430 + }, + { + "epoch": 3.7883526958593983, + "grad_norm": 0.021484375, + "learning_rate": 0.02927536072048855, + "loss": 0.7948, + "num_input_tokens_seen": 14763472, + "step": 25435 + }, + { + "epoch": 3.7890974084003575, + "grad_norm": 0.020751953125, + "learning_rate": 0.02927476194286051, + "loss": 0.7871, + "num_input_tokens_seen": 14766192, + "step": 25440 + }, + { + "epoch": 3.7898421209413167, + "grad_norm": 0.01348876953125, + "learning_rate": 0.02927416292407488, + "loss": 0.8068, + "num_input_tokens_seen": 14769008, + "step": 25445 + }, + { + "epoch": 3.790586833482276, + "grad_norm": 0.01544189453125, + "learning_rate": 0.029273563664141772, + "loss": 0.816, + "num_input_tokens_seen": 14771984, + "step": 25450 + }, + { + "epoch": 3.791331546023235, + "grad_norm": 0.02294921875, + "learning_rate": 0.029272964163071312, + "loss": 0.8083, + "num_input_tokens_seen": 14775152, + "step": 25455 + }, + { + "epoch": 3.7920762585641943, + "grad_norm": 0.0234375, + "learning_rate": 0.02927236442087363, + "loss": 0.8143, + "num_input_tokens_seen": 14778000, + "step": 25460 + }, + { + "epoch": 3.7928209711051535, + "grad_norm": 0.0322265625, + "learning_rate": 0.029271764437558856, + "loss": 0.8009, + "num_input_tokens_seen": 14780912, + "step": 25465 + }, + { + "epoch": 3.7935656836461127, + "grad_norm": 0.0301513671875, + "learning_rate": 0.02927116421313713, + "loss": 0.792, + "num_input_tokens_seen": 14783888, + "step": 25470 + }, + { + "epoch": 3.794310396187072, + "grad_norm": 0.0225830078125, + "learning_rate": 0.029270563747618588, + "loss": 0.7905, + "num_input_tokens_seen": 14786896, + "step": 25475 + }, + { + "epoch": 3.795055108728031, + "grad_norm": 0.021240234375, + "learning_rate": 0.029269963041013373, + "loss": 0.7992, + "num_input_tokens_seen": 14789968, + "step": 25480 + }, + { + "epoch": 3.7957998212689903, + "grad_norm": 0.01507568359375, + "learning_rate": 0.029269362093331638, + "loss": 0.8143, + "num_input_tokens_seen": 14792816, + "step": 25485 + }, + { + "epoch": 3.7965445338099495, + "grad_norm": 0.035888671875, + "learning_rate": 0.029268760904583527, + "loss": 0.789, + "num_input_tokens_seen": 14795760, + "step": 25490 + }, + { + "epoch": 3.7972892463509087, + "grad_norm": 0.02392578125, + "learning_rate": 0.029268159474779207, + "loss": 0.7934, + "num_input_tokens_seen": 14798416, + "step": 25495 + }, + { + "epoch": 3.798033958891868, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02926755780392884, + "loss": 0.817, + "num_input_tokens_seen": 14801264, + "step": 25500 + }, + { + "epoch": 3.798778671432827, + "grad_norm": 0.01214599609375, + "learning_rate": 0.02926695589204258, + "loss": 0.8071, + "num_input_tokens_seen": 14803888, + "step": 25505 + }, + { + "epoch": 3.799523383973786, + "grad_norm": 0.0302734375, + "learning_rate": 0.0292663537391306, + "loss": 0.7667, + "num_input_tokens_seen": 14806640, + "step": 25510 + }, + { + "epoch": 3.8002680965147455, + "grad_norm": 0.032470703125, + "learning_rate": 0.029265751345203074, + "loss": 0.8097, + "num_input_tokens_seen": 14809552, + "step": 25515 + }, + { + "epoch": 3.8010128090557043, + "grad_norm": 0.0301513671875, + "learning_rate": 0.029265148710270177, + "loss": 0.826, + "num_input_tokens_seen": 14812592, + "step": 25520 + }, + { + "epoch": 3.801757521596664, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02926454583434209, + "loss": 0.7996, + "num_input_tokens_seen": 14815632, + "step": 25525 + }, + { + "epoch": 3.8025022341376227, + "grad_norm": 0.01324462890625, + "learning_rate": 0.029263942717429, + "loss": 0.8125, + "num_input_tokens_seen": 14818544, + "step": 25530 + }, + { + "epoch": 3.8032469466785823, + "grad_norm": 0.03271484375, + "learning_rate": 0.029263339359541098, + "loss": 0.8004, + "num_input_tokens_seen": 14821200, + "step": 25535 + }, + { + "epoch": 3.803991659219541, + "grad_norm": 0.02197265625, + "learning_rate": 0.029262735760688573, + "loss": 0.8027, + "num_input_tokens_seen": 14824080, + "step": 25540 + }, + { + "epoch": 3.8047363717605007, + "grad_norm": 0.0205078125, + "learning_rate": 0.029262131920881618, + "loss": 0.7823, + "num_input_tokens_seen": 14826864, + "step": 25545 + }, + { + "epoch": 3.8054810843014595, + "grad_norm": 0.0166015625, + "learning_rate": 0.029261527840130447, + "loss": 0.8099, + "num_input_tokens_seen": 14830096, + "step": 25550 + }, + { + "epoch": 3.8062257968424187, + "grad_norm": 0.0211181640625, + "learning_rate": 0.029260923518445253, + "loss": 0.7999, + "num_input_tokens_seen": 14833104, + "step": 25555 + }, + { + "epoch": 3.806970509383378, + "grad_norm": 0.0125732421875, + "learning_rate": 0.029260318955836254, + "loss": 0.7838, + "num_input_tokens_seen": 14836432, + "step": 25560 + }, + { + "epoch": 3.807715221924337, + "grad_norm": 0.02099609375, + "learning_rate": 0.02925971415231366, + "loss": 0.8037, + "num_input_tokens_seen": 14839184, + "step": 25565 + }, + { + "epoch": 3.8084599344652963, + "grad_norm": 0.0299072265625, + "learning_rate": 0.029259109107887687, + "loss": 0.8136, + "num_input_tokens_seen": 14842064, + "step": 25570 + }, + { + "epoch": 3.8092046470062555, + "grad_norm": 0.025390625, + "learning_rate": 0.029258503822568553, + "loss": 0.8073, + "num_input_tokens_seen": 14844784, + "step": 25575 + }, + { + "epoch": 3.8099493595472147, + "grad_norm": 0.0308837890625, + "learning_rate": 0.02925789829636649, + "loss": 0.8006, + "num_input_tokens_seen": 14847728, + "step": 25580 + }, + { + "epoch": 3.810694072088174, + "grad_norm": 0.0126953125, + "learning_rate": 0.02925729252929173, + "loss": 0.809, + "num_input_tokens_seen": 14850768, + "step": 25585 + }, + { + "epoch": 3.811438784629133, + "grad_norm": 0.011962890625, + "learning_rate": 0.029256686521354505, + "loss": 0.8114, + "num_input_tokens_seen": 14853360, + "step": 25590 + }, + { + "epoch": 3.8121834971700923, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02925608027256505, + "loss": 0.7998, + "num_input_tokens_seen": 14856400, + "step": 25595 + }, + { + "epoch": 3.8129282097110515, + "grad_norm": 0.0203857421875, + "learning_rate": 0.029255473782933607, + "loss": 0.8054, + "num_input_tokens_seen": 14859344, + "step": 25600 + }, + { + "epoch": 3.8136729222520107, + "grad_norm": 0.0224609375, + "learning_rate": 0.02925486705247042, + "loss": 0.7826, + "num_input_tokens_seen": 14862288, + "step": 25605 + }, + { + "epoch": 3.81441763479297, + "grad_norm": 0.0216064453125, + "learning_rate": 0.029254260081185743, + "loss": 0.7906, + "num_input_tokens_seen": 14865360, + "step": 25610 + }, + { + "epoch": 3.815162347333929, + "grad_norm": 0.024658203125, + "learning_rate": 0.029253652869089833, + "loss": 0.7867, + "num_input_tokens_seen": 14868112, + "step": 25615 + }, + { + "epoch": 3.8159070598748883, + "grad_norm": 0.02392578125, + "learning_rate": 0.02925304541619294, + "loss": 0.8131, + "num_input_tokens_seen": 14871152, + "step": 25620 + }, + { + "epoch": 3.8166517724158475, + "grad_norm": 0.02197265625, + "learning_rate": 0.029252437722505336, + "loss": 0.8332, + "num_input_tokens_seen": 14874096, + "step": 25625 + }, + { + "epoch": 3.8173964849568067, + "grad_norm": 0.0125732421875, + "learning_rate": 0.029251829788037278, + "loss": 0.8162, + "num_input_tokens_seen": 14877136, + "step": 25630 + }, + { + "epoch": 3.818141197497766, + "grad_norm": 0.01165771484375, + "learning_rate": 0.02925122161279904, + "loss": 0.8046, + "num_input_tokens_seen": 14879952, + "step": 25635 + }, + { + "epoch": 3.818885910038725, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0292506131968009, + "loss": 0.8054, + "num_input_tokens_seen": 14882736, + "step": 25640 + }, + { + "epoch": 3.8196306225796843, + "grad_norm": 0.01361083984375, + "learning_rate": 0.029250004540053134, + "loss": 0.8272, + "num_input_tokens_seen": 14885968, + "step": 25645 + }, + { + "epoch": 3.8203753351206435, + "grad_norm": 0.0167236328125, + "learning_rate": 0.029249395642566026, + "loss": 0.8021, + "num_input_tokens_seen": 14889008, + "step": 25650 + }, + { + "epoch": 3.8211200476616027, + "grad_norm": 0.03173828125, + "learning_rate": 0.029248786504349856, + "loss": 0.7977, + "num_input_tokens_seen": 14891920, + "step": 25655 + }, + { + "epoch": 3.821864760202562, + "grad_norm": 0.02001953125, + "learning_rate": 0.029248177125414925, + "loss": 0.7964, + "num_input_tokens_seen": 14894448, + "step": 25660 + }, + { + "epoch": 3.822609472743521, + "grad_norm": 0.0296630859375, + "learning_rate": 0.029247567505771517, + "loss": 0.8029, + "num_input_tokens_seen": 14897424, + "step": 25665 + }, + { + "epoch": 3.8233541852844803, + "grad_norm": 0.021728515625, + "learning_rate": 0.029246957645429936, + "loss": 0.8039, + "num_input_tokens_seen": 14900240, + "step": 25670 + }, + { + "epoch": 3.824098897825439, + "grad_norm": 0.025390625, + "learning_rate": 0.029246347544400492, + "loss": 0.7883, + "num_input_tokens_seen": 14903248, + "step": 25675 + }, + { + "epoch": 3.8248436103663987, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02924573720269348, + "loss": 0.8038, + "num_input_tokens_seen": 14906256, + "step": 25680 + }, + { + "epoch": 3.8255883229073575, + "grad_norm": 0.0205078125, + "learning_rate": 0.029245126620319223, + "loss": 0.8043, + "num_input_tokens_seen": 14909264, + "step": 25685 + }, + { + "epoch": 3.826333035448317, + "grad_norm": 0.0123291015625, + "learning_rate": 0.029244515797288023, + "loss": 0.8054, + "num_input_tokens_seen": 14911888, + "step": 25690 + }, + { + "epoch": 3.827077747989276, + "grad_norm": 0.02294921875, + "learning_rate": 0.029243904733610212, + "loss": 0.8136, + "num_input_tokens_seen": 14914672, + "step": 25695 + }, + { + "epoch": 3.8278224605302356, + "grad_norm": 0.0252685546875, + "learning_rate": 0.029243293429296102, + "loss": 0.8124, + "num_input_tokens_seen": 14917808, + "step": 25700 + }, + { + "epoch": 3.8285671730711943, + "grad_norm": 0.01318359375, + "learning_rate": 0.02924268188435603, + "loss": 0.7775, + "num_input_tokens_seen": 14920848, + "step": 25705 + }, + { + "epoch": 3.829311885612154, + "grad_norm": 0.01177978515625, + "learning_rate": 0.02924207009880032, + "loss": 0.7765, + "num_input_tokens_seen": 14923664, + "step": 25710 + }, + { + "epoch": 3.8300565981531127, + "grad_norm": 0.0257568359375, + "learning_rate": 0.029241458072639316, + "loss": 0.8032, + "num_input_tokens_seen": 14926320, + "step": 25715 + }, + { + "epoch": 3.830801310694072, + "grad_norm": 0.0322265625, + "learning_rate": 0.029240845805883348, + "loss": 0.7922, + "num_input_tokens_seen": 14929328, + "step": 25720 + }, + { + "epoch": 3.831546023235031, + "grad_norm": 0.02783203125, + "learning_rate": 0.02924023329854276, + "loss": 0.7944, + "num_input_tokens_seen": 14932304, + "step": 25725 + }, + { + "epoch": 3.8322907357759903, + "grad_norm": 0.021240234375, + "learning_rate": 0.02923962055062791, + "loss": 0.8118, + "num_input_tokens_seen": 14935216, + "step": 25730 + }, + { + "epoch": 3.8330354483169495, + "grad_norm": 0.015625, + "learning_rate": 0.029239007562149144, + "loss": 0.7952, + "num_input_tokens_seen": 14938256, + "step": 25735 + }, + { + "epoch": 3.8337801608579087, + "grad_norm": 0.0242919921875, + "learning_rate": 0.02923839433311681, + "loss": 0.7912, + "num_input_tokens_seen": 14941296, + "step": 25740 + }, + { + "epoch": 3.834524873398868, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02923778086354128, + "loss": 0.7825, + "num_input_tokens_seen": 14944080, + "step": 25745 + }, + { + "epoch": 3.835269585939827, + "grad_norm": 0.0118408203125, + "learning_rate": 0.02923716715343291, + "loss": 0.816, + "num_input_tokens_seen": 14947056, + "step": 25750 + }, + { + "epoch": 3.8360142984807863, + "grad_norm": 0.0341796875, + "learning_rate": 0.029236553202802076, + "loss": 0.7899, + "num_input_tokens_seen": 14949968, + "step": 25755 + }, + { + "epoch": 3.8367590110217455, + "grad_norm": 0.01190185546875, + "learning_rate": 0.02923593901165914, + "loss": 0.8019, + "num_input_tokens_seen": 14952752, + "step": 25760 + }, + { + "epoch": 3.8375037235627047, + "grad_norm": 0.0264892578125, + "learning_rate": 0.029235324580014488, + "loss": 0.8001, + "num_input_tokens_seen": 14955920, + "step": 25765 + }, + { + "epoch": 3.838248436103664, + "grad_norm": 0.022216796875, + "learning_rate": 0.029234709907878492, + "loss": 0.799, + "num_input_tokens_seen": 14958832, + "step": 25770 + }, + { + "epoch": 3.838993148644623, + "grad_norm": 0.0303955078125, + "learning_rate": 0.029234094995261542, + "loss": 0.793, + "num_input_tokens_seen": 14961968, + "step": 25775 + }, + { + "epoch": 3.8397378611855824, + "grad_norm": 0.019287109375, + "learning_rate": 0.029233479842174018, + "loss": 0.8162, + "num_input_tokens_seen": 14964592, + "step": 25780 + }, + { + "epoch": 3.8404825737265416, + "grad_norm": 0.032470703125, + "learning_rate": 0.029232864448626324, + "loss": 0.8099, + "num_input_tokens_seen": 14967376, + "step": 25785 + }, + { + "epoch": 3.8412272862675008, + "grad_norm": 0.0130615234375, + "learning_rate": 0.02923224881462885, + "loss": 0.7792, + "num_input_tokens_seen": 14970352, + "step": 25790 + }, + { + "epoch": 3.84197199880846, + "grad_norm": 0.033935546875, + "learning_rate": 0.029231632940191994, + "loss": 0.8158, + "num_input_tokens_seen": 14972880, + "step": 25795 + }, + { + "epoch": 3.842716711349419, + "grad_norm": 0.0208740234375, + "learning_rate": 0.02923101682532617, + "loss": 0.8037, + "num_input_tokens_seen": 14975632, + "step": 25800 + }, + { + "epoch": 3.8434614238903784, + "grad_norm": 0.0255126953125, + "learning_rate": 0.029230400470041776, + "loss": 0.7762, + "num_input_tokens_seen": 14978864, + "step": 25805 + }, + { + "epoch": 3.8442061364313376, + "grad_norm": 0.03515625, + "learning_rate": 0.029229783874349227, + "loss": 0.8124, + "num_input_tokens_seen": 14981584, + "step": 25810 + }, + { + "epoch": 3.8449508489722968, + "grad_norm": 0.0245361328125, + "learning_rate": 0.02922916703825895, + "loss": 0.8326, + "num_input_tokens_seen": 14984240, + "step": 25815 + }, + { + "epoch": 3.845695561513256, + "grad_norm": 0.0242919921875, + "learning_rate": 0.029228549961781352, + "loss": 0.7808, + "num_input_tokens_seen": 14987248, + "step": 25820 + }, + { + "epoch": 3.846440274054215, + "grad_norm": 0.034423828125, + "learning_rate": 0.029227932644926864, + "loss": 0.8073, + "num_input_tokens_seen": 14990256, + "step": 25825 + }, + { + "epoch": 3.8471849865951744, + "grad_norm": 0.023193359375, + "learning_rate": 0.029227315087705916, + "loss": 0.8081, + "num_input_tokens_seen": 14993264, + "step": 25830 + }, + { + "epoch": 3.8479296991361336, + "grad_norm": 0.035400390625, + "learning_rate": 0.029226697290128938, + "loss": 0.7901, + "num_input_tokens_seen": 14995952, + "step": 25835 + }, + { + "epoch": 3.848674411677093, + "grad_norm": 0.027099609375, + "learning_rate": 0.02922607925220637, + "loss": 0.8105, + "num_input_tokens_seen": 14999216, + "step": 25840 + }, + { + "epoch": 3.849419124218052, + "grad_norm": 0.011962890625, + "learning_rate": 0.02922546097394865, + "loss": 0.7878, + "num_input_tokens_seen": 15002032, + "step": 25845 + }, + { + "epoch": 3.8501638367590107, + "grad_norm": 0.02294921875, + "learning_rate": 0.029224842455366225, + "loss": 0.8221, + "num_input_tokens_seen": 15004816, + "step": 25850 + }, + { + "epoch": 3.8509085492999704, + "grad_norm": 0.019775390625, + "learning_rate": 0.029224223696469546, + "loss": 0.8173, + "num_input_tokens_seen": 15007856, + "step": 25855 + }, + { + "epoch": 3.851653261840929, + "grad_norm": 0.0206298828125, + "learning_rate": 0.029223604697269062, + "loss": 0.8188, + "num_input_tokens_seen": 15010736, + "step": 25860 + }, + { + "epoch": 3.852397974381889, + "grad_norm": 0.031982421875, + "learning_rate": 0.029222985457775238, + "loss": 0.7944, + "num_input_tokens_seen": 15013680, + "step": 25865 + }, + { + "epoch": 3.8531426869228476, + "grad_norm": 0.0185546875, + "learning_rate": 0.029222365977998526, + "loss": 0.8102, + "num_input_tokens_seen": 15016528, + "step": 25870 + }, + { + "epoch": 3.853887399463807, + "grad_norm": 0.0269775390625, + "learning_rate": 0.029221746257949398, + "loss": 0.8049, + "num_input_tokens_seen": 15019728, + "step": 25875 + }, + { + "epoch": 3.854632112004766, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02922112629763832, + "loss": 0.8159, + "num_input_tokens_seen": 15022608, + "step": 25880 + }, + { + "epoch": 3.8553768245457256, + "grad_norm": 0.0228271484375, + "learning_rate": 0.029220506097075764, + "loss": 0.786, + "num_input_tokens_seen": 15025584, + "step": 25885 + }, + { + "epoch": 3.8561215370866844, + "grad_norm": 0.0205078125, + "learning_rate": 0.029219885656272215, + "loss": 0.8025, + "num_input_tokens_seen": 15028560, + "step": 25890 + }, + { + "epoch": 3.8568662496276436, + "grad_norm": 0.018798828125, + "learning_rate": 0.02921926497523815, + "loss": 0.8081, + "num_input_tokens_seen": 15031376, + "step": 25895 + }, + { + "epoch": 3.8576109621686028, + "grad_norm": 0.012939453125, + "learning_rate": 0.029218644053984052, + "loss": 0.8001, + "num_input_tokens_seen": 15034448, + "step": 25900 + }, + { + "epoch": 3.858355674709562, + "grad_norm": 0.0145263671875, + "learning_rate": 0.02921802289252042, + "loss": 0.8046, + "num_input_tokens_seen": 15037424, + "step": 25905 + }, + { + "epoch": 3.859100387250521, + "grad_norm": 0.0299072265625, + "learning_rate": 0.029217401490857733, + "loss": 0.8126, + "num_input_tokens_seen": 15040368, + "step": 25910 + }, + { + "epoch": 3.8598450997914804, + "grad_norm": 0.0216064453125, + "learning_rate": 0.029216779849006502, + "loss": 0.7754, + "num_input_tokens_seen": 15042864, + "step": 25915 + }, + { + "epoch": 3.8605898123324396, + "grad_norm": 0.0133056640625, + "learning_rate": 0.029216157966977224, + "loss": 0.7956, + "num_input_tokens_seen": 15045584, + "step": 25920 + }, + { + "epoch": 3.861334524873399, + "grad_norm": 0.0224609375, + "learning_rate": 0.029215535844780406, + "loss": 0.7944, + "num_input_tokens_seen": 15048304, + "step": 25925 + }, + { + "epoch": 3.862079237414358, + "grad_norm": 0.01953125, + "learning_rate": 0.02921491348242656, + "loss": 0.7852, + "num_input_tokens_seen": 15051504, + "step": 25930 + }, + { + "epoch": 3.862823949955317, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0292142908799262, + "loss": 0.8225, + "num_input_tokens_seen": 15054384, + "step": 25935 + }, + { + "epoch": 3.8635686624962764, + "grad_norm": 0.02392578125, + "learning_rate": 0.02921366803728984, + "loss": 0.7874, + "num_input_tokens_seen": 15057488, + "step": 25940 + }, + { + "epoch": 3.8643133750372356, + "grad_norm": 0.0216064453125, + "learning_rate": 0.029213044954528, + "loss": 0.8093, + "num_input_tokens_seen": 15060240, + "step": 25945 + }, + { + "epoch": 3.865058087578195, + "grad_norm": 0.03173828125, + "learning_rate": 0.029212421631651216, + "loss": 0.7988, + "num_input_tokens_seen": 15062992, + "step": 25950 + }, + { + "epoch": 3.865802800119154, + "grad_norm": 0.02099609375, + "learning_rate": 0.02921179806867001, + "loss": 0.8112, + "num_input_tokens_seen": 15065808, + "step": 25955 + }, + { + "epoch": 3.866547512660113, + "grad_norm": 0.0146484375, + "learning_rate": 0.029211174265594924, + "loss": 0.7961, + "num_input_tokens_seen": 15068816, + "step": 25960 + }, + { + "epoch": 3.8672922252010724, + "grad_norm": 0.0303955078125, + "learning_rate": 0.02921055022243649, + "loss": 0.785, + "num_input_tokens_seen": 15071696, + "step": 25965 + }, + { + "epoch": 3.8680369377420316, + "grad_norm": 0.021728515625, + "learning_rate": 0.029209925939205256, + "loss": 0.7991, + "num_input_tokens_seen": 15074512, + "step": 25970 + }, + { + "epoch": 3.868781650282991, + "grad_norm": 0.030029296875, + "learning_rate": 0.029209301415911763, + "loss": 0.8042, + "num_input_tokens_seen": 15077680, + "step": 25975 + }, + { + "epoch": 3.86952636282395, + "grad_norm": 0.0302734375, + "learning_rate": 0.02920867665256656, + "loss": 0.7814, + "num_input_tokens_seen": 15080464, + "step": 25980 + }, + { + "epoch": 3.870271075364909, + "grad_norm": 0.03369140625, + "learning_rate": 0.029208051649180212, + "loss": 0.817, + "num_input_tokens_seen": 15083248, + "step": 25985 + }, + { + "epoch": 3.8710157879058684, + "grad_norm": 0.015625, + "learning_rate": 0.029207426405763272, + "loss": 0.7972, + "num_input_tokens_seen": 15086192, + "step": 25990 + }, + { + "epoch": 3.8717605004468276, + "grad_norm": 0.020263671875, + "learning_rate": 0.0292068009223263, + "loss": 0.7858, + "num_input_tokens_seen": 15088944, + "step": 25995 + }, + { + "epoch": 3.872505212987787, + "grad_norm": 0.01220703125, + "learning_rate": 0.029206175198879865, + "loss": 0.8538, + "num_input_tokens_seen": 15091824, + "step": 26000 + }, + { + "epoch": 3.873249925528746, + "grad_norm": 0.0267333984375, + "learning_rate": 0.02920554923543454, + "loss": 0.8288, + "num_input_tokens_seen": 15094992, + "step": 26005 + }, + { + "epoch": 3.8739946380697052, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0292049230320009, + "loss": 0.8337, + "num_input_tokens_seen": 15097904, + "step": 26010 + }, + { + "epoch": 3.8747393506106644, + "grad_norm": 0.0201416015625, + "learning_rate": 0.029204296588589522, + "loss": 0.8078, + "num_input_tokens_seen": 15100848, + "step": 26015 + }, + { + "epoch": 3.8754840631516236, + "grad_norm": 0.01129150390625, + "learning_rate": 0.029203669905210984, + "loss": 0.7955, + "num_input_tokens_seen": 15103760, + "step": 26020 + }, + { + "epoch": 3.8762287756925824, + "grad_norm": 0.05322265625, + "learning_rate": 0.029203042981875885, + "loss": 0.8211, + "num_input_tokens_seen": 15106448, + "step": 26025 + }, + { + "epoch": 3.876973488233542, + "grad_norm": 0.03173828125, + "learning_rate": 0.02920241581859481, + "loss": 0.7784, + "num_input_tokens_seen": 15109360, + "step": 26030 + }, + { + "epoch": 3.877718200774501, + "grad_norm": 0.020751953125, + "learning_rate": 0.02920178841537835, + "loss": 0.807, + "num_input_tokens_seen": 15112624, + "step": 26035 + }, + { + "epoch": 3.8784629133154604, + "grad_norm": 0.0191650390625, + "learning_rate": 0.029201160772237115, + "loss": 0.7832, + "num_input_tokens_seen": 15115280, + "step": 26040 + }, + { + "epoch": 3.879207625856419, + "grad_norm": 0.031494140625, + "learning_rate": 0.0292005328891817, + "loss": 0.8454, + "num_input_tokens_seen": 15118000, + "step": 26045 + }, + { + "epoch": 3.879952338397379, + "grad_norm": 0.01953125, + "learning_rate": 0.02919990476622271, + "loss": 0.8052, + "num_input_tokens_seen": 15120656, + "step": 26050 + }, + { + "epoch": 3.8806970509383376, + "grad_norm": 0.0242919921875, + "learning_rate": 0.029199276403370764, + "loss": 0.817, + "num_input_tokens_seen": 15123728, + "step": 26055 + }, + { + "epoch": 3.8814417634792973, + "grad_norm": 0.012939453125, + "learning_rate": 0.029198647800636482, + "loss": 0.8173, + "num_input_tokens_seen": 15126480, + "step": 26060 + }, + { + "epoch": 3.882186476020256, + "grad_norm": 0.02734375, + "learning_rate": 0.029198018958030467, + "loss": 0.8087, + "num_input_tokens_seen": 15129136, + "step": 26065 + }, + { + "epoch": 3.882931188561215, + "grad_norm": 0.0230712890625, + "learning_rate": 0.029197389875563355, + "loss": 0.811, + "num_input_tokens_seen": 15132208, + "step": 26070 + }, + { + "epoch": 3.8836759011021744, + "grad_norm": 0.018798828125, + "learning_rate": 0.02919676055324577, + "loss": 0.7978, + "num_input_tokens_seen": 15135056, + "step": 26075 + }, + { + "epoch": 3.8844206136431336, + "grad_norm": 0.01318359375, + "learning_rate": 0.029196130991088344, + "loss": 0.8051, + "num_input_tokens_seen": 15137904, + "step": 26080 + }, + { + "epoch": 3.885165326184093, + "grad_norm": 0.02294921875, + "learning_rate": 0.029195501189101716, + "loss": 0.8144, + "num_input_tokens_seen": 15140656, + "step": 26085 + }, + { + "epoch": 3.885910038725052, + "grad_norm": 0.0245361328125, + "learning_rate": 0.02919487114729652, + "loss": 0.8014, + "num_input_tokens_seen": 15143600, + "step": 26090 + }, + { + "epoch": 3.8866547512660112, + "grad_norm": 0.0146484375, + "learning_rate": 0.029194240865683403, + "loss": 0.8045, + "num_input_tokens_seen": 15146768, + "step": 26095 + }, + { + "epoch": 3.8873994638069704, + "grad_norm": 0.01519775390625, + "learning_rate": 0.029193610344273015, + "loss": 0.8152, + "num_input_tokens_seen": 15149776, + "step": 26100 + }, + { + "epoch": 3.8881441763479296, + "grad_norm": 0.02880859375, + "learning_rate": 0.029192979583076, + "loss": 0.8039, + "num_input_tokens_seen": 15152592, + "step": 26105 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.0135498046875, + "learning_rate": 0.02919234858210303, + "loss": 0.8028, + "num_input_tokens_seen": 15155216, + "step": 26110 + }, + { + "epoch": 3.889633601429848, + "grad_norm": 0.039794921875, + "learning_rate": 0.02919171734136475, + "loss": 0.8028, + "num_input_tokens_seen": 15158352, + "step": 26115 + }, + { + "epoch": 3.8903783139708072, + "grad_norm": 0.0125732421875, + "learning_rate": 0.029191085860871827, + "loss": 0.806, + "num_input_tokens_seen": 15161328, + "step": 26120 + }, + { + "epoch": 3.8911230265117664, + "grad_norm": 0.024658203125, + "learning_rate": 0.029190454140634932, + "loss": 0.8026, + "num_input_tokens_seen": 15164208, + "step": 26125 + }, + { + "epoch": 3.8918677390527256, + "grad_norm": 0.020751953125, + "learning_rate": 0.02918982218066474, + "loss": 0.8049, + "num_input_tokens_seen": 15166992, + "step": 26130 + }, + { + "epoch": 3.892612451593685, + "grad_norm": 0.019287109375, + "learning_rate": 0.029189189980971923, + "loss": 0.8055, + "num_input_tokens_seen": 15171056, + "step": 26135 + }, + { + "epoch": 3.893357164134644, + "grad_norm": 0.020751953125, + "learning_rate": 0.029188557541567163, + "loss": 0.8165, + "num_input_tokens_seen": 15173840, + "step": 26140 + }, + { + "epoch": 3.8941018766756033, + "grad_norm": 0.0262451171875, + "learning_rate": 0.029187924862461145, + "loss": 0.8029, + "num_input_tokens_seen": 15176752, + "step": 26145 + }, + { + "epoch": 3.8948465892165625, + "grad_norm": 0.0201416015625, + "learning_rate": 0.029187291943664553, + "loss": 0.8074, + "num_input_tokens_seen": 15179440, + "step": 26150 + }, + { + "epoch": 3.8955913017575217, + "grad_norm": 0.01287841796875, + "learning_rate": 0.029186658785188085, + "loss": 0.8102, + "num_input_tokens_seen": 15182384, + "step": 26155 + }, + { + "epoch": 3.896336014298481, + "grad_norm": 0.02099609375, + "learning_rate": 0.029186025387042437, + "loss": 0.8067, + "num_input_tokens_seen": 15185200, + "step": 26160 + }, + { + "epoch": 3.89708072683944, + "grad_norm": 0.0203857421875, + "learning_rate": 0.029185391749238305, + "loss": 0.798, + "num_input_tokens_seen": 15188368, + "step": 26165 + }, + { + "epoch": 3.8978254393803993, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0291847578717864, + "loss": 0.8101, + "num_input_tokens_seen": 15191152, + "step": 26170 + }, + { + "epoch": 3.8985701519213585, + "grad_norm": 0.02001953125, + "learning_rate": 0.029184123754697425, + "loss": 0.807, + "num_input_tokens_seen": 15194096, + "step": 26175 + }, + { + "epoch": 3.8993148644623177, + "grad_norm": 0.024658203125, + "learning_rate": 0.029183489397982095, + "loss": 0.8164, + "num_input_tokens_seen": 15196976, + "step": 26180 + }, + { + "epoch": 3.900059577003277, + "grad_norm": 0.0208740234375, + "learning_rate": 0.029182854801651132, + "loss": 0.8037, + "num_input_tokens_seen": 15199536, + "step": 26185 + }, + { + "epoch": 3.900804289544236, + "grad_norm": 0.0118408203125, + "learning_rate": 0.029182219965715246, + "loss": 0.7937, + "num_input_tokens_seen": 15202288, + "step": 26190 + }, + { + "epoch": 3.9015490020851953, + "grad_norm": 0.02197265625, + "learning_rate": 0.02918158489018517, + "loss": 0.7961, + "num_input_tokens_seen": 15205424, + "step": 26195 + }, + { + "epoch": 3.902293714626154, + "grad_norm": 0.01397705078125, + "learning_rate": 0.02918094957507163, + "loss": 0.8142, + "num_input_tokens_seen": 15208272, + "step": 26200 + }, + { + "epoch": 3.9030384271671137, + "grad_norm": 0.01141357421875, + "learning_rate": 0.02918031402038536, + "loss": 0.8089, + "num_input_tokens_seen": 15211120, + "step": 26205 + }, + { + "epoch": 3.9037831397080724, + "grad_norm": 0.0218505859375, + "learning_rate": 0.029179678226137096, + "loss": 0.7968, + "num_input_tokens_seen": 15213936, + "step": 26210 + }, + { + "epoch": 3.904527852249032, + "grad_norm": 0.035400390625, + "learning_rate": 0.029179042192337582, + "loss": 0.7959, + "num_input_tokens_seen": 15216560, + "step": 26215 + }, + { + "epoch": 3.905272564789991, + "grad_norm": 0.039794921875, + "learning_rate": 0.02917840591899756, + "loss": 0.8108, + "num_input_tokens_seen": 15219472, + "step": 26220 + }, + { + "epoch": 3.9060172773309505, + "grad_norm": 0.01324462890625, + "learning_rate": 0.02917776940612778, + "loss": 0.8211, + "num_input_tokens_seen": 15222832, + "step": 26225 + }, + { + "epoch": 3.9067619898719093, + "grad_norm": 0.0247802734375, + "learning_rate": 0.029177132653738995, + "loss": 0.7988, + "num_input_tokens_seen": 15225808, + "step": 26230 + }, + { + "epoch": 3.907506702412869, + "grad_norm": 0.01904296875, + "learning_rate": 0.029176495661841967, + "loss": 0.8016, + "num_input_tokens_seen": 15228688, + "step": 26235 + }, + { + "epoch": 3.9082514149538277, + "grad_norm": 0.030517578125, + "learning_rate": 0.029175858430447447, + "loss": 0.8128, + "num_input_tokens_seen": 15231888, + "step": 26240 + }, + { + "epoch": 3.908996127494787, + "grad_norm": 0.02099609375, + "learning_rate": 0.029175220959566213, + "loss": 0.7973, + "num_input_tokens_seen": 15234960, + "step": 26245 + }, + { + "epoch": 3.909740840035746, + "grad_norm": 0.0201416015625, + "learning_rate": 0.029174583249209022, + "loss": 0.7831, + "num_input_tokens_seen": 15237744, + "step": 26250 + }, + { + "epoch": 3.9104855525767053, + "grad_norm": 0.03564453125, + "learning_rate": 0.029173945299386658, + "loss": 0.7847, + "num_input_tokens_seen": 15240624, + "step": 26255 + }, + { + "epoch": 3.9112302651176645, + "grad_norm": 0.02880859375, + "learning_rate": 0.029173307110109892, + "loss": 0.7968, + "num_input_tokens_seen": 15243536, + "step": 26260 + }, + { + "epoch": 3.9119749776586237, + "grad_norm": 0.024658203125, + "learning_rate": 0.029172668681389506, + "loss": 0.7816, + "num_input_tokens_seen": 15246640, + "step": 26265 + }, + { + "epoch": 3.912719690199583, + "grad_norm": 0.031982421875, + "learning_rate": 0.02917203001323629, + "loss": 0.8215, + "num_input_tokens_seen": 15249456, + "step": 26270 + }, + { + "epoch": 3.913464402740542, + "grad_norm": 0.02490234375, + "learning_rate": 0.029171391105661026, + "loss": 0.7878, + "num_input_tokens_seen": 15252592, + "step": 26275 + }, + { + "epoch": 3.9142091152815013, + "grad_norm": 0.01507568359375, + "learning_rate": 0.029170751958674516, + "loss": 0.8124, + "num_input_tokens_seen": 15255728, + "step": 26280 + }, + { + "epoch": 3.9149538278224605, + "grad_norm": 0.0234375, + "learning_rate": 0.029170112572287556, + "loss": 0.7908, + "num_input_tokens_seen": 15258384, + "step": 26285 + }, + { + "epoch": 3.9156985403634197, + "grad_norm": 0.029541015625, + "learning_rate": 0.029169472946510944, + "loss": 0.8142, + "num_input_tokens_seen": 15261104, + "step": 26290 + }, + { + "epoch": 3.916443252904379, + "grad_norm": 0.0201416015625, + "learning_rate": 0.029168833081355486, + "loss": 0.8156, + "num_input_tokens_seen": 15263824, + "step": 26295 + }, + { + "epoch": 3.917187965445338, + "grad_norm": 0.0152587890625, + "learning_rate": 0.029168192976832, + "loss": 0.7999, + "num_input_tokens_seen": 15266640, + "step": 26300 + }, + { + "epoch": 3.9179326779862973, + "grad_norm": 0.021240234375, + "learning_rate": 0.029167552632951285, + "loss": 0.7907, + "num_input_tokens_seen": 15269552, + "step": 26305 + }, + { + "epoch": 3.9186773905272565, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02916691204972417, + "loss": 0.8236, + "num_input_tokens_seen": 15272560, + "step": 26310 + }, + { + "epoch": 3.9194221030682157, + "grad_norm": 0.0205078125, + "learning_rate": 0.029166271227161474, + "loss": 0.7908, + "num_input_tokens_seen": 15275312, + "step": 26315 + }, + { + "epoch": 3.920166815609175, + "grad_norm": 0.04345703125, + "learning_rate": 0.02916563016527403, + "loss": 0.8214, + "num_input_tokens_seen": 15278256, + "step": 26320 + }, + { + "epoch": 3.920911528150134, + "grad_norm": 0.0322265625, + "learning_rate": 0.029164988864072657, + "loss": 0.7996, + "num_input_tokens_seen": 15280944, + "step": 26325 + }, + { + "epoch": 3.9216562406910933, + "grad_norm": 0.021484375, + "learning_rate": 0.02916434732356819, + "loss": 0.8003, + "num_input_tokens_seen": 15283568, + "step": 26330 + }, + { + "epoch": 3.9224009532320525, + "grad_norm": 0.032958984375, + "learning_rate": 0.029163705543771477, + "loss": 0.827, + "num_input_tokens_seen": 15286320, + "step": 26335 + }, + { + "epoch": 3.9231456657730117, + "grad_norm": 0.01904296875, + "learning_rate": 0.02916306352469335, + "loss": 0.8109, + "num_input_tokens_seen": 15288976, + "step": 26340 + }, + { + "epoch": 3.923890378313971, + "grad_norm": 0.0203857421875, + "learning_rate": 0.029162421266344662, + "loss": 0.8028, + "num_input_tokens_seen": 15291728, + "step": 26345 + }, + { + "epoch": 3.92463509085493, + "grad_norm": 0.019775390625, + "learning_rate": 0.02916177876873626, + "loss": 0.8116, + "num_input_tokens_seen": 15294416, + "step": 26350 + }, + { + "epoch": 3.9253798033958893, + "grad_norm": 0.020751953125, + "learning_rate": 0.029161136031879, + "loss": 0.8114, + "num_input_tokens_seen": 15297264, + "step": 26355 + }, + { + "epoch": 3.9261245159368485, + "grad_norm": 0.0245361328125, + "learning_rate": 0.029160493055783738, + "loss": 0.7871, + "num_input_tokens_seen": 15300368, + "step": 26360 + }, + { + "epoch": 3.9268692284778077, + "grad_norm": 0.025390625, + "learning_rate": 0.02915984984046134, + "loss": 0.8143, + "num_input_tokens_seen": 15302960, + "step": 26365 + }, + { + "epoch": 3.927613941018767, + "grad_norm": 0.01904296875, + "learning_rate": 0.02915920638592267, + "loss": 0.802, + "num_input_tokens_seen": 15305648, + "step": 26370 + }, + { + "epoch": 3.9283586535597257, + "grad_norm": 0.0120849609375, + "learning_rate": 0.029158562692178598, + "loss": 0.8102, + "num_input_tokens_seen": 15308624, + "step": 26375 + }, + { + "epoch": 3.9291033661006853, + "grad_norm": 0.020263671875, + "learning_rate": 0.02915791875924, + "loss": 0.818, + "num_input_tokens_seen": 15311440, + "step": 26380 + }, + { + "epoch": 3.929848078641644, + "grad_norm": 0.0185546875, + "learning_rate": 0.029157274587117747, + "loss": 0.8017, + "num_input_tokens_seen": 15314224, + "step": 26385 + }, + { + "epoch": 3.9305927911826037, + "grad_norm": 0.0242919921875, + "learning_rate": 0.029156630175822738, + "loss": 0.812, + "num_input_tokens_seen": 15317296, + "step": 26390 + }, + { + "epoch": 3.9313375037235625, + "grad_norm": 0.01806640625, + "learning_rate": 0.029155985525365847, + "loss": 0.8199, + "num_input_tokens_seen": 15320080, + "step": 26395 + }, + { + "epoch": 3.932082216264522, + "grad_norm": 0.0311279296875, + "learning_rate": 0.02915534063575797, + "loss": 0.7971, + "num_input_tokens_seen": 15322768, + "step": 26400 + }, + { + "epoch": 3.932826928805481, + "grad_norm": 0.020263671875, + "learning_rate": 0.029154695507009998, + "loss": 0.8171, + "num_input_tokens_seen": 15325776, + "step": 26405 + }, + { + "epoch": 3.9335716413464406, + "grad_norm": 0.0225830078125, + "learning_rate": 0.02915405013913283, + "loss": 0.8094, + "num_input_tokens_seen": 15328720, + "step": 26410 + }, + { + "epoch": 3.9343163538873993, + "grad_norm": 0.021484375, + "learning_rate": 0.029153404532137373, + "loss": 0.8035, + "num_input_tokens_seen": 15331664, + "step": 26415 + }, + { + "epoch": 3.9350610664283585, + "grad_norm": 0.0113525390625, + "learning_rate": 0.02915275868603453, + "loss": 0.798, + "num_input_tokens_seen": 15334928, + "step": 26420 + }, + { + "epoch": 3.9358057789693177, + "grad_norm": 0.03076171875, + "learning_rate": 0.029152112600835216, + "loss": 0.8124, + "num_input_tokens_seen": 15337680, + "step": 26425 + }, + { + "epoch": 3.936550491510277, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02915146627655034, + "loss": 0.8035, + "num_input_tokens_seen": 15340592, + "step": 26430 + }, + { + "epoch": 3.937295204051236, + "grad_norm": 0.019287109375, + "learning_rate": 0.029150819713190824, + "loss": 0.8167, + "num_input_tokens_seen": 15343920, + "step": 26435 + }, + { + "epoch": 3.9380399165921953, + "grad_norm": 0.01202392578125, + "learning_rate": 0.029150172910767592, + "loss": 0.8042, + "num_input_tokens_seen": 15346544, + "step": 26440 + }, + { + "epoch": 3.9387846291331545, + "grad_norm": 0.0125732421875, + "learning_rate": 0.029149525869291572, + "loss": 0.8028, + "num_input_tokens_seen": 15349520, + "step": 26445 + }, + { + "epoch": 3.9395293416741137, + "grad_norm": 0.0283203125, + "learning_rate": 0.029148878588773693, + "loss": 0.8003, + "num_input_tokens_seen": 15352496, + "step": 26450 + }, + { + "epoch": 3.940274054215073, + "grad_norm": 0.0361328125, + "learning_rate": 0.029148231069224886, + "loss": 0.8091, + "num_input_tokens_seen": 15355088, + "step": 26455 + }, + { + "epoch": 3.941018766756032, + "grad_norm": 0.022705078125, + "learning_rate": 0.0291475833106561, + "loss": 0.8003, + "num_input_tokens_seen": 15357936, + "step": 26460 + }, + { + "epoch": 3.9417634792969913, + "grad_norm": 0.0308837890625, + "learning_rate": 0.029146935313078274, + "loss": 0.8039, + "num_input_tokens_seen": 15360592, + "step": 26465 + }, + { + "epoch": 3.9425081918379505, + "grad_norm": 0.022705078125, + "learning_rate": 0.02914628707650235, + "loss": 0.8047, + "num_input_tokens_seen": 15363664, + "step": 26470 + }, + { + "epoch": 3.9432529043789097, + "grad_norm": 0.02880859375, + "learning_rate": 0.029145638600939282, + "loss": 0.8071, + "num_input_tokens_seen": 15366608, + "step": 26475 + }, + { + "epoch": 3.943997616919869, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02914498988640003, + "loss": 0.796, + "num_input_tokens_seen": 15369744, + "step": 26480 + }, + { + "epoch": 3.944742329460828, + "grad_norm": 0.0291748046875, + "learning_rate": 0.029144340932895547, + "loss": 0.8091, + "num_input_tokens_seen": 15372624, + "step": 26485 + }, + { + "epoch": 3.9454870420017873, + "grad_norm": 0.0220947265625, + "learning_rate": 0.029143691740436805, + "loss": 0.8066, + "num_input_tokens_seen": 15375440, + "step": 26490 + }, + { + "epoch": 3.9462317545427466, + "grad_norm": 0.0123291015625, + "learning_rate": 0.029143042309034764, + "loss": 0.8016, + "num_input_tokens_seen": 15378128, + "step": 26495 + }, + { + "epoch": 3.9469764670837058, + "grad_norm": 0.029541015625, + "learning_rate": 0.0291423926387004, + "loss": 0.8068, + "num_input_tokens_seen": 15380848, + "step": 26500 + }, + { + "epoch": 3.947721179624665, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02914174272944468, + "loss": 0.8068, + "num_input_tokens_seen": 15383600, + "step": 26505 + }, + { + "epoch": 3.948465892165624, + "grad_norm": 0.01202392578125, + "learning_rate": 0.029141092581278595, + "loss": 0.8038, + "num_input_tokens_seen": 15386288, + "step": 26510 + }, + { + "epoch": 3.9492106047065834, + "grad_norm": 0.0205078125, + "learning_rate": 0.02914044219421312, + "loss": 0.8074, + "num_input_tokens_seen": 15389040, + "step": 26515 + }, + { + "epoch": 3.9499553172475426, + "grad_norm": 0.032470703125, + "learning_rate": 0.029139791568259247, + "loss": 0.8117, + "num_input_tokens_seen": 15392080, + "step": 26520 + }, + { + "epoch": 3.9507000297885018, + "grad_norm": 0.0213623046875, + "learning_rate": 0.029139140703427963, + "loss": 0.8063, + "num_input_tokens_seen": 15394768, + "step": 26525 + }, + { + "epoch": 3.951444742329461, + "grad_norm": 0.0211181640625, + "learning_rate": 0.029138489599730272, + "loss": 0.8007, + "num_input_tokens_seen": 15397648, + "step": 26530 + }, + { + "epoch": 3.95218945487042, + "grad_norm": 0.0196533203125, + "learning_rate": 0.029137838257177164, + "loss": 0.7971, + "num_input_tokens_seen": 15400400, + "step": 26535 + }, + { + "epoch": 3.9529341674113794, + "grad_norm": 0.01177978515625, + "learning_rate": 0.029137186675779653, + "loss": 0.806, + "num_input_tokens_seen": 15403152, + "step": 26540 + }, + { + "epoch": 3.9536788799523386, + "grad_norm": 0.0224609375, + "learning_rate": 0.02913653485554874, + "loss": 0.808, + "num_input_tokens_seen": 15405936, + "step": 26545 + }, + { + "epoch": 3.9544235924932973, + "grad_norm": 0.02197265625, + "learning_rate": 0.02913588279649544, + "loss": 0.7889, + "num_input_tokens_seen": 15408976, + "step": 26550 + }, + { + "epoch": 3.955168305034257, + "grad_norm": 0.01068115234375, + "learning_rate": 0.029135230498630763, + "loss": 0.8115, + "num_input_tokens_seen": 15411824, + "step": 26555 + }, + { + "epoch": 3.9559130175752157, + "grad_norm": 0.017822265625, + "learning_rate": 0.029134577961965735, + "loss": 0.7862, + "num_input_tokens_seen": 15414576, + "step": 26560 + }, + { + "epoch": 3.9566577301161754, + "grad_norm": 0.01251220703125, + "learning_rate": 0.029133925186511377, + "loss": 0.7835, + "num_input_tokens_seen": 15417456, + "step": 26565 + }, + { + "epoch": 3.957402442657134, + "grad_norm": 0.031982421875, + "learning_rate": 0.02913327217227872, + "loss": 0.8113, + "num_input_tokens_seen": 15420528, + "step": 26570 + }, + { + "epoch": 3.958147155198094, + "grad_norm": 0.0179443359375, + "learning_rate": 0.029132618919278794, + "loss": 0.7923, + "num_input_tokens_seen": 15423536, + "step": 26575 + }, + { + "epoch": 3.9588918677390526, + "grad_norm": 0.0238037109375, + "learning_rate": 0.029131965427522635, + "loss": 0.8176, + "num_input_tokens_seen": 15426160, + "step": 26580 + }, + { + "epoch": 3.9596365802800118, + "grad_norm": 0.01263427734375, + "learning_rate": 0.02913131169702128, + "loss": 0.8201, + "num_input_tokens_seen": 15429296, + "step": 26585 + }, + { + "epoch": 3.960381292820971, + "grad_norm": 0.03125, + "learning_rate": 0.029130657727785784, + "loss": 0.846, + "num_input_tokens_seen": 15432112, + "step": 26590 + }, + { + "epoch": 3.96112600536193, + "grad_norm": 0.0220947265625, + "learning_rate": 0.029130003519827183, + "loss": 0.8132, + "num_input_tokens_seen": 15435088, + "step": 26595 + }, + { + "epoch": 3.9618707179028894, + "grad_norm": 0.01904296875, + "learning_rate": 0.02912934907315653, + "loss": 0.8014, + "num_input_tokens_seen": 15437840, + "step": 26600 + }, + { + "epoch": 3.9626154304438486, + "grad_norm": 0.01806640625, + "learning_rate": 0.029128694387784888, + "loss": 0.8133, + "num_input_tokens_seen": 15440720, + "step": 26605 + }, + { + "epoch": 3.9633601429848078, + "grad_norm": 0.0283203125, + "learning_rate": 0.029128039463723315, + "loss": 0.8061, + "num_input_tokens_seen": 15443600, + "step": 26610 + }, + { + "epoch": 3.964104855525767, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02912738430098287, + "loss": 0.7979, + "num_input_tokens_seen": 15446448, + "step": 26615 + }, + { + "epoch": 3.964849568066726, + "grad_norm": 0.01190185546875, + "learning_rate": 0.029126728899574635, + "loss": 0.7974, + "num_input_tokens_seen": 15449072, + "step": 26620 + }, + { + "epoch": 3.9655942806076854, + "grad_norm": 0.031005859375, + "learning_rate": 0.029126073259509663, + "loss": 0.8061, + "num_input_tokens_seen": 15451856, + "step": 26625 + }, + { + "epoch": 3.9663389931486446, + "grad_norm": 0.01165771484375, + "learning_rate": 0.029125417380799046, + "loss": 0.8002, + "num_input_tokens_seen": 15454896, + "step": 26630 + }, + { + "epoch": 3.967083705689604, + "grad_norm": 0.021240234375, + "learning_rate": 0.029124761263453855, + "loss": 0.8122, + "num_input_tokens_seen": 15457808, + "step": 26635 + }, + { + "epoch": 3.967828418230563, + "grad_norm": 0.021728515625, + "learning_rate": 0.029124104907485182, + "loss": 0.8024, + "num_input_tokens_seen": 15460624, + "step": 26640 + }, + { + "epoch": 3.968573130771522, + "grad_norm": 0.019287109375, + "learning_rate": 0.02912344831290411, + "loss": 0.8049, + "num_input_tokens_seen": 15463472, + "step": 26645 + }, + { + "epoch": 3.9693178433124814, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02912279147972173, + "loss": 0.8151, + "num_input_tokens_seen": 15466320, + "step": 26650 + }, + { + "epoch": 3.9700625558534406, + "grad_norm": 0.0208740234375, + "learning_rate": 0.029122134407949148, + "loss": 0.8111, + "num_input_tokens_seen": 15469168, + "step": 26655 + }, + { + "epoch": 3.9708072683944, + "grad_norm": 0.0242919921875, + "learning_rate": 0.029121477097597454, + "loss": 0.8035, + "num_input_tokens_seen": 15471888, + "step": 26660 + }, + { + "epoch": 3.971551980935359, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02912081954867776, + "loss": 0.7973, + "num_input_tokens_seen": 15475088, + "step": 26665 + }, + { + "epoch": 3.972296693476318, + "grad_norm": 0.0255126953125, + "learning_rate": 0.02912016176120117, + "loss": 0.792, + "num_input_tokens_seen": 15478064, + "step": 26670 + }, + { + "epoch": 3.9730414060172774, + "grad_norm": 0.0189208984375, + "learning_rate": 0.029119503735178798, + "loss": 0.7968, + "num_input_tokens_seen": 15480912, + "step": 26675 + }, + { + "epoch": 3.9737861185582366, + "grad_norm": 0.018798828125, + "learning_rate": 0.029118845470621758, + "loss": 0.8213, + "num_input_tokens_seen": 15483952, + "step": 26680 + }, + { + "epoch": 3.974530831099196, + "grad_norm": 0.019775390625, + "learning_rate": 0.02911818696754118, + "loss": 0.7974, + "num_input_tokens_seen": 15487088, + "step": 26685 + }, + { + "epoch": 3.975275543640155, + "grad_norm": 0.0186767578125, + "learning_rate": 0.029117528225948174, + "loss": 0.7862, + "num_input_tokens_seen": 15489872, + "step": 26690 + }, + { + "epoch": 3.976020256181114, + "grad_norm": 0.0185546875, + "learning_rate": 0.029116869245853882, + "loss": 0.7967, + "num_input_tokens_seen": 15492784, + "step": 26695 + }, + { + "epoch": 3.9767649687220734, + "grad_norm": 0.0274658203125, + "learning_rate": 0.029116210027269433, + "loss": 0.8222, + "num_input_tokens_seen": 15495440, + "step": 26700 + }, + { + "epoch": 3.9775096812630326, + "grad_norm": 0.0211181640625, + "learning_rate": 0.029115550570205963, + "loss": 0.8123, + "num_input_tokens_seen": 15498480, + "step": 26705 + }, + { + "epoch": 3.978254393803992, + "grad_norm": 0.019287109375, + "learning_rate": 0.029114890874674608, + "loss": 0.8079, + "num_input_tokens_seen": 15501424, + "step": 26710 + }, + { + "epoch": 3.9789991063449506, + "grad_norm": 0.017578125, + "learning_rate": 0.029114230940686516, + "loss": 0.8048, + "num_input_tokens_seen": 15504304, + "step": 26715 + }, + { + "epoch": 3.9797438188859102, + "grad_norm": 0.0128173828125, + "learning_rate": 0.029113570768252845, + "loss": 0.808, + "num_input_tokens_seen": 15507440, + "step": 26720 + }, + { + "epoch": 3.980488531426869, + "grad_norm": 0.0205078125, + "learning_rate": 0.029112910357384735, + "loss": 0.7859, + "num_input_tokens_seen": 15510256, + "step": 26725 + }, + { + "epoch": 3.9812332439678286, + "grad_norm": 0.0260009765625, + "learning_rate": 0.029112249708093355, + "loss": 0.8015, + "num_input_tokens_seen": 15513168, + "step": 26730 + }, + { + "epoch": 3.9819779565087874, + "grad_norm": 0.012451171875, + "learning_rate": 0.02911158882038985, + "loss": 0.8095, + "num_input_tokens_seen": 15516368, + "step": 26735 + }, + { + "epoch": 3.982722669049747, + "grad_norm": 0.0189208984375, + "learning_rate": 0.029110927694285398, + "loss": 0.7918, + "num_input_tokens_seen": 15518928, + "step": 26740 + }, + { + "epoch": 3.983467381590706, + "grad_norm": 0.0262451171875, + "learning_rate": 0.029110266329791166, + "loss": 0.8148, + "num_input_tokens_seen": 15521808, + "step": 26745 + }, + { + "epoch": 3.9842120941316654, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02910960472691832, + "loss": 0.7949, + "num_input_tokens_seen": 15524624, + "step": 26750 + }, + { + "epoch": 3.984956806672624, + "grad_norm": 0.0201416015625, + "learning_rate": 0.029108942885678047, + "loss": 0.8009, + "num_input_tokens_seen": 15527184, + "step": 26755 + }, + { + "epoch": 3.9857015192135834, + "grad_norm": 0.0184326171875, + "learning_rate": 0.029108280806081525, + "loss": 0.827, + "num_input_tokens_seen": 15530064, + "step": 26760 + }, + { + "epoch": 3.9864462317545426, + "grad_norm": 0.0283203125, + "learning_rate": 0.029107618488139933, + "loss": 0.7846, + "num_input_tokens_seen": 15533168, + "step": 26765 + }, + { + "epoch": 3.987190944295502, + "grad_norm": 0.0247802734375, + "learning_rate": 0.02910695593186447, + "loss": 0.7893, + "num_input_tokens_seen": 15536240, + "step": 26770 + }, + { + "epoch": 3.987935656836461, + "grad_norm": 0.01153564453125, + "learning_rate": 0.02910629313726632, + "loss": 0.8029, + "num_input_tokens_seen": 15539120, + "step": 26775 + }, + { + "epoch": 3.98868036937742, + "grad_norm": 0.0211181640625, + "learning_rate": 0.02910563010435669, + "loss": 0.8082, + "num_input_tokens_seen": 15542096, + "step": 26780 + }, + { + "epoch": 3.9894250819183794, + "grad_norm": 0.0185546875, + "learning_rate": 0.02910496683314677, + "loss": 0.8049, + "num_input_tokens_seen": 15544976, + "step": 26785 + }, + { + "epoch": 3.9901697944593386, + "grad_norm": 0.01153564453125, + "learning_rate": 0.029104303323647773, + "loss": 0.7858, + "num_input_tokens_seen": 15547888, + "step": 26790 + }, + { + "epoch": 3.990914507000298, + "grad_norm": 0.017578125, + "learning_rate": 0.029103639575870907, + "loss": 0.7851, + "num_input_tokens_seen": 15550960, + "step": 26795 + }, + { + "epoch": 3.991659219541257, + "grad_norm": 0.0205078125, + "learning_rate": 0.02910297558982738, + "loss": 0.8027, + "num_input_tokens_seen": 15553552, + "step": 26800 + }, + { + "epoch": 3.9924039320822162, + "grad_norm": 0.0211181640625, + "learning_rate": 0.029102311365528422, + "loss": 0.8132, + "num_input_tokens_seen": 15556272, + "step": 26805 + }, + { + "epoch": 3.9931486446231754, + "grad_norm": 0.01458740234375, + "learning_rate": 0.029101646902985244, + "loss": 0.798, + "num_input_tokens_seen": 15559408, + "step": 26810 + }, + { + "epoch": 3.9938933571641346, + "grad_norm": 0.028564453125, + "learning_rate": 0.02910098220220907, + "loss": 0.7947, + "num_input_tokens_seen": 15562352, + "step": 26815 + }, + { + "epoch": 3.994638069705094, + "grad_norm": 0.0186767578125, + "learning_rate": 0.029100317263211137, + "loss": 0.8443, + "num_input_tokens_seen": 15565360, + "step": 26820 + }, + { + "epoch": 3.995382782246053, + "grad_norm": 0.020263671875, + "learning_rate": 0.029099652086002674, + "loss": 0.8007, + "num_input_tokens_seen": 15568176, + "step": 26825 + }, + { + "epoch": 3.9961274947870122, + "grad_norm": 0.0279541015625, + "learning_rate": 0.02909898667059492, + "loss": 0.8155, + "num_input_tokens_seen": 15571152, + "step": 26830 + }, + { + "epoch": 3.9968722073279714, + "grad_norm": 0.02392578125, + "learning_rate": 0.02909832101699911, + "loss": 0.8175, + "num_input_tokens_seen": 15574032, + "step": 26835 + }, + { + "epoch": 3.9976169198689306, + "grad_norm": 0.0179443359375, + "learning_rate": 0.029097655125226503, + "loss": 0.81, + "num_input_tokens_seen": 15577072, + "step": 26840 + }, + { + "epoch": 3.99836163240989, + "grad_norm": 0.0174560546875, + "learning_rate": 0.029096988995288335, + "loss": 0.7796, + "num_input_tokens_seen": 15579824, + "step": 26845 + }, + { + "epoch": 3.999106344950849, + "grad_norm": 0.01904296875, + "learning_rate": 0.029096322627195872, + "loss": 0.804, + "num_input_tokens_seen": 15582480, + "step": 26850 + }, + { + "epoch": 3.9998510574918082, + "grad_norm": 0.0233154296875, + "learning_rate": 0.02909565602096036, + "loss": 0.7935, + "num_input_tokens_seen": 15585744, + "step": 26855 + }, + { + "epoch": 4.0, + "eval_loss": 0.8017039895057678, + "eval_runtime": 70.7018, + "eval_samples_per_second": 42.205, + "eval_steps_per_second": 10.551, + "num_input_tokens_seen": 15585872, + "step": 26856 + }, + { + "epoch": 4.000595770032767, + "grad_norm": 0.0220947265625, + "learning_rate": 0.029094989176593068, + "loss": 0.8007, + "num_input_tokens_seen": 15588400, + "step": 26860 + }, + { + "epoch": 4.001340482573727, + "grad_norm": 0.0128173828125, + "learning_rate": 0.029094322094105257, + "loss": 0.8109, + "num_input_tokens_seen": 15591440, + "step": 26865 + }, + { + "epoch": 4.002085195114685, + "grad_norm": 0.029296875, + "learning_rate": 0.0290936547735082, + "loss": 0.7808, + "num_input_tokens_seen": 15594608, + "step": 26870 + }, + { + "epoch": 4.002829907655645, + "grad_norm": 0.0206298828125, + "learning_rate": 0.029092987214813168, + "loss": 0.7978, + "num_input_tokens_seen": 15597488, + "step": 26875 + }, + { + "epoch": 4.003574620196604, + "grad_norm": 0.022705078125, + "learning_rate": 0.029092319418031445, + "loss": 0.8012, + "num_input_tokens_seen": 15600784, + "step": 26880 + }, + { + "epoch": 4.0043193327375635, + "grad_norm": 0.028564453125, + "learning_rate": 0.029091651383174304, + "loss": 0.8057, + "num_input_tokens_seen": 15603664, + "step": 26885 + }, + { + "epoch": 4.005064045278522, + "grad_norm": 0.018310546875, + "learning_rate": 0.029090983110253037, + "loss": 0.8137, + "num_input_tokens_seen": 15606448, + "step": 26890 + }, + { + "epoch": 4.005808757819482, + "grad_norm": 0.0208740234375, + "learning_rate": 0.029090314599278935, + "loss": 0.803, + "num_input_tokens_seen": 15609232, + "step": 26895 + }, + { + "epoch": 4.006553470360441, + "grad_norm": 0.017578125, + "learning_rate": 0.029089645850263284, + "loss": 0.7899, + "num_input_tokens_seen": 15612080, + "step": 26900 + }, + { + "epoch": 4.0072981829014, + "grad_norm": 0.01953125, + "learning_rate": 0.029088976863217385, + "loss": 0.7868, + "num_input_tokens_seen": 15614896, + "step": 26905 + }, + { + "epoch": 4.008042895442359, + "grad_norm": 0.0191650390625, + "learning_rate": 0.029088307638152545, + "loss": 0.8008, + "num_input_tokens_seen": 15617584, + "step": 26910 + }, + { + "epoch": 4.008787607983319, + "grad_norm": 0.0208740234375, + "learning_rate": 0.029087638175080067, + "loss": 0.7929, + "num_input_tokens_seen": 15620592, + "step": 26915 + }, + { + "epoch": 4.009532320524277, + "grad_norm": 0.01953125, + "learning_rate": 0.029086968474011255, + "loss": 0.802, + "num_input_tokens_seen": 15623344, + "step": 26920 + }, + { + "epoch": 4.010277033065237, + "grad_norm": 0.0279541015625, + "learning_rate": 0.029086298534957436, + "loss": 0.7923, + "num_input_tokens_seen": 15626320, + "step": 26925 + }, + { + "epoch": 4.011021745606196, + "grad_norm": 0.0322265625, + "learning_rate": 0.02908562835792991, + "loss": 0.8072, + "num_input_tokens_seen": 15629136, + "step": 26930 + }, + { + "epoch": 4.0117664581471555, + "grad_norm": 0.019287109375, + "learning_rate": 0.029084957942940016, + "loss": 0.7827, + "num_input_tokens_seen": 15631952, + "step": 26935 + }, + { + "epoch": 4.012511170688114, + "grad_norm": 0.0205078125, + "learning_rate": 0.02908428728999907, + "loss": 0.821, + "num_input_tokens_seen": 15634896, + "step": 26940 + }, + { + "epoch": 4.013255883229074, + "grad_norm": 0.0166015625, + "learning_rate": 0.029083616399118408, + "loss": 0.8221, + "num_input_tokens_seen": 15637712, + "step": 26945 + }, + { + "epoch": 4.014000595770033, + "grad_norm": 0.0125732421875, + "learning_rate": 0.029082945270309357, + "loss": 0.8, + "num_input_tokens_seen": 15640720, + "step": 26950 + }, + { + "epoch": 4.014745308310992, + "grad_norm": 0.0255126953125, + "learning_rate": 0.029082273903583263, + "loss": 0.7864, + "num_input_tokens_seen": 15643696, + "step": 26955 + }, + { + "epoch": 4.015490020851951, + "grad_norm": 0.0115966796875, + "learning_rate": 0.02908160229895146, + "loss": 0.7711, + "num_input_tokens_seen": 15646896, + "step": 26960 + }, + { + "epoch": 4.016234733392911, + "grad_norm": 0.0234375, + "learning_rate": 0.029080930456425298, + "loss": 0.8371, + "num_input_tokens_seen": 15650064, + "step": 26965 + }, + { + "epoch": 4.0169794459338695, + "grad_norm": 0.010498046875, + "learning_rate": 0.02908025837601613, + "loss": 0.8058, + "num_input_tokens_seen": 15652880, + "step": 26970 + }, + { + "epoch": 4.017724158474829, + "grad_norm": 0.02978515625, + "learning_rate": 0.029079586057735304, + "loss": 0.8299, + "num_input_tokens_seen": 15655824, + "step": 26975 + }, + { + "epoch": 4.018468871015788, + "grad_norm": 0.023681640625, + "learning_rate": 0.02907891350159418, + "loss": 0.7999, + "num_input_tokens_seen": 15658384, + "step": 26980 + }, + { + "epoch": 4.0192135835567475, + "grad_norm": 0.01806640625, + "learning_rate": 0.02907824070760413, + "loss": 0.8042, + "num_input_tokens_seen": 15661232, + "step": 26985 + }, + { + "epoch": 4.019958296097706, + "grad_norm": 0.0201416015625, + "learning_rate": 0.029077567675776506, + "loss": 0.78, + "num_input_tokens_seen": 15664304, + "step": 26990 + }, + { + "epoch": 4.020703008638666, + "grad_norm": 0.018798828125, + "learning_rate": 0.029076894406122687, + "loss": 0.7979, + "num_input_tokens_seen": 15667344, + "step": 26995 + }, + { + "epoch": 4.021447721179625, + "grad_norm": 0.01263427734375, + "learning_rate": 0.02907622089865404, + "loss": 0.8253, + "num_input_tokens_seen": 15670128, + "step": 27000 + }, + { + "epoch": 4.022192433720583, + "grad_norm": 0.0233154296875, + "learning_rate": 0.02907554715338195, + "loss": 0.808, + "num_input_tokens_seen": 15673328, + "step": 27005 + }, + { + "epoch": 4.022937146261543, + "grad_norm": 0.01409912109375, + "learning_rate": 0.029074873170317792, + "loss": 0.7979, + "num_input_tokens_seen": 15676304, + "step": 27010 + }, + { + "epoch": 4.023681858802502, + "grad_norm": 0.0194091796875, + "learning_rate": 0.029074198949472965, + "loss": 0.8111, + "num_input_tokens_seen": 15679312, + "step": 27015 + }, + { + "epoch": 4.0244265713434615, + "grad_norm": 0.0174560546875, + "learning_rate": 0.029073524490858847, + "loss": 0.7918, + "num_input_tokens_seen": 15682320, + "step": 27020 + }, + { + "epoch": 4.02517128388442, + "grad_norm": 0.0201416015625, + "learning_rate": 0.029072849794486835, + "loss": 0.7956, + "num_input_tokens_seen": 15685328, + "step": 27025 + }, + { + "epoch": 4.02591599642538, + "grad_norm": 0.022216796875, + "learning_rate": 0.029072174860368324, + "loss": 0.8265, + "num_input_tokens_seen": 15688144, + "step": 27030 + }, + { + "epoch": 4.026660708966339, + "grad_norm": 0.024658203125, + "learning_rate": 0.029071499688514732, + "loss": 0.8052, + "num_input_tokens_seen": 15690928, + "step": 27035 + }, + { + "epoch": 4.027405421507298, + "grad_norm": 0.022216796875, + "learning_rate": 0.029070824278937448, + "loss": 0.7883, + "num_input_tokens_seen": 15694032, + "step": 27040 + }, + { + "epoch": 4.028150134048257, + "grad_norm": 0.01904296875, + "learning_rate": 0.02907014863164789, + "loss": 0.8075, + "num_input_tokens_seen": 15697264, + "step": 27045 + }, + { + "epoch": 4.028894846589217, + "grad_norm": 0.0140380859375, + "learning_rate": 0.029069472746657467, + "loss": 0.8002, + "num_input_tokens_seen": 15700368, + "step": 27050 + }, + { + "epoch": 4.0296395591301755, + "grad_norm": 0.017578125, + "learning_rate": 0.02906879662397761, + "loss": 0.8115, + "num_input_tokens_seen": 15703536, + "step": 27055 + }, + { + "epoch": 4.030384271671135, + "grad_norm": 0.02490234375, + "learning_rate": 0.029068120263619726, + "loss": 0.8093, + "num_input_tokens_seen": 15706512, + "step": 27060 + }, + { + "epoch": 4.031128984212094, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02906744366559525, + "loss": 0.8117, + "num_input_tokens_seen": 15709232, + "step": 27065 + }, + { + "epoch": 4.0318736967530535, + "grad_norm": 0.0235595703125, + "learning_rate": 0.029066766829915616, + "loss": 0.8327, + "num_input_tokens_seen": 15712400, + "step": 27070 + }, + { + "epoch": 4.032618409294012, + "grad_norm": 0.0201416015625, + "learning_rate": 0.029066089756592247, + "loss": 0.8208, + "num_input_tokens_seen": 15715024, + "step": 27075 + }, + { + "epoch": 4.033363121834972, + "grad_norm": 0.0380859375, + "learning_rate": 0.02906541244563659, + "loss": 0.8099, + "num_input_tokens_seen": 15717584, + "step": 27080 + }, + { + "epoch": 4.034107834375931, + "grad_norm": 0.0225830078125, + "learning_rate": 0.02906473489706008, + "loss": 0.8003, + "num_input_tokens_seen": 15720432, + "step": 27085 + }, + { + "epoch": 4.03485254691689, + "grad_norm": 0.0255126953125, + "learning_rate": 0.02906405711087418, + "loss": 0.8063, + "num_input_tokens_seen": 15723888, + "step": 27090 + }, + { + "epoch": 4.035597259457849, + "grad_norm": 0.01141357421875, + "learning_rate": 0.02906337908709032, + "loss": 0.782, + "num_input_tokens_seen": 15726800, + "step": 27095 + }, + { + "epoch": 4.036341971998809, + "grad_norm": 0.027099609375, + "learning_rate": 0.029062700825719965, + "loss": 0.805, + "num_input_tokens_seen": 15729744, + "step": 27100 + }, + { + "epoch": 4.0370866845397675, + "grad_norm": 0.01123046875, + "learning_rate": 0.029062022326774576, + "loss": 0.8037, + "num_input_tokens_seen": 15732944, + "step": 27105 + }, + { + "epoch": 4.037831397080727, + "grad_norm": 0.0194091796875, + "learning_rate": 0.029061343590265607, + "loss": 0.8265, + "num_input_tokens_seen": 15736016, + "step": 27110 + }, + { + "epoch": 4.038576109621686, + "grad_norm": 0.02099609375, + "learning_rate": 0.029060664616204532, + "loss": 0.7917, + "num_input_tokens_seen": 15738928, + "step": 27115 + }, + { + "epoch": 4.0393208221626455, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02905998540460282, + "loss": 0.799, + "num_input_tokens_seen": 15742032, + "step": 27120 + }, + { + "epoch": 4.040065534703604, + "grad_norm": 0.021728515625, + "learning_rate": 0.02905930595547194, + "loss": 0.8, + "num_input_tokens_seen": 15744880, + "step": 27125 + }, + { + "epoch": 4.040810247244564, + "grad_norm": 0.0228271484375, + "learning_rate": 0.029058626268823384, + "loss": 0.8007, + "num_input_tokens_seen": 15747856, + "step": 27130 + }, + { + "epoch": 4.041554959785523, + "grad_norm": 0.0118408203125, + "learning_rate": 0.029057946344668618, + "loss": 0.8184, + "num_input_tokens_seen": 15750832, + "step": 27135 + }, + { + "epoch": 4.042299672326482, + "grad_norm": 0.0257568359375, + "learning_rate": 0.029057266183019136, + "loss": 0.8109, + "num_input_tokens_seen": 15753552, + "step": 27140 + }, + { + "epoch": 4.043044384867441, + "grad_norm": 0.041259765625, + "learning_rate": 0.029056585783886434, + "loss": 0.8195, + "num_input_tokens_seen": 15756496, + "step": 27145 + }, + { + "epoch": 4.043789097408401, + "grad_norm": 0.02294921875, + "learning_rate": 0.029055905147281996, + "loss": 0.804, + "num_input_tokens_seen": 15759472, + "step": 27150 + }, + { + "epoch": 4.0445338099493595, + "grad_norm": 0.020751953125, + "learning_rate": 0.02905522427321733, + "loss": 0.7973, + "num_input_tokens_seen": 15762480, + "step": 27155 + }, + { + "epoch": 4.045278522490319, + "grad_norm": 0.0281982421875, + "learning_rate": 0.029054543161703937, + "loss": 0.8023, + "num_input_tokens_seen": 15765232, + "step": 27160 + }, + { + "epoch": 4.046023235031278, + "grad_norm": 0.030517578125, + "learning_rate": 0.029053861812753316, + "loss": 0.8006, + "num_input_tokens_seen": 15768176, + "step": 27165 + }, + { + "epoch": 4.046767947572237, + "grad_norm": 0.0196533203125, + "learning_rate": 0.029053180226376984, + "loss": 0.7978, + "num_input_tokens_seen": 15770768, + "step": 27170 + }, + { + "epoch": 4.047512660113196, + "grad_norm": 0.0279541015625, + "learning_rate": 0.02905249840258646, + "loss": 0.7997, + "num_input_tokens_seen": 15773424, + "step": 27175 + }, + { + "epoch": 4.048257372654155, + "grad_norm": 0.01300048828125, + "learning_rate": 0.02905181634139326, + "loss": 0.808, + "num_input_tokens_seen": 15776016, + "step": 27180 + }, + { + "epoch": 4.049002085195115, + "grad_norm": 0.020751953125, + "learning_rate": 0.029051134042808895, + "loss": 0.7974, + "num_input_tokens_seen": 15779088, + "step": 27185 + }, + { + "epoch": 4.0497467977360735, + "grad_norm": 0.02783203125, + "learning_rate": 0.029050451506844908, + "loss": 0.8156, + "num_input_tokens_seen": 15781936, + "step": 27190 + }, + { + "epoch": 4.050491510277033, + "grad_norm": 0.0283203125, + "learning_rate": 0.029049768733512824, + "loss": 0.8184, + "num_input_tokens_seen": 15785392, + "step": 27195 + }, + { + "epoch": 4.051236222817992, + "grad_norm": 0.0303955078125, + "learning_rate": 0.02904908572282417, + "loss": 0.7962, + "num_input_tokens_seen": 15788144, + "step": 27200 + }, + { + "epoch": 4.0519809353589515, + "grad_norm": 0.021240234375, + "learning_rate": 0.029048402474790496, + "loss": 0.8037, + "num_input_tokens_seen": 15790864, + "step": 27205 + }, + { + "epoch": 4.05272564789991, + "grad_norm": 0.033935546875, + "learning_rate": 0.029047718989423342, + "loss": 0.7967, + "num_input_tokens_seen": 15794256, + "step": 27210 + }, + { + "epoch": 4.05347036044087, + "grad_norm": 0.03173828125, + "learning_rate": 0.02904703526673425, + "loss": 0.82, + "num_input_tokens_seen": 15797424, + "step": 27215 + }, + { + "epoch": 4.054215072981829, + "grad_norm": 0.024658203125, + "learning_rate": 0.029046351306734775, + "loss": 0.8029, + "num_input_tokens_seen": 15800112, + "step": 27220 + }, + { + "epoch": 4.054959785522788, + "grad_norm": 0.01361083984375, + "learning_rate": 0.02904566710943647, + "loss": 0.805, + "num_input_tokens_seen": 15803248, + "step": 27225 + }, + { + "epoch": 4.055704498063747, + "grad_norm": 0.0244140625, + "learning_rate": 0.029044982674850894, + "loss": 0.8092, + "num_input_tokens_seen": 15806224, + "step": 27230 + }, + { + "epoch": 4.056449210604707, + "grad_norm": 0.0257568359375, + "learning_rate": 0.02904429800298961, + "loss": 0.8067, + "num_input_tokens_seen": 15809072, + "step": 27235 + }, + { + "epoch": 4.0571939231456655, + "grad_norm": 0.0267333984375, + "learning_rate": 0.029043613093864187, + "loss": 0.7839, + "num_input_tokens_seen": 15811920, + "step": 27240 + }, + { + "epoch": 4.057938635686625, + "grad_norm": 0.0125732421875, + "learning_rate": 0.029042927947486197, + "loss": 0.7879, + "num_input_tokens_seen": 15814608, + "step": 27245 + }, + { + "epoch": 4.058683348227584, + "grad_norm": 0.0269775390625, + "learning_rate": 0.02904224256386721, + "loss": 0.8325, + "num_input_tokens_seen": 15817392, + "step": 27250 + }, + { + "epoch": 4.059428060768544, + "grad_norm": 0.01251220703125, + "learning_rate": 0.029041556943018805, + "loss": 0.7868, + "num_input_tokens_seen": 15820240, + "step": 27255 + }, + { + "epoch": 4.060172773309502, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02904087108495257, + "loss": 0.8048, + "num_input_tokens_seen": 15823024, + "step": 27260 + }, + { + "epoch": 4.060917485850462, + "grad_norm": 0.017333984375, + "learning_rate": 0.02904018498968008, + "loss": 0.7813, + "num_input_tokens_seen": 15825936, + "step": 27265 + }, + { + "epoch": 4.061662198391421, + "grad_norm": 0.02880859375, + "learning_rate": 0.029039498657212944, + "loss": 0.7843, + "num_input_tokens_seen": 15828752, + "step": 27270 + }, + { + "epoch": 4.06240691093238, + "grad_norm": 0.0238037109375, + "learning_rate": 0.029038812087562742, + "loss": 0.7828, + "num_input_tokens_seen": 15831568, + "step": 27275 + }, + { + "epoch": 4.063151623473339, + "grad_norm": 0.020751953125, + "learning_rate": 0.02903812528074108, + "loss": 0.8044, + "num_input_tokens_seen": 15834480, + "step": 27280 + }, + { + "epoch": 4.063896336014299, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02903743823675956, + "loss": 0.777, + "num_input_tokens_seen": 15837360, + "step": 27285 + }, + { + "epoch": 4.0646410485552575, + "grad_norm": 0.0196533203125, + "learning_rate": 0.029036750955629784, + "loss": 0.8075, + "num_input_tokens_seen": 15840720, + "step": 27290 + }, + { + "epoch": 4.065385761096217, + "grad_norm": 0.01214599609375, + "learning_rate": 0.029036063437363372, + "loss": 0.8017, + "num_input_tokens_seen": 15843568, + "step": 27295 + }, + { + "epoch": 4.066130473637176, + "grad_norm": 0.01953125, + "learning_rate": 0.02903537568197193, + "loss": 0.7769, + "num_input_tokens_seen": 15846352, + "step": 27300 + }, + { + "epoch": 4.066875186178136, + "grad_norm": 0.01904296875, + "learning_rate": 0.029034687689467084, + "loss": 0.7623, + "num_input_tokens_seen": 15849200, + "step": 27305 + }, + { + "epoch": 4.067619898719094, + "grad_norm": 0.0224609375, + "learning_rate": 0.029033999459860453, + "loss": 0.8345, + "num_input_tokens_seen": 15852304, + "step": 27310 + }, + { + "epoch": 4.068364611260054, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02903331099316366, + "loss": 0.7902, + "num_input_tokens_seen": 15855088, + "step": 27315 + }, + { + "epoch": 4.069109323801013, + "grad_norm": 0.018798828125, + "learning_rate": 0.029032622289388344, + "loss": 0.8225, + "num_input_tokens_seen": 15858288, + "step": 27320 + }, + { + "epoch": 4.069854036341972, + "grad_norm": 0.026611328125, + "learning_rate": 0.02903193334854614, + "loss": 0.844, + "num_input_tokens_seen": 15861264, + "step": 27325 + }, + { + "epoch": 4.070598748882931, + "grad_norm": 0.0166015625, + "learning_rate": 0.02903124417064868, + "loss": 0.7611, + "num_input_tokens_seen": 15863888, + "step": 27330 + }, + { + "epoch": 4.071343461423891, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02903055475570761, + "loss": 0.8353, + "num_input_tokens_seen": 15866576, + "step": 27335 + }, + { + "epoch": 4.07208817396485, + "grad_norm": 0.01025390625, + "learning_rate": 0.02902986510373458, + "loss": 0.7517, + "num_input_tokens_seen": 15869680, + "step": 27340 + }, + { + "epoch": 4.072832886505808, + "grad_norm": 0.0247802734375, + "learning_rate": 0.029029175214741236, + "loss": 0.8127, + "num_input_tokens_seen": 15872656, + "step": 27345 + }, + { + "epoch": 4.073577599046768, + "grad_norm": 0.01287841796875, + "learning_rate": 0.029028485088739238, + "loss": 0.8064, + "num_input_tokens_seen": 15875600, + "step": 27350 + }, + { + "epoch": 4.074322311587727, + "grad_norm": 0.014404296875, + "learning_rate": 0.029027794725740234, + "loss": 0.7881, + "num_input_tokens_seen": 15878416, + "step": 27355 + }, + { + "epoch": 4.075067024128686, + "grad_norm": 0.017578125, + "learning_rate": 0.029027104125755904, + "loss": 0.8164, + "num_input_tokens_seen": 15881328, + "step": 27360 + }, + { + "epoch": 4.075811736669645, + "grad_norm": 0.0281982421875, + "learning_rate": 0.029026413288797903, + "loss": 0.8047, + "num_input_tokens_seen": 15884368, + "step": 27365 + }, + { + "epoch": 4.076556449210605, + "grad_norm": 0.014404296875, + "learning_rate": 0.029025722214877907, + "loss": 0.7986, + "num_input_tokens_seen": 15887344, + "step": 27370 + }, + { + "epoch": 4.0773011617515635, + "grad_norm": 0.0224609375, + "learning_rate": 0.029025030904007588, + "loss": 0.8168, + "num_input_tokens_seen": 15890096, + "step": 27375 + }, + { + "epoch": 4.078045874292523, + "grad_norm": 0.0179443359375, + "learning_rate": 0.02902433935619863, + "loss": 0.7772, + "num_input_tokens_seen": 15892912, + "step": 27380 + }, + { + "epoch": 4.078790586833482, + "grad_norm": 0.02490234375, + "learning_rate": 0.029023647571462707, + "loss": 0.8023, + "num_input_tokens_seen": 15895568, + "step": 27385 + }, + { + "epoch": 4.079535299374442, + "grad_norm": 0.0225830078125, + "learning_rate": 0.029022955549811512, + "loss": 0.8043, + "num_input_tokens_seen": 15898160, + "step": 27390 + }, + { + "epoch": 4.0802800119154, + "grad_norm": 0.0181884765625, + "learning_rate": 0.029022263291256737, + "loss": 0.7824, + "num_input_tokens_seen": 15901296, + "step": 27395 + }, + { + "epoch": 4.08102472445636, + "grad_norm": 0.01171875, + "learning_rate": 0.029021570795810078, + "loss": 0.8036, + "num_input_tokens_seen": 15903952, + "step": 27400 + }, + { + "epoch": 4.081769436997319, + "grad_norm": 0.0201416015625, + "learning_rate": 0.029020878063483226, + "loss": 0.8138, + "num_input_tokens_seen": 15907024, + "step": 27405 + }, + { + "epoch": 4.082514149538278, + "grad_norm": 0.0184326171875, + "learning_rate": 0.02902018509428789, + "loss": 0.8108, + "num_input_tokens_seen": 15909744, + "step": 27410 + }, + { + "epoch": 4.083258862079237, + "grad_norm": 0.017333984375, + "learning_rate": 0.02901949188823578, + "loss": 0.8266, + "num_input_tokens_seen": 15912688, + "step": 27415 + }, + { + "epoch": 4.084003574620197, + "grad_norm": 0.0224609375, + "learning_rate": 0.029018798445338598, + "loss": 0.7929, + "num_input_tokens_seen": 15915856, + "step": 27420 + }, + { + "epoch": 4.084748287161156, + "grad_norm": 0.016845703125, + "learning_rate": 0.02901810476560807, + "loss": 0.7896, + "num_input_tokens_seen": 15918864, + "step": 27425 + }, + { + "epoch": 4.085492999702115, + "grad_norm": 0.0181884765625, + "learning_rate": 0.029017410849055906, + "loss": 0.8011, + "num_input_tokens_seen": 15921552, + "step": 27430 + }, + { + "epoch": 4.086237712243074, + "grad_norm": 0.0174560546875, + "learning_rate": 0.029016716695693834, + "loss": 0.8104, + "num_input_tokens_seen": 15925008, + "step": 27435 + }, + { + "epoch": 4.086982424784034, + "grad_norm": 0.0179443359375, + "learning_rate": 0.02901602230553358, + "loss": 0.7872, + "num_input_tokens_seen": 15927696, + "step": 27440 + }, + { + "epoch": 4.087727137324992, + "grad_norm": 0.018310546875, + "learning_rate": 0.02901532767858687, + "loss": 0.7847, + "num_input_tokens_seen": 15930576, + "step": 27445 + }, + { + "epoch": 4.088471849865952, + "grad_norm": 0.011474609375, + "learning_rate": 0.02901463281486545, + "loss": 0.8046, + "num_input_tokens_seen": 15933808, + "step": 27450 + }, + { + "epoch": 4.089216562406911, + "grad_norm": 0.018798828125, + "learning_rate": 0.029013937714381044, + "loss": 0.8212, + "num_input_tokens_seen": 15936624, + "step": 27455 + }, + { + "epoch": 4.08996127494787, + "grad_norm": 0.0125732421875, + "learning_rate": 0.02901324237714541, + "loss": 0.8179, + "num_input_tokens_seen": 15939344, + "step": 27460 + }, + { + "epoch": 4.090705987488829, + "grad_norm": 0.0179443359375, + "learning_rate": 0.02901254680317029, + "loss": 0.7939, + "num_input_tokens_seen": 15941968, + "step": 27465 + }, + { + "epoch": 4.091450700029789, + "grad_norm": 0.018310546875, + "learning_rate": 0.02901185099246743, + "loss": 0.8062, + "num_input_tokens_seen": 15944720, + "step": 27470 + }, + { + "epoch": 4.092195412570748, + "grad_norm": 0.03759765625, + "learning_rate": 0.029011154945048585, + "loss": 0.8414, + "num_input_tokens_seen": 15947600, + "step": 27475 + }, + { + "epoch": 4.092940125111707, + "grad_norm": 0.018798828125, + "learning_rate": 0.029010458660925522, + "loss": 0.8055, + "num_input_tokens_seen": 15950416, + "step": 27480 + }, + { + "epoch": 4.093684837652666, + "grad_norm": 0.0179443359375, + "learning_rate": 0.02900976214011, + "loss": 0.788, + "num_input_tokens_seen": 15953168, + "step": 27485 + }, + { + "epoch": 4.094429550193626, + "grad_norm": 0.0205078125, + "learning_rate": 0.029009065382613785, + "loss": 0.7981, + "num_input_tokens_seen": 15956112, + "step": 27490 + }, + { + "epoch": 4.095174262734584, + "grad_norm": 0.0244140625, + "learning_rate": 0.029008368388448648, + "loss": 0.7881, + "num_input_tokens_seen": 15958928, + "step": 27495 + }, + { + "epoch": 4.095918975275544, + "grad_norm": 0.0184326171875, + "learning_rate": 0.029007671157626362, + "loss": 0.79, + "num_input_tokens_seen": 15961808, + "step": 27500 + }, + { + "epoch": 4.096663687816503, + "grad_norm": 0.02783203125, + "learning_rate": 0.029006973690158713, + "loss": 0.8086, + "num_input_tokens_seen": 15964624, + "step": 27505 + }, + { + "epoch": 4.0974084003574625, + "grad_norm": 0.0189208984375, + "learning_rate": 0.029006275986057477, + "loss": 0.792, + "num_input_tokens_seen": 15967632, + "step": 27510 + }, + { + "epoch": 4.098153112898421, + "grad_norm": 0.02099609375, + "learning_rate": 0.029005578045334445, + "loss": 0.7942, + "num_input_tokens_seen": 15970352, + "step": 27515 + }, + { + "epoch": 4.09889782543938, + "grad_norm": 0.01409912109375, + "learning_rate": 0.029004879868001408, + "loss": 0.7988, + "num_input_tokens_seen": 15973552, + "step": 27520 + }, + { + "epoch": 4.09964253798034, + "grad_norm": 0.0244140625, + "learning_rate": 0.02900418145407015, + "loss": 0.7938, + "num_input_tokens_seen": 15976560, + "step": 27525 + }, + { + "epoch": 4.100387250521298, + "grad_norm": 0.027099609375, + "learning_rate": 0.029003482803552492, + "loss": 0.774, + "num_input_tokens_seen": 15979568, + "step": 27530 + }, + { + "epoch": 4.101131963062258, + "grad_norm": 0.0205078125, + "learning_rate": 0.029002783916460217, + "loss": 0.7996, + "num_input_tokens_seen": 15982512, + "step": 27535 + }, + { + "epoch": 4.101876675603217, + "grad_norm": 0.02197265625, + "learning_rate": 0.029002084792805142, + "loss": 0.8277, + "num_input_tokens_seen": 15985520, + "step": 27540 + }, + { + "epoch": 4.102621388144176, + "grad_norm": 0.0185546875, + "learning_rate": 0.029001385432599076, + "loss": 0.7802, + "num_input_tokens_seen": 15988112, + "step": 27545 + }, + { + "epoch": 4.103366100685135, + "grad_norm": 0.018798828125, + "learning_rate": 0.029000685835853832, + "loss": 0.809, + "num_input_tokens_seen": 15990928, + "step": 27550 + }, + { + "epoch": 4.104110813226095, + "grad_norm": 0.0194091796875, + "learning_rate": 0.028999986002581232, + "loss": 0.7633, + "num_input_tokens_seen": 15993520, + "step": 27555 + }, + { + "epoch": 4.104855525767054, + "grad_norm": 0.036865234375, + "learning_rate": 0.028999285932793096, + "loss": 0.8196, + "num_input_tokens_seen": 15996336, + "step": 27560 + }, + { + "epoch": 4.105600238308013, + "grad_norm": 0.0211181640625, + "learning_rate": 0.028998585626501254, + "loss": 0.8022, + "num_input_tokens_seen": 15999376, + "step": 27565 + }, + { + "epoch": 4.106344950848972, + "grad_norm": 0.01953125, + "learning_rate": 0.028997885083717534, + "loss": 0.7872, + "num_input_tokens_seen": 16002736, + "step": 27570 + }, + { + "epoch": 4.107089663389932, + "grad_norm": 0.0125732421875, + "learning_rate": 0.028997184304453773, + "loss": 0.7992, + "num_input_tokens_seen": 16005584, + "step": 27575 + }, + { + "epoch": 4.10783437593089, + "grad_norm": 0.01220703125, + "learning_rate": 0.028996483288721807, + "loss": 0.7841, + "num_input_tokens_seen": 16008176, + "step": 27580 + }, + { + "epoch": 4.10857908847185, + "grad_norm": 0.0244140625, + "learning_rate": 0.028995782036533482, + "loss": 0.8331, + "num_input_tokens_seen": 16011216, + "step": 27585 + }, + { + "epoch": 4.109323801012809, + "grad_norm": 0.0284423828125, + "learning_rate": 0.028995080547900644, + "loss": 0.8043, + "num_input_tokens_seen": 16014160, + "step": 27590 + }, + { + "epoch": 4.1100685135537685, + "grad_norm": 0.0244140625, + "learning_rate": 0.028994378822835152, + "loss": 0.79, + "num_input_tokens_seen": 16016944, + "step": 27595 + }, + { + "epoch": 4.110813226094727, + "grad_norm": 0.03369140625, + "learning_rate": 0.028993676861348846, + "loss": 0.8203, + "num_input_tokens_seen": 16019824, + "step": 27600 + }, + { + "epoch": 4.111557938635687, + "grad_norm": 0.0439453125, + "learning_rate": 0.02899297466345359, + "loss": 0.8038, + "num_input_tokens_seen": 16022736, + "step": 27605 + }, + { + "epoch": 4.112302651176646, + "grad_norm": 0.01324462890625, + "learning_rate": 0.02899227222916125, + "loss": 0.7987, + "num_input_tokens_seen": 16025904, + "step": 27610 + }, + { + "epoch": 4.113047363717605, + "grad_norm": 0.021240234375, + "learning_rate": 0.02899156955848369, + "loss": 0.7949, + "num_input_tokens_seen": 16029168, + "step": 27615 + }, + { + "epoch": 4.113792076258564, + "grad_norm": 0.023681640625, + "learning_rate": 0.02899086665143279, + "loss": 0.7905, + "num_input_tokens_seen": 16031920, + "step": 27620 + }, + { + "epoch": 4.114536788799524, + "grad_norm": 0.03173828125, + "learning_rate": 0.028990163508020413, + "loss": 0.795, + "num_input_tokens_seen": 16034864, + "step": 27625 + }, + { + "epoch": 4.115281501340482, + "grad_norm": 0.029541015625, + "learning_rate": 0.028989460128258444, + "loss": 0.8069, + "num_input_tokens_seen": 16037584, + "step": 27630 + }, + { + "epoch": 4.116026213881442, + "grad_norm": 0.0225830078125, + "learning_rate": 0.028988756512158764, + "loss": 0.8155, + "num_input_tokens_seen": 16040496, + "step": 27635 + }, + { + "epoch": 4.116770926422401, + "grad_norm": 0.01953125, + "learning_rate": 0.028988052659733258, + "loss": 0.7944, + "num_input_tokens_seen": 16043280, + "step": 27640 + }, + { + "epoch": 4.1175156389633605, + "grad_norm": 0.0208740234375, + "learning_rate": 0.02898734857099382, + "loss": 0.7895, + "num_input_tokens_seen": 16046160, + "step": 27645 + }, + { + "epoch": 4.118260351504319, + "grad_norm": 0.0238037109375, + "learning_rate": 0.028986644245952348, + "loss": 0.8044, + "num_input_tokens_seen": 16049040, + "step": 27650 + }, + { + "epoch": 4.119005064045279, + "grad_norm": 0.031494140625, + "learning_rate": 0.028985939684620734, + "loss": 0.7966, + "num_input_tokens_seen": 16051696, + "step": 27655 + }, + { + "epoch": 4.119749776586238, + "grad_norm": 0.0311279296875, + "learning_rate": 0.028985234887010883, + "loss": 0.8025, + "num_input_tokens_seen": 16054832, + "step": 27660 + }, + { + "epoch": 4.120494489127197, + "grad_norm": 0.0164794921875, + "learning_rate": 0.0289845298531347, + "loss": 0.8164, + "num_input_tokens_seen": 16057808, + "step": 27665 + }, + { + "epoch": 4.121239201668156, + "grad_norm": 0.029296875, + "learning_rate": 0.0289838245830041, + "loss": 0.8031, + "num_input_tokens_seen": 16060656, + "step": 27670 + }, + { + "epoch": 4.121983914209116, + "grad_norm": 0.011962890625, + "learning_rate": 0.028983119076631, + "loss": 0.8091, + "num_input_tokens_seen": 16063408, + "step": 27675 + }, + { + "epoch": 4.1227286267500745, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02898241333402731, + "loss": 0.8175, + "num_input_tokens_seen": 16066288, + "step": 27680 + }, + { + "epoch": 4.123473339291033, + "grad_norm": 0.01348876953125, + "learning_rate": 0.028981707355204965, + "loss": 0.7983, + "num_input_tokens_seen": 16068880, + "step": 27685 + }, + { + "epoch": 4.124218051831993, + "grad_norm": 0.01318359375, + "learning_rate": 0.028981001140175877, + "loss": 0.7736, + "num_input_tokens_seen": 16071824, + "step": 27690 + }, + { + "epoch": 4.124962764372952, + "grad_norm": 0.025634765625, + "learning_rate": 0.02898029468895199, + "loss": 0.8066, + "num_input_tokens_seen": 16074576, + "step": 27695 + }, + { + "epoch": 4.125707476913911, + "grad_norm": 0.024658203125, + "learning_rate": 0.02897958800154523, + "loss": 0.8051, + "num_input_tokens_seen": 16077264, + "step": 27700 + }, + { + "epoch": 4.12645218945487, + "grad_norm": 0.0223388671875, + "learning_rate": 0.028978881077967537, + "loss": 0.8161, + "num_input_tokens_seen": 16079984, + "step": 27705 + }, + { + "epoch": 4.12719690199583, + "grad_norm": 0.02490234375, + "learning_rate": 0.028978173918230862, + "loss": 0.8046, + "num_input_tokens_seen": 16082704, + "step": 27710 + }, + { + "epoch": 4.127941614536788, + "grad_norm": 0.0201416015625, + "learning_rate": 0.028977466522347138, + "loss": 0.7886, + "num_input_tokens_seen": 16085296, + "step": 27715 + }, + { + "epoch": 4.128686327077748, + "grad_norm": 0.026123046875, + "learning_rate": 0.02897675889032833, + "loss": 0.8055, + "num_input_tokens_seen": 16088080, + "step": 27720 + }, + { + "epoch": 4.129431039618707, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02897605102218638, + "loss": 0.8259, + "num_input_tokens_seen": 16090960, + "step": 27725 + }, + { + "epoch": 4.1301757521596665, + "grad_norm": 0.0211181640625, + "learning_rate": 0.028975342917933255, + "loss": 0.8062, + "num_input_tokens_seen": 16093648, + "step": 27730 + }, + { + "epoch": 4.130920464700625, + "grad_norm": 0.0126953125, + "learning_rate": 0.028974634577580913, + "loss": 0.7942, + "num_input_tokens_seen": 16096528, + "step": 27735 + }, + { + "epoch": 4.131665177241585, + "grad_norm": 0.027587890625, + "learning_rate": 0.028973926001141327, + "loss": 0.8108, + "num_input_tokens_seen": 16099536, + "step": 27740 + }, + { + "epoch": 4.132409889782544, + "grad_norm": 0.01434326171875, + "learning_rate": 0.02897321718862646, + "loss": 0.8118, + "num_input_tokens_seen": 16102480, + "step": 27745 + }, + { + "epoch": 4.133154602323503, + "grad_norm": 0.0250244140625, + "learning_rate": 0.028972508140048297, + "loss": 0.8076, + "num_input_tokens_seen": 16105360, + "step": 27750 + }, + { + "epoch": 4.133899314864462, + "grad_norm": 0.01287841796875, + "learning_rate": 0.028971798855418805, + "loss": 0.781, + "num_input_tokens_seen": 16108336, + "step": 27755 + }, + { + "epoch": 4.134644027405422, + "grad_norm": 0.0303955078125, + "learning_rate": 0.02897108933474997, + "loss": 0.792, + "num_input_tokens_seen": 16111056, + "step": 27760 + }, + { + "epoch": 4.1353887399463805, + "grad_norm": 0.019287109375, + "learning_rate": 0.028970379578053783, + "loss": 0.7967, + "num_input_tokens_seen": 16113776, + "step": 27765 + }, + { + "epoch": 4.13613345248734, + "grad_norm": 0.0125732421875, + "learning_rate": 0.028969669585342233, + "loss": 0.8311, + "num_input_tokens_seen": 16116464, + "step": 27770 + }, + { + "epoch": 4.136878165028299, + "grad_norm": 0.025146484375, + "learning_rate": 0.028968959356627313, + "loss": 0.7756, + "num_input_tokens_seen": 16119472, + "step": 27775 + }, + { + "epoch": 4.1376228775692585, + "grad_norm": 0.029052734375, + "learning_rate": 0.028968248891921018, + "loss": 0.7904, + "num_input_tokens_seen": 16122480, + "step": 27780 + }, + { + "epoch": 4.138367590110217, + "grad_norm": 0.025146484375, + "learning_rate": 0.02896753819123536, + "loss": 0.8016, + "num_input_tokens_seen": 16125296, + "step": 27785 + }, + { + "epoch": 4.139112302651177, + "grad_norm": 0.0213623046875, + "learning_rate": 0.028966827254582335, + "loss": 0.7947, + "num_input_tokens_seen": 16128240, + "step": 27790 + }, + { + "epoch": 4.139857015192136, + "grad_norm": 0.026123046875, + "learning_rate": 0.028966116081973962, + "loss": 0.7964, + "num_input_tokens_seen": 16131440, + "step": 27795 + }, + { + "epoch": 4.140601727733095, + "grad_norm": 0.039794921875, + "learning_rate": 0.028965404673422252, + "loss": 0.8283, + "num_input_tokens_seen": 16134224, + "step": 27800 + }, + { + "epoch": 4.141346440274054, + "grad_norm": 0.031982421875, + "learning_rate": 0.028964693028939222, + "loss": 0.8215, + "num_input_tokens_seen": 16136944, + "step": 27805 + }, + { + "epoch": 4.142091152815014, + "grad_norm": 0.0341796875, + "learning_rate": 0.0289639811485369, + "loss": 0.797, + "num_input_tokens_seen": 16139792, + "step": 27810 + }, + { + "epoch": 4.1428358653559725, + "grad_norm": 0.0191650390625, + "learning_rate": 0.028963269032227307, + "loss": 0.796, + "num_input_tokens_seen": 16142608, + "step": 27815 + }, + { + "epoch": 4.143580577896932, + "grad_norm": 0.0257568359375, + "learning_rate": 0.028962556680022476, + "loss": 0.783, + "num_input_tokens_seen": 16145520, + "step": 27820 + }, + { + "epoch": 4.144325290437891, + "grad_norm": 0.0235595703125, + "learning_rate": 0.02896184409193444, + "loss": 0.8115, + "num_input_tokens_seen": 16148304, + "step": 27825 + }, + { + "epoch": 4.1450700029788505, + "grad_norm": 0.0245361328125, + "learning_rate": 0.028961131267975237, + "loss": 0.8352, + "num_input_tokens_seen": 16151184, + "step": 27830 + }, + { + "epoch": 4.145814715519809, + "grad_norm": 0.0125732421875, + "learning_rate": 0.02896041820815691, + "loss": 0.7896, + "num_input_tokens_seen": 16154224, + "step": 27835 + }, + { + "epoch": 4.146559428060769, + "grad_norm": 0.01220703125, + "learning_rate": 0.028959704912491512, + "loss": 0.8158, + "num_input_tokens_seen": 16156848, + "step": 27840 + }, + { + "epoch": 4.147304140601728, + "grad_norm": 0.031494140625, + "learning_rate": 0.02895899138099108, + "loss": 0.7791, + "num_input_tokens_seen": 16159760, + "step": 27845 + }, + { + "epoch": 4.148048853142687, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02895827761366768, + "loss": 0.8055, + "num_input_tokens_seen": 16162672, + "step": 27850 + }, + { + "epoch": 4.148793565683646, + "grad_norm": 0.021484375, + "learning_rate": 0.02895756361053337, + "loss": 0.786, + "num_input_tokens_seen": 16165584, + "step": 27855 + }, + { + "epoch": 4.149538278224606, + "grad_norm": 0.024169921875, + "learning_rate": 0.028956849371600202, + "loss": 0.7933, + "num_input_tokens_seen": 16168624, + "step": 27860 + }, + { + "epoch": 4.1502829907655645, + "grad_norm": 0.0230712890625, + "learning_rate": 0.028956134896880252, + "loss": 0.8036, + "num_input_tokens_seen": 16171440, + "step": 27865 + }, + { + "epoch": 4.151027703306523, + "grad_norm": 0.02099609375, + "learning_rate": 0.028955420186385587, + "loss": 0.8294, + "num_input_tokens_seen": 16174416, + "step": 27870 + }, + { + "epoch": 4.151772415847483, + "grad_norm": 0.018798828125, + "learning_rate": 0.02895470524012828, + "loss": 0.8041, + "num_input_tokens_seen": 16177328, + "step": 27875 + }, + { + "epoch": 4.152517128388442, + "grad_norm": 0.0299072265625, + "learning_rate": 0.028953990058120414, + "loss": 0.8196, + "num_input_tokens_seen": 16180208, + "step": 27880 + }, + { + "epoch": 4.153261840929401, + "grad_norm": 0.01226806640625, + "learning_rate": 0.028953274640374064, + "loss": 0.812, + "num_input_tokens_seen": 16183440, + "step": 27885 + }, + { + "epoch": 4.15400655347036, + "grad_norm": 0.020263671875, + "learning_rate": 0.028952558986901324, + "loss": 0.7887, + "num_input_tokens_seen": 16186448, + "step": 27890 + }, + { + "epoch": 4.15475126601132, + "grad_norm": 0.01708984375, + "learning_rate": 0.028951843097714276, + "loss": 0.7977, + "num_input_tokens_seen": 16189456, + "step": 27895 + }, + { + "epoch": 4.1554959785522785, + "grad_norm": 0.0206298828125, + "learning_rate": 0.028951126972825026, + "loss": 0.8046, + "num_input_tokens_seen": 16192496, + "step": 27900 + }, + { + "epoch": 4.156240691093238, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02895041061224566, + "loss": 0.7851, + "num_input_tokens_seen": 16195568, + "step": 27905 + }, + { + "epoch": 4.156985403634197, + "grad_norm": 0.0208740234375, + "learning_rate": 0.028949694015988285, + "loss": 0.8022, + "num_input_tokens_seen": 16198384, + "step": 27910 + }, + { + "epoch": 4.1577301161751565, + "grad_norm": 0.026123046875, + "learning_rate": 0.028948977184065014, + "loss": 0.7936, + "num_input_tokens_seen": 16201072, + "step": 27915 + }, + { + "epoch": 4.158474828716115, + "grad_norm": 0.028076171875, + "learning_rate": 0.028948260116487944, + "loss": 0.7939, + "num_input_tokens_seen": 16203920, + "step": 27920 + }, + { + "epoch": 4.159219541257075, + "grad_norm": 0.0194091796875, + "learning_rate": 0.028947542813269195, + "loss": 0.7952, + "num_input_tokens_seen": 16206640, + "step": 27925 + }, + { + "epoch": 4.159964253798034, + "grad_norm": 0.0279541015625, + "learning_rate": 0.02894682527442089, + "loss": 0.7775, + "num_input_tokens_seen": 16209328, + "step": 27930 + }, + { + "epoch": 4.160708966338993, + "grad_norm": 0.03173828125, + "learning_rate": 0.028946107499955145, + "loss": 0.8316, + "num_input_tokens_seen": 16211984, + "step": 27935 + }, + { + "epoch": 4.161453678879952, + "grad_norm": 0.0380859375, + "learning_rate": 0.028945389489884087, + "loss": 0.7757, + "num_input_tokens_seen": 16214960, + "step": 27940 + }, + { + "epoch": 4.162198391420912, + "grad_norm": 0.01409912109375, + "learning_rate": 0.02894467124421985, + "loss": 0.8007, + "num_input_tokens_seen": 16217904, + "step": 27945 + }, + { + "epoch": 4.1629431039618705, + "grad_norm": 0.020751953125, + "learning_rate": 0.028943952762974564, + "loss": 0.8169, + "num_input_tokens_seen": 16220976, + "step": 27950 + }, + { + "epoch": 4.16368781650283, + "grad_norm": 0.030517578125, + "learning_rate": 0.02894323404616037, + "loss": 0.7954, + "num_input_tokens_seen": 16223760, + "step": 27955 + }, + { + "epoch": 4.164432529043789, + "grad_norm": 0.0272216796875, + "learning_rate": 0.028942515093789405, + "loss": 0.7904, + "num_input_tokens_seen": 16226672, + "step": 27960 + }, + { + "epoch": 4.165177241584749, + "grad_norm": 0.021728515625, + "learning_rate": 0.028941795905873816, + "loss": 0.8043, + "num_input_tokens_seen": 16229776, + "step": 27965 + }, + { + "epoch": 4.165921954125707, + "grad_norm": 0.0189208984375, + "learning_rate": 0.028941076482425755, + "loss": 0.7937, + "num_input_tokens_seen": 16232336, + "step": 27970 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.0257568359375, + "learning_rate": 0.028940356823457376, + "loss": 0.7972, + "num_input_tokens_seen": 16235472, + "step": 27975 + }, + { + "epoch": 4.167411379207626, + "grad_norm": 0.02783203125, + "learning_rate": 0.02893963692898084, + "loss": 0.7871, + "num_input_tokens_seen": 16238448, + "step": 27980 + }, + { + "epoch": 4.168156091748585, + "grad_norm": 0.01300048828125, + "learning_rate": 0.028938916799008306, + "loss": 0.8118, + "num_input_tokens_seen": 16241296, + "step": 27985 + }, + { + "epoch": 4.168900804289544, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02893819643355194, + "loss": 0.7932, + "num_input_tokens_seen": 16244592, + "step": 27990 + }, + { + "epoch": 4.169645516830504, + "grad_norm": 0.01904296875, + "learning_rate": 0.028937475832623908, + "loss": 0.7819, + "num_input_tokens_seen": 16247408, + "step": 27995 + }, + { + "epoch": 4.1703902293714625, + "grad_norm": 0.021240234375, + "learning_rate": 0.028936754996236394, + "loss": 0.8088, + "num_input_tokens_seen": 16250544, + "step": 28000 + }, + { + "epoch": 4.171134941912422, + "grad_norm": 0.02734375, + "learning_rate": 0.028936033924401562, + "loss": 0.7888, + "num_input_tokens_seen": 16253232, + "step": 28005 + }, + { + "epoch": 4.171879654453381, + "grad_norm": 0.0167236328125, + "learning_rate": 0.0289353126171316, + "loss": 0.7916, + "num_input_tokens_seen": 16256368, + "step": 28010 + }, + { + "epoch": 4.172624366994341, + "grad_norm": 0.01251220703125, + "learning_rate": 0.028934591074438697, + "loss": 0.8014, + "num_input_tokens_seen": 16259312, + "step": 28015 + }, + { + "epoch": 4.173369079535299, + "grad_norm": 0.018310546875, + "learning_rate": 0.02893386929633504, + "loss": 0.7754, + "num_input_tokens_seen": 16262128, + "step": 28020 + }, + { + "epoch": 4.174113792076259, + "grad_norm": 0.0113525390625, + "learning_rate": 0.028933147282832826, + "loss": 0.8043, + "num_input_tokens_seen": 16265232, + "step": 28025 + }, + { + "epoch": 4.174858504617218, + "grad_norm": 0.0205078125, + "learning_rate": 0.028932425033944248, + "loss": 0.7839, + "num_input_tokens_seen": 16268240, + "step": 28030 + }, + { + "epoch": 4.1756032171581765, + "grad_norm": 0.019287109375, + "learning_rate": 0.02893170254968151, + "loss": 0.81, + "num_input_tokens_seen": 16270896, + "step": 28035 + }, + { + "epoch": 4.176347929699136, + "grad_norm": 0.0230712890625, + "learning_rate": 0.028930979830056814, + "loss": 0.8248, + "num_input_tokens_seen": 16273488, + "step": 28040 + }, + { + "epoch": 4.177092642240095, + "grad_norm": 0.0302734375, + "learning_rate": 0.028930256875082376, + "loss": 0.8548, + "num_input_tokens_seen": 16276048, + "step": 28045 + }, + { + "epoch": 4.177837354781055, + "grad_norm": 0.0189208984375, + "learning_rate": 0.0289295336847704, + "loss": 0.8071, + "num_input_tokens_seen": 16279120, + "step": 28050 + }, + { + "epoch": 4.178582067322013, + "grad_norm": 0.0308837890625, + "learning_rate": 0.028928810259133117, + "loss": 0.7916, + "num_input_tokens_seen": 16282352, + "step": 28055 + }, + { + "epoch": 4.179326779862973, + "grad_norm": 0.01385498046875, + "learning_rate": 0.028928086598182735, + "loss": 0.8374, + "num_input_tokens_seen": 16285008, + "step": 28060 + }, + { + "epoch": 4.180071492403932, + "grad_norm": 0.0322265625, + "learning_rate": 0.028927362701931494, + "loss": 0.8086, + "num_input_tokens_seen": 16288048, + "step": 28065 + }, + { + "epoch": 4.180816204944891, + "grad_norm": 0.041259765625, + "learning_rate": 0.02892663857039161, + "loss": 0.7769, + "num_input_tokens_seen": 16291184, + "step": 28070 + }, + { + "epoch": 4.18156091748585, + "grad_norm": 0.01953125, + "learning_rate": 0.02892591420357532, + "loss": 0.835, + "num_input_tokens_seen": 16293936, + "step": 28075 + }, + { + "epoch": 4.18230563002681, + "grad_norm": 0.0189208984375, + "learning_rate": 0.028925189601494865, + "loss": 0.7863, + "num_input_tokens_seen": 16297040, + "step": 28080 + }, + { + "epoch": 4.1830503425677685, + "grad_norm": 0.020263671875, + "learning_rate": 0.028924464764162487, + "loss": 0.7946, + "num_input_tokens_seen": 16300272, + "step": 28085 + }, + { + "epoch": 4.183795055108728, + "grad_norm": 0.0191650390625, + "learning_rate": 0.028923739691590424, + "loss": 0.8021, + "num_input_tokens_seen": 16303312, + "step": 28090 + }, + { + "epoch": 4.184539767649687, + "grad_norm": 0.0194091796875, + "learning_rate": 0.028923014383790937, + "loss": 0.7985, + "num_input_tokens_seen": 16306512, + "step": 28095 + }, + { + "epoch": 4.185284480190647, + "grad_norm": 0.01263427734375, + "learning_rate": 0.02892228884077627, + "loss": 0.8157, + "num_input_tokens_seen": 16309104, + "step": 28100 + }, + { + "epoch": 4.186029192731605, + "grad_norm": 0.0250244140625, + "learning_rate": 0.028921563062558678, + "loss": 0.7924, + "num_input_tokens_seen": 16311920, + "step": 28105 + }, + { + "epoch": 4.186773905272565, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02892083704915043, + "loss": 0.7712, + "num_input_tokens_seen": 16314768, + "step": 28110 + }, + { + "epoch": 4.187518617813524, + "grad_norm": 0.018798828125, + "learning_rate": 0.02892011080056379, + "loss": 0.8055, + "num_input_tokens_seen": 16317392, + "step": 28115 + }, + { + "epoch": 4.188263330354483, + "grad_norm": 0.031982421875, + "learning_rate": 0.028919384316811028, + "loss": 0.8095, + "num_input_tokens_seen": 16320432, + "step": 28120 + }, + { + "epoch": 4.189008042895442, + "grad_norm": 0.0224609375, + "learning_rate": 0.028918657597904414, + "loss": 0.8309, + "num_input_tokens_seen": 16323120, + "step": 28125 + }, + { + "epoch": 4.189752755436402, + "grad_norm": 0.02001953125, + "learning_rate": 0.028917930643856223, + "loss": 0.8174, + "num_input_tokens_seen": 16325904, + "step": 28130 + }, + { + "epoch": 4.190497467977361, + "grad_norm": 0.01904296875, + "learning_rate": 0.02891720345467874, + "loss": 0.7904, + "num_input_tokens_seen": 16328848, + "step": 28135 + }, + { + "epoch": 4.19124218051832, + "grad_norm": 0.020263671875, + "learning_rate": 0.028916476030384254, + "loss": 0.801, + "num_input_tokens_seen": 16331632, + "step": 28140 + }, + { + "epoch": 4.191986893059279, + "grad_norm": 0.01904296875, + "learning_rate": 0.028915748370985046, + "loss": 0.8033, + "num_input_tokens_seen": 16334160, + "step": 28145 + }, + { + "epoch": 4.192731605600239, + "grad_norm": 0.01953125, + "learning_rate": 0.028915020476493414, + "loss": 0.8148, + "num_input_tokens_seen": 16336816, + "step": 28150 + }, + { + "epoch": 4.193476318141197, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02891429234692165, + "loss": 0.8093, + "num_input_tokens_seen": 16339600, + "step": 28155 + }, + { + "epoch": 4.194221030682157, + "grad_norm": 0.01953125, + "learning_rate": 0.02891356398228206, + "loss": 0.8073, + "num_input_tokens_seen": 16342480, + "step": 28160 + }, + { + "epoch": 4.194965743223116, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02891283538258695, + "loss": 0.8151, + "num_input_tokens_seen": 16345616, + "step": 28165 + }, + { + "epoch": 4.195710455764075, + "grad_norm": 0.0311279296875, + "learning_rate": 0.028912106547848627, + "loss": 0.8082, + "num_input_tokens_seen": 16348272, + "step": 28170 + }, + { + "epoch": 4.196455168305034, + "grad_norm": 0.0322265625, + "learning_rate": 0.0289113774780794, + "loss": 0.7951, + "num_input_tokens_seen": 16351184, + "step": 28175 + }, + { + "epoch": 4.197199880845994, + "grad_norm": 0.020751953125, + "learning_rate": 0.028910648173291593, + "loss": 0.8067, + "num_input_tokens_seen": 16354000, + "step": 28180 + }, + { + "epoch": 4.197944593386953, + "grad_norm": 0.0205078125, + "learning_rate": 0.028909918633497525, + "loss": 0.8029, + "num_input_tokens_seen": 16356688, + "step": 28185 + }, + { + "epoch": 4.198689305927912, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02890918885870951, + "loss": 0.7999, + "num_input_tokens_seen": 16359728, + "step": 28190 + }, + { + "epoch": 4.199434018468871, + "grad_norm": 0.0147705078125, + "learning_rate": 0.028908458848939892, + "loss": 0.8205, + "num_input_tokens_seen": 16362736, + "step": 28195 + }, + { + "epoch": 4.200178731009831, + "grad_norm": 0.021728515625, + "learning_rate": 0.028907728604201, + "loss": 0.8082, + "num_input_tokens_seen": 16365456, + "step": 28200 + }, + { + "epoch": 4.200923443550789, + "grad_norm": 0.0286865234375, + "learning_rate": 0.028906998124505166, + "loss": 0.8069, + "num_input_tokens_seen": 16368080, + "step": 28205 + }, + { + "epoch": 4.201668156091749, + "grad_norm": 0.031005859375, + "learning_rate": 0.028906267409864732, + "loss": 0.799, + "num_input_tokens_seen": 16371088, + "step": 28210 + }, + { + "epoch": 4.202412868632708, + "grad_norm": 0.0206298828125, + "learning_rate": 0.028905536460292043, + "loss": 0.7914, + "num_input_tokens_seen": 16374224, + "step": 28215 + }, + { + "epoch": 4.203157581173667, + "grad_norm": 0.040283203125, + "learning_rate": 0.028904805275799448, + "loss": 0.827, + "num_input_tokens_seen": 16376848, + "step": 28220 + }, + { + "epoch": 4.203902293714626, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0289040738563993, + "loss": 0.8039, + "num_input_tokens_seen": 16380112, + "step": 28225 + }, + { + "epoch": 4.204647006255585, + "grad_norm": 0.019775390625, + "learning_rate": 0.028903342202103964, + "loss": 0.7926, + "num_input_tokens_seen": 16382960, + "step": 28230 + }, + { + "epoch": 4.205391718796545, + "grad_norm": 0.0250244140625, + "learning_rate": 0.028902610312925785, + "loss": 0.796, + "num_input_tokens_seen": 16385776, + "step": 28235 + }, + { + "epoch": 4.206136431337503, + "grad_norm": 0.01953125, + "learning_rate": 0.02890187818887714, + "loss": 0.7928, + "num_input_tokens_seen": 16388592, + "step": 28240 + }, + { + "epoch": 4.206881143878463, + "grad_norm": 0.0198974609375, + "learning_rate": 0.028901145829970387, + "loss": 0.8158, + "num_input_tokens_seen": 16391504, + "step": 28245 + }, + { + "epoch": 4.207625856419422, + "grad_norm": 0.021240234375, + "learning_rate": 0.028900413236217903, + "loss": 0.8243, + "num_input_tokens_seen": 16394672, + "step": 28250 + }, + { + "epoch": 4.208370568960381, + "grad_norm": 0.01416015625, + "learning_rate": 0.028899680407632064, + "loss": 0.7836, + "num_input_tokens_seen": 16397616, + "step": 28255 + }, + { + "epoch": 4.20911528150134, + "grad_norm": 0.0191650390625, + "learning_rate": 0.028898947344225257, + "loss": 0.8005, + "num_input_tokens_seen": 16400400, + "step": 28260 + }, + { + "epoch": 4.2098599940423, + "grad_norm": 0.0272216796875, + "learning_rate": 0.028898214046009858, + "loss": 0.7995, + "num_input_tokens_seen": 16403312, + "step": 28265 + }, + { + "epoch": 4.210604706583259, + "grad_norm": 0.0205078125, + "learning_rate": 0.028897480512998258, + "loss": 0.7981, + "num_input_tokens_seen": 16406160, + "step": 28270 + }, + { + "epoch": 4.211349419124218, + "grad_norm": 0.01373291015625, + "learning_rate": 0.02889674674520285, + "loss": 0.8116, + "num_input_tokens_seen": 16409296, + "step": 28275 + }, + { + "epoch": 4.212094131665177, + "grad_norm": 0.024658203125, + "learning_rate": 0.028896012742636028, + "loss": 0.8168, + "num_input_tokens_seen": 16412144, + "step": 28280 + }, + { + "epoch": 4.212838844206137, + "grad_norm": 0.0228271484375, + "learning_rate": 0.028895278505310193, + "loss": 0.8094, + "num_input_tokens_seen": 16415056, + "step": 28285 + }, + { + "epoch": 4.213583556747095, + "grad_norm": 0.0211181640625, + "learning_rate": 0.028894544033237757, + "loss": 0.7958, + "num_input_tokens_seen": 16417904, + "step": 28290 + }, + { + "epoch": 4.214328269288055, + "grad_norm": 0.01708984375, + "learning_rate": 0.028893809326431118, + "loss": 0.8025, + "num_input_tokens_seen": 16420432, + "step": 28295 + }, + { + "epoch": 4.215072981829014, + "grad_norm": 0.018798828125, + "learning_rate": 0.02889307438490269, + "loss": 0.8036, + "num_input_tokens_seen": 16423408, + "step": 28300 + }, + { + "epoch": 4.2158176943699734, + "grad_norm": 0.0208740234375, + "learning_rate": 0.02889233920866489, + "loss": 0.8098, + "num_input_tokens_seen": 16426576, + "step": 28305 + }, + { + "epoch": 4.216562406910932, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02889160379773014, + "loss": 0.8034, + "num_input_tokens_seen": 16429424, + "step": 28310 + }, + { + "epoch": 4.217307119451892, + "grad_norm": 0.0205078125, + "learning_rate": 0.028890868152110862, + "loss": 0.7883, + "num_input_tokens_seen": 16432368, + "step": 28315 + }, + { + "epoch": 4.218051831992851, + "grad_norm": 0.019287109375, + "learning_rate": 0.028890132271819486, + "loss": 0.7928, + "num_input_tokens_seen": 16435216, + "step": 28320 + }, + { + "epoch": 4.21879654453381, + "grad_norm": 0.0257568359375, + "learning_rate": 0.02888939615686844, + "loss": 0.8031, + "num_input_tokens_seen": 16438576, + "step": 28325 + }, + { + "epoch": 4.219541257074769, + "grad_norm": 0.0306396484375, + "learning_rate": 0.028888659807270164, + "loss": 0.7936, + "num_input_tokens_seen": 16441392, + "step": 28330 + }, + { + "epoch": 4.220285969615729, + "grad_norm": 0.01263427734375, + "learning_rate": 0.0288879232230371, + "loss": 0.8086, + "num_input_tokens_seen": 16444208, + "step": 28335 + }, + { + "epoch": 4.221030682156687, + "grad_norm": 0.027099609375, + "learning_rate": 0.028887186404181683, + "loss": 0.8167, + "num_input_tokens_seen": 16446896, + "step": 28340 + }, + { + "epoch": 4.221775394697647, + "grad_norm": 0.0289306640625, + "learning_rate": 0.028886449350716367, + "loss": 0.7839, + "num_input_tokens_seen": 16449680, + "step": 28345 + }, + { + "epoch": 4.222520107238606, + "grad_norm": 0.019287109375, + "learning_rate": 0.028885712062653602, + "loss": 0.7788, + "num_input_tokens_seen": 16452528, + "step": 28350 + }, + { + "epoch": 4.2232648197795655, + "grad_norm": 0.01507568359375, + "learning_rate": 0.02888497454000585, + "loss": 0.7974, + "num_input_tokens_seen": 16455824, + "step": 28355 + }, + { + "epoch": 4.224009532320524, + "grad_norm": 0.0247802734375, + "learning_rate": 0.02888423678278556, + "loss": 0.7978, + "num_input_tokens_seen": 16458672, + "step": 28360 + }, + { + "epoch": 4.224754244861484, + "grad_norm": 0.01300048828125, + "learning_rate": 0.0288834987910052, + "loss": 0.7922, + "num_input_tokens_seen": 16461584, + "step": 28365 + }, + { + "epoch": 4.225498957402443, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02888276056467724, + "loss": 0.8065, + "num_input_tokens_seen": 16464368, + "step": 28370 + }, + { + "epoch": 4.226243669943402, + "grad_norm": 0.02197265625, + "learning_rate": 0.028882022103814155, + "loss": 0.8167, + "num_input_tokens_seen": 16467440, + "step": 28375 + }, + { + "epoch": 4.226988382484361, + "grad_norm": 0.0137939453125, + "learning_rate": 0.028881283408428406, + "loss": 0.8222, + "num_input_tokens_seen": 16470192, + "step": 28380 + }, + { + "epoch": 4.22773309502532, + "grad_norm": 0.028076171875, + "learning_rate": 0.02888054447853249, + "loss": 0.7872, + "num_input_tokens_seen": 16473552, + "step": 28385 + }, + { + "epoch": 4.2284778075662794, + "grad_norm": 0.046630859375, + "learning_rate": 0.02887980531413888, + "loss": 0.8307, + "num_input_tokens_seen": 16476368, + "step": 28390 + }, + { + "epoch": 4.229222520107238, + "grad_norm": 0.0301513671875, + "learning_rate": 0.028879065915260065, + "loss": 0.8063, + "num_input_tokens_seen": 16479120, + "step": 28395 + }, + { + "epoch": 4.229967232648198, + "grad_norm": 0.0291748046875, + "learning_rate": 0.02887832628190854, + "loss": 0.7901, + "num_input_tokens_seen": 16482064, + "step": 28400 + }, + { + "epoch": 4.230711945189157, + "grad_norm": 0.036376953125, + "learning_rate": 0.0288775864140968, + "loss": 0.807, + "num_input_tokens_seen": 16484688, + "step": 28405 + }, + { + "epoch": 4.231456657730116, + "grad_norm": 0.01953125, + "learning_rate": 0.02887684631183733, + "loss": 0.7908, + "num_input_tokens_seen": 16487536, + "step": 28410 + }, + { + "epoch": 4.232201370271075, + "grad_norm": 0.0211181640625, + "learning_rate": 0.028876105975142657, + "loss": 0.8341, + "num_input_tokens_seen": 16490000, + "step": 28415 + }, + { + "epoch": 4.232946082812035, + "grad_norm": 0.027099609375, + "learning_rate": 0.028875365404025275, + "loss": 0.8385, + "num_input_tokens_seen": 16492880, + "step": 28420 + }, + { + "epoch": 4.233690795352993, + "grad_norm": 0.0194091796875, + "learning_rate": 0.028874624598497694, + "loss": 0.7762, + "num_input_tokens_seen": 16495600, + "step": 28425 + }, + { + "epoch": 4.234435507893953, + "grad_norm": 0.0113525390625, + "learning_rate": 0.028873883558572434, + "loss": 0.7979, + "num_input_tokens_seen": 16498480, + "step": 28430 + }, + { + "epoch": 4.235180220434912, + "grad_norm": 0.014404296875, + "learning_rate": 0.028873142284262007, + "loss": 0.8216, + "num_input_tokens_seen": 16501424, + "step": 28435 + }, + { + "epoch": 4.2359249329758715, + "grad_norm": 0.02099609375, + "learning_rate": 0.028872400775578948, + "loss": 0.8027, + "num_input_tokens_seen": 16504400, + "step": 28440 + }, + { + "epoch": 4.23666964551683, + "grad_norm": 0.0145263671875, + "learning_rate": 0.028871659032535774, + "loss": 0.7931, + "num_input_tokens_seen": 16507248, + "step": 28445 + }, + { + "epoch": 4.23741435805779, + "grad_norm": 0.020751953125, + "learning_rate": 0.02887091705514502, + "loss": 0.804, + "num_input_tokens_seen": 16510032, + "step": 28450 + }, + { + "epoch": 4.238159070598749, + "grad_norm": 0.034912109375, + "learning_rate": 0.02887017484341922, + "loss": 0.7948, + "num_input_tokens_seen": 16512816, + "step": 28455 + }, + { + "epoch": 4.238903783139708, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02886943239737091, + "loss": 0.8183, + "num_input_tokens_seen": 16515504, + "step": 28460 + }, + { + "epoch": 4.239648495680667, + "grad_norm": 0.0390625, + "learning_rate": 0.02886868971701264, + "loss": 0.8096, + "num_input_tokens_seen": 16518128, + "step": 28465 + }, + { + "epoch": 4.240393208221627, + "grad_norm": 0.0223388671875, + "learning_rate": 0.028867946802356954, + "loss": 0.7951, + "num_input_tokens_seen": 16521040, + "step": 28470 + }, + { + "epoch": 4.2411379207625854, + "grad_norm": 0.038818359375, + "learning_rate": 0.028867203653416395, + "loss": 0.8216, + "num_input_tokens_seen": 16524112, + "step": 28475 + }, + { + "epoch": 4.241882633303545, + "grad_norm": 0.02392578125, + "learning_rate": 0.028866460270203526, + "loss": 0.8047, + "num_input_tokens_seen": 16526800, + "step": 28480 + }, + { + "epoch": 4.242627345844504, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02886571665273091, + "loss": 0.799, + "num_input_tokens_seen": 16529488, + "step": 28485 + }, + { + "epoch": 4.2433720583854635, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0288649728010111, + "loss": 0.8025, + "num_input_tokens_seen": 16532336, + "step": 28490 + }, + { + "epoch": 4.244116770926422, + "grad_norm": 0.0264892578125, + "learning_rate": 0.028864228715056665, + "loss": 0.8093, + "num_input_tokens_seen": 16535024, + "step": 28495 + }, + { + "epoch": 4.244861483467382, + "grad_norm": 0.02392578125, + "learning_rate": 0.02886348439488018, + "loss": 0.814, + "num_input_tokens_seen": 16537872, + "step": 28500 + }, + { + "epoch": 4.245606196008341, + "grad_norm": 0.0198974609375, + "learning_rate": 0.028862739840494214, + "loss": 0.7904, + "num_input_tokens_seen": 16540624, + "step": 28505 + }, + { + "epoch": 4.2463509085493, + "grad_norm": 0.019775390625, + "learning_rate": 0.028861995051911345, + "loss": 0.8155, + "num_input_tokens_seen": 16543632, + "step": 28510 + }, + { + "epoch": 4.247095621090259, + "grad_norm": 0.022705078125, + "learning_rate": 0.02886125002914416, + "loss": 0.8086, + "num_input_tokens_seen": 16546480, + "step": 28515 + }, + { + "epoch": 4.247840333631219, + "grad_norm": 0.025390625, + "learning_rate": 0.028860504772205246, + "loss": 0.8123, + "num_input_tokens_seen": 16549328, + "step": 28520 + }, + { + "epoch": 4.2485850461721775, + "grad_norm": 0.0126953125, + "learning_rate": 0.028859759281107192, + "loss": 0.7889, + "num_input_tokens_seen": 16552144, + "step": 28525 + }, + { + "epoch": 4.249329758713137, + "grad_norm": 0.01373291015625, + "learning_rate": 0.02885901355586259, + "loss": 0.7953, + "num_input_tokens_seen": 16555120, + "step": 28530 + }, + { + "epoch": 4.250074471254096, + "grad_norm": 0.02001953125, + "learning_rate": 0.028858267596484036, + "loss": 0.8041, + "num_input_tokens_seen": 16558032, + "step": 28535 + }, + { + "epoch": 4.2508191837950555, + "grad_norm": 0.020263671875, + "learning_rate": 0.02885752140298414, + "loss": 0.7973, + "num_input_tokens_seen": 16561040, + "step": 28540 + }, + { + "epoch": 4.251563896336014, + "grad_norm": 0.02783203125, + "learning_rate": 0.028856774975375504, + "loss": 0.7986, + "num_input_tokens_seen": 16563792, + "step": 28545 + }, + { + "epoch": 4.252308608876973, + "grad_norm": 0.021484375, + "learning_rate": 0.028856028313670738, + "loss": 0.7934, + "num_input_tokens_seen": 16566672, + "step": 28550 + }, + { + "epoch": 4.253053321417933, + "grad_norm": 0.0218505859375, + "learning_rate": 0.028855281417882454, + "loss": 0.7989, + "num_input_tokens_seen": 16569648, + "step": 28555 + }, + { + "epoch": 4.253798033958892, + "grad_norm": 0.0224609375, + "learning_rate": 0.028854534288023276, + "loss": 0.8214, + "num_input_tokens_seen": 16572432, + "step": 28560 + }, + { + "epoch": 4.254542746499851, + "grad_norm": 0.024169921875, + "learning_rate": 0.028853786924105815, + "loss": 0.8088, + "num_input_tokens_seen": 16575088, + "step": 28565 + }, + { + "epoch": 4.25528745904081, + "grad_norm": 0.01300048828125, + "learning_rate": 0.028853039326142714, + "loss": 0.8104, + "num_input_tokens_seen": 16578096, + "step": 28570 + }, + { + "epoch": 4.2560321715817695, + "grad_norm": 0.031005859375, + "learning_rate": 0.028852291494146587, + "loss": 0.8121, + "num_input_tokens_seen": 16580944, + "step": 28575 + }, + { + "epoch": 4.256776884122728, + "grad_norm": 0.033203125, + "learning_rate": 0.028851543428130073, + "loss": 0.8067, + "num_input_tokens_seen": 16583664, + "step": 28580 + }, + { + "epoch": 4.257521596663688, + "grad_norm": 0.021484375, + "learning_rate": 0.028850795128105816, + "loss": 0.8101, + "num_input_tokens_seen": 16586640, + "step": 28585 + }, + { + "epoch": 4.258266309204647, + "grad_norm": 0.0203857421875, + "learning_rate": 0.028850046594086447, + "loss": 0.7872, + "num_input_tokens_seen": 16589424, + "step": 28590 + }, + { + "epoch": 4.259011021745606, + "grad_norm": 0.0196533203125, + "learning_rate": 0.028849297826084624, + "loss": 0.7935, + "num_input_tokens_seen": 16592112, + "step": 28595 + }, + { + "epoch": 4.259755734286565, + "grad_norm": 0.020751953125, + "learning_rate": 0.028848548824112988, + "loss": 0.8116, + "num_input_tokens_seen": 16595120, + "step": 28600 + }, + { + "epoch": 4.260500446827525, + "grad_norm": 0.025634765625, + "learning_rate": 0.02884779958818419, + "loss": 0.8004, + "num_input_tokens_seen": 16597808, + "step": 28605 + }, + { + "epoch": 4.2612451593684835, + "grad_norm": 0.0125732421875, + "learning_rate": 0.0288470501183109, + "loss": 0.8087, + "num_input_tokens_seen": 16600656, + "step": 28610 + }, + { + "epoch": 4.261989871909443, + "grad_norm": 0.0247802734375, + "learning_rate": 0.028846300414505765, + "loss": 0.8122, + "num_input_tokens_seen": 16603408, + "step": 28615 + }, + { + "epoch": 4.262734584450402, + "grad_norm": 0.020263671875, + "learning_rate": 0.02884555047678146, + "loss": 0.7927, + "num_input_tokens_seen": 16606640, + "step": 28620 + }, + { + "epoch": 4.2634792969913615, + "grad_norm": 0.0194091796875, + "learning_rate": 0.028844800305150648, + "loss": 0.7889, + "num_input_tokens_seen": 16609360, + "step": 28625 + }, + { + "epoch": 4.26422400953232, + "grad_norm": 0.01324462890625, + "learning_rate": 0.02884404989962601, + "loss": 0.7953, + "num_input_tokens_seen": 16612112, + "step": 28630 + }, + { + "epoch": 4.26496872207328, + "grad_norm": 0.02880859375, + "learning_rate": 0.028843299260220216, + "loss": 0.7995, + "num_input_tokens_seen": 16615248, + "step": 28635 + }, + { + "epoch": 4.265713434614239, + "grad_norm": 0.03271484375, + "learning_rate": 0.028842548386945957, + "loss": 0.8114, + "num_input_tokens_seen": 16618256, + "step": 28640 + }, + { + "epoch": 4.266458147155198, + "grad_norm": 0.023681640625, + "learning_rate": 0.02884179727981591, + "loss": 0.8001, + "num_input_tokens_seen": 16621616, + "step": 28645 + }, + { + "epoch": 4.267202859696157, + "grad_norm": 0.0196533203125, + "learning_rate": 0.028841045938842763, + "loss": 0.798, + "num_input_tokens_seen": 16624336, + "step": 28650 + }, + { + "epoch": 4.267947572237117, + "grad_norm": 0.01373291015625, + "learning_rate": 0.02884029436403921, + "loss": 0.8111, + "num_input_tokens_seen": 16627376, + "step": 28655 + }, + { + "epoch": 4.2686922847780755, + "grad_norm": 0.0203857421875, + "learning_rate": 0.028839542555417957, + "loss": 0.7942, + "num_input_tokens_seen": 16630512, + "step": 28660 + }, + { + "epoch": 4.269436997319035, + "grad_norm": 0.031005859375, + "learning_rate": 0.028838790512991694, + "loss": 0.7896, + "num_input_tokens_seen": 16633424, + "step": 28665 + }, + { + "epoch": 4.270181709859994, + "grad_norm": 0.032958984375, + "learning_rate": 0.028838038236773136, + "loss": 0.7937, + "num_input_tokens_seen": 16636336, + "step": 28670 + }, + { + "epoch": 4.2709264224009535, + "grad_norm": 0.020263671875, + "learning_rate": 0.02883728572677498, + "loss": 0.8169, + "num_input_tokens_seen": 16638928, + "step": 28675 + }, + { + "epoch": 4.271671134941912, + "grad_norm": 0.0311279296875, + "learning_rate": 0.028836532983009946, + "loss": 0.8178, + "num_input_tokens_seen": 16641872, + "step": 28680 + }, + { + "epoch": 4.272415847482872, + "grad_norm": 0.0283203125, + "learning_rate": 0.028835780005490752, + "loss": 0.7904, + "num_input_tokens_seen": 16644560, + "step": 28685 + }, + { + "epoch": 4.273160560023831, + "grad_norm": 0.01318359375, + "learning_rate": 0.028835026794230115, + "loss": 0.8185, + "num_input_tokens_seen": 16647344, + "step": 28690 + }, + { + "epoch": 4.27390527256479, + "grad_norm": 0.0242919921875, + "learning_rate": 0.028834273349240762, + "loss": 0.8119, + "num_input_tokens_seen": 16650224, + "step": 28695 + }, + { + "epoch": 4.274649985105749, + "grad_norm": 0.0211181640625, + "learning_rate": 0.028833519670535424, + "loss": 0.7908, + "num_input_tokens_seen": 16653424, + "step": 28700 + }, + { + "epoch": 4.275394697646709, + "grad_norm": 0.019775390625, + "learning_rate": 0.02883276575812683, + "loss": 0.7785, + "num_input_tokens_seen": 16656112, + "step": 28705 + }, + { + "epoch": 4.2761394101876675, + "grad_norm": 0.0211181640625, + "learning_rate": 0.028832011612027714, + "loss": 0.7966, + "num_input_tokens_seen": 16658832, + "step": 28710 + }, + { + "epoch": 4.276884122728626, + "grad_norm": 0.020263671875, + "learning_rate": 0.028831257232250825, + "loss": 0.8094, + "num_input_tokens_seen": 16661840, + "step": 28715 + }, + { + "epoch": 4.277628835269586, + "grad_norm": 0.03076171875, + "learning_rate": 0.0288305026188089, + "loss": 0.796, + "num_input_tokens_seen": 16665008, + "step": 28720 + }, + { + "epoch": 4.278373547810546, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02882974777171469, + "loss": 0.8114, + "num_input_tokens_seen": 16668080, + "step": 28725 + }, + { + "epoch": 4.279118260351504, + "grad_norm": 0.019287109375, + "learning_rate": 0.028828992690980944, + "loss": 0.8229, + "num_input_tokens_seen": 16670928, + "step": 28730 + }, + { + "epoch": 4.279862972892463, + "grad_norm": 0.020751953125, + "learning_rate": 0.02882823737662043, + "loss": 0.7977, + "num_input_tokens_seen": 16673296, + "step": 28735 + }, + { + "epoch": 4.280607685433423, + "grad_norm": 0.0201416015625, + "learning_rate": 0.028827481828645894, + "loss": 0.7949, + "num_input_tokens_seen": 16676208, + "step": 28740 + }, + { + "epoch": 4.2813523979743815, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02882672604707011, + "loss": 0.7697, + "num_input_tokens_seen": 16679248, + "step": 28745 + }, + { + "epoch": 4.282097110515341, + "grad_norm": 0.0289306640625, + "learning_rate": 0.028825970031905835, + "loss": 0.8053, + "num_input_tokens_seen": 16682128, + "step": 28750 + }, + { + "epoch": 4.2828418230563, + "grad_norm": 0.020263671875, + "learning_rate": 0.02882521378316585, + "loss": 0.828, + "num_input_tokens_seen": 16685040, + "step": 28755 + }, + { + "epoch": 4.2835865355972595, + "grad_norm": 0.0211181640625, + "learning_rate": 0.028824457300862937, + "loss": 0.7926, + "num_input_tokens_seen": 16687696, + "step": 28760 + }, + { + "epoch": 4.284331248138218, + "grad_norm": 0.0272216796875, + "learning_rate": 0.02882370058500986, + "loss": 0.813, + "num_input_tokens_seen": 16690448, + "step": 28765 + }, + { + "epoch": 4.285075960679178, + "grad_norm": 0.01953125, + "learning_rate": 0.02882294363561942, + "loss": 0.7882, + "num_input_tokens_seen": 16693264, + "step": 28770 + }, + { + "epoch": 4.285820673220137, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02882218645270439, + "loss": 0.7977, + "num_input_tokens_seen": 16695984, + "step": 28775 + }, + { + "epoch": 4.286565385761096, + "grad_norm": 0.0184326171875, + "learning_rate": 0.028821429036277567, + "loss": 0.809, + "num_input_tokens_seen": 16698864, + "step": 28780 + }, + { + "epoch": 4.287310098302055, + "grad_norm": 0.021484375, + "learning_rate": 0.02882067138635175, + "loss": 0.7987, + "num_input_tokens_seen": 16701648, + "step": 28785 + }, + { + "epoch": 4.288054810843015, + "grad_norm": 0.0269775390625, + "learning_rate": 0.02881991350293974, + "loss": 0.8067, + "num_input_tokens_seen": 16704528, + "step": 28790 + }, + { + "epoch": 4.2887995233839735, + "grad_norm": 0.01251220703125, + "learning_rate": 0.028819155386054333, + "loss": 0.8158, + "num_input_tokens_seen": 16707344, + "step": 28795 + }, + { + "epoch": 4.289544235924933, + "grad_norm": 0.02001953125, + "learning_rate": 0.02881839703570834, + "loss": 0.7983, + "num_input_tokens_seen": 16710320, + "step": 28800 + }, + { + "epoch": 4.290288948465892, + "grad_norm": 0.0286865234375, + "learning_rate": 0.028817638451914578, + "loss": 0.8046, + "num_input_tokens_seen": 16713520, + "step": 28805 + }, + { + "epoch": 4.291033661006852, + "grad_norm": 0.020751953125, + "learning_rate": 0.028816879634685856, + "loss": 0.8165, + "num_input_tokens_seen": 16716304, + "step": 28810 + }, + { + "epoch": 4.29177837354781, + "grad_norm": 0.012939453125, + "learning_rate": 0.028816120584034998, + "loss": 0.793, + "num_input_tokens_seen": 16719152, + "step": 28815 + }, + { + "epoch": 4.29252308608877, + "grad_norm": 0.021240234375, + "learning_rate": 0.02881536129997482, + "loss": 0.8, + "num_input_tokens_seen": 16722096, + "step": 28820 + }, + { + "epoch": 4.293267798629729, + "grad_norm": 0.0203857421875, + "learning_rate": 0.028814601782518153, + "loss": 0.8151, + "num_input_tokens_seen": 16724880, + "step": 28825 + }, + { + "epoch": 4.294012511170688, + "grad_norm": 0.0208740234375, + "learning_rate": 0.028813842031677833, + "loss": 0.8017, + "num_input_tokens_seen": 16727792, + "step": 28830 + }, + { + "epoch": 4.294757223711647, + "grad_norm": 0.013916015625, + "learning_rate": 0.028813082047466694, + "loss": 0.8132, + "num_input_tokens_seen": 16730896, + "step": 28835 + }, + { + "epoch": 4.295501936252607, + "grad_norm": 0.0179443359375, + "learning_rate": 0.02881232182989757, + "loss": 0.7828, + "num_input_tokens_seen": 16733872, + "step": 28840 + }, + { + "epoch": 4.2962466487935655, + "grad_norm": 0.0203857421875, + "learning_rate": 0.028811561378983306, + "loss": 0.7956, + "num_input_tokens_seen": 16736528, + "step": 28845 + }, + { + "epoch": 4.296991361334525, + "grad_norm": 0.0185546875, + "learning_rate": 0.028810800694736752, + "loss": 0.8124, + "num_input_tokens_seen": 16739344, + "step": 28850 + }, + { + "epoch": 4.297736073875484, + "grad_norm": 0.01953125, + "learning_rate": 0.028810039777170754, + "loss": 0.7824, + "num_input_tokens_seen": 16742416, + "step": 28855 + }, + { + "epoch": 4.298480786416444, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02880927862629817, + "loss": 0.7988, + "num_input_tokens_seen": 16745456, + "step": 28860 + }, + { + "epoch": 4.299225498957402, + "grad_norm": 0.020751953125, + "learning_rate": 0.028808517242131862, + "loss": 0.7958, + "num_input_tokens_seen": 16748432, + "step": 28865 + }, + { + "epoch": 4.299970211498362, + "grad_norm": 0.0279541015625, + "learning_rate": 0.028807755624684687, + "loss": 0.8158, + "num_input_tokens_seen": 16751408, + "step": 28870 + }, + { + "epoch": 4.300714924039321, + "grad_norm": 0.0260009765625, + "learning_rate": 0.02880699377396952, + "loss": 0.8046, + "num_input_tokens_seen": 16754384, + "step": 28875 + }, + { + "epoch": 4.30145963658028, + "grad_norm": 0.0211181640625, + "learning_rate": 0.028806231689999216, + "loss": 0.7934, + "num_input_tokens_seen": 16757104, + "step": 28880 + }, + { + "epoch": 4.302204349121239, + "grad_norm": 0.0291748046875, + "learning_rate": 0.02880546937278667, + "loss": 0.7944, + "num_input_tokens_seen": 16759824, + "step": 28885 + }, + { + "epoch": 4.302949061662199, + "grad_norm": 0.020751953125, + "learning_rate": 0.028804706822344744, + "loss": 0.8003, + "num_input_tokens_seen": 16762800, + "step": 28890 + }, + { + "epoch": 4.303693774203158, + "grad_norm": 0.0203857421875, + "learning_rate": 0.028803944038686326, + "loss": 0.7937, + "num_input_tokens_seen": 16765616, + "step": 28895 + }, + { + "epoch": 4.304438486744116, + "grad_norm": 0.019775390625, + "learning_rate": 0.028803181021824303, + "loss": 0.8254, + "num_input_tokens_seen": 16768272, + "step": 28900 + }, + { + "epoch": 4.305183199285076, + "grad_norm": 0.01953125, + "learning_rate": 0.028802417771771567, + "loss": 0.8038, + "num_input_tokens_seen": 16771088, + "step": 28905 + }, + { + "epoch": 4.305927911826035, + "grad_norm": 0.0201416015625, + "learning_rate": 0.028801654288541013, + "loss": 0.8041, + "num_input_tokens_seen": 16774352, + "step": 28910 + }, + { + "epoch": 4.306672624366994, + "grad_norm": 0.0284423828125, + "learning_rate": 0.028800890572145535, + "loss": 0.8051, + "num_input_tokens_seen": 16777296, + "step": 28915 + }, + { + "epoch": 4.307417336907953, + "grad_norm": 0.01312255859375, + "learning_rate": 0.028800126622598038, + "loss": 0.8048, + "num_input_tokens_seen": 16780144, + "step": 28920 + }, + { + "epoch": 4.308162049448913, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02879936243991143, + "loss": 0.8152, + "num_input_tokens_seen": 16782992, + "step": 28925 + }, + { + "epoch": 4.3089067619898715, + "grad_norm": 0.01214599609375, + "learning_rate": 0.028798598024098614, + "loss": 0.7976, + "num_input_tokens_seen": 16786192, + "step": 28930 + }, + { + "epoch": 4.309651474530831, + "grad_norm": 0.019775390625, + "learning_rate": 0.02879783337517251, + "loss": 0.7927, + "num_input_tokens_seen": 16789200, + "step": 28935 + }, + { + "epoch": 4.31039618707179, + "grad_norm": 0.01171875, + "learning_rate": 0.028797068493146036, + "loss": 0.8133, + "num_input_tokens_seen": 16792080, + "step": 28940 + }, + { + "epoch": 4.31114089961275, + "grad_norm": 0.017822265625, + "learning_rate": 0.028796303378032115, + "loss": 0.8173, + "num_input_tokens_seen": 16794928, + "step": 28945 + }, + { + "epoch": 4.311885612153708, + "grad_norm": 0.0322265625, + "learning_rate": 0.02879553802984366, + "loss": 0.808, + "num_input_tokens_seen": 16797776, + "step": 28950 + }, + { + "epoch": 4.312630324694668, + "grad_norm": 0.021240234375, + "learning_rate": 0.028794772448593624, + "loss": 0.7968, + "num_input_tokens_seen": 16800848, + "step": 28955 + }, + { + "epoch": 4.313375037235627, + "grad_norm": 0.0189208984375, + "learning_rate": 0.028794006634294923, + "loss": 0.8113, + "num_input_tokens_seen": 16803632, + "step": 28960 + }, + { + "epoch": 4.314119749776586, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0287932405869605, + "loss": 0.7849, + "num_input_tokens_seen": 16806224, + "step": 28965 + }, + { + "epoch": 4.314864462317545, + "grad_norm": 0.0220947265625, + "learning_rate": 0.028792474306603293, + "loss": 0.8253, + "num_input_tokens_seen": 16808912, + "step": 28970 + }, + { + "epoch": 4.315609174858505, + "grad_norm": 0.020263671875, + "learning_rate": 0.028791707793236254, + "loss": 0.8158, + "num_input_tokens_seen": 16811728, + "step": 28975 + }, + { + "epoch": 4.316353887399464, + "grad_norm": 0.01202392578125, + "learning_rate": 0.028790941046872326, + "loss": 0.8283, + "num_input_tokens_seen": 16815024, + "step": 28980 + }, + { + "epoch": 4.317098599940423, + "grad_norm": 0.03271484375, + "learning_rate": 0.028790174067524467, + "loss": 0.7938, + "num_input_tokens_seen": 16817712, + "step": 28985 + }, + { + "epoch": 4.317843312481382, + "grad_norm": 0.01312255859375, + "learning_rate": 0.028789406855205632, + "loss": 0.7963, + "num_input_tokens_seen": 16820240, + "step": 28990 + }, + { + "epoch": 4.318588025022342, + "grad_norm": 0.0203857421875, + "learning_rate": 0.028788639409928783, + "loss": 0.7992, + "num_input_tokens_seen": 16822992, + "step": 28995 + }, + { + "epoch": 4.3193327375633, + "grad_norm": 0.0194091796875, + "learning_rate": 0.028787871731706886, + "loss": 0.804, + "num_input_tokens_seen": 16825872, + "step": 29000 + }, + { + "epoch": 4.32007745010426, + "grad_norm": 0.0115966796875, + "learning_rate": 0.02878710382055291, + "loss": 0.8061, + "num_input_tokens_seen": 16828688, + "step": 29005 + }, + { + "epoch": 4.320822162645219, + "grad_norm": 0.0240478515625, + "learning_rate": 0.028786335676479827, + "loss": 0.805, + "num_input_tokens_seen": 16831632, + "step": 29010 + }, + { + "epoch": 4.321566875186178, + "grad_norm": 0.018798828125, + "learning_rate": 0.028785567299500617, + "loss": 0.7983, + "num_input_tokens_seen": 16834384, + "step": 29015 + }, + { + "epoch": 4.322311587727137, + "grad_norm": 0.0252685546875, + "learning_rate": 0.028784798689628254, + "loss": 0.8197, + "num_input_tokens_seen": 16837168, + "step": 29020 + }, + { + "epoch": 4.323056300268097, + "grad_norm": 0.0262451171875, + "learning_rate": 0.028784029846875732, + "loss": 0.8095, + "num_input_tokens_seen": 16840432, + "step": 29025 + }, + { + "epoch": 4.323801012809056, + "grad_norm": 0.01220703125, + "learning_rate": 0.028783260771256036, + "loss": 0.8073, + "num_input_tokens_seen": 16843088, + "step": 29030 + }, + { + "epoch": 4.324545725350015, + "grad_norm": 0.01312255859375, + "learning_rate": 0.02878249146278215, + "loss": 0.7959, + "num_input_tokens_seen": 16846000, + "step": 29035 + }, + { + "epoch": 4.325290437890974, + "grad_norm": 0.012939453125, + "learning_rate": 0.028781721921467086, + "loss": 0.7855, + "num_input_tokens_seen": 16848752, + "step": 29040 + }, + { + "epoch": 4.326035150431934, + "grad_norm": 0.0291748046875, + "learning_rate": 0.028780952147323832, + "loss": 0.8135, + "num_input_tokens_seen": 16851728, + "step": 29045 + }, + { + "epoch": 4.326779862972892, + "grad_norm": 0.0218505859375, + "learning_rate": 0.028780182140365405, + "loss": 0.8156, + "num_input_tokens_seen": 16854832, + "step": 29050 + }, + { + "epoch": 4.327524575513852, + "grad_norm": 0.0294189453125, + "learning_rate": 0.028779411900604797, + "loss": 0.7999, + "num_input_tokens_seen": 16857680, + "step": 29055 + }, + { + "epoch": 4.328269288054811, + "grad_norm": 0.032470703125, + "learning_rate": 0.028778641428055036, + "loss": 0.8047, + "num_input_tokens_seen": 16860784, + "step": 29060 + }, + { + "epoch": 4.32901400059577, + "grad_norm": 0.01141357421875, + "learning_rate": 0.02877787072272913, + "loss": 0.803, + "num_input_tokens_seen": 16864080, + "step": 29065 + }, + { + "epoch": 4.329758713136729, + "grad_norm": 0.01251220703125, + "learning_rate": 0.028777099784640103, + "loss": 0.7933, + "num_input_tokens_seen": 16866960, + "step": 29070 + }, + { + "epoch": 4.330503425677689, + "grad_norm": 0.0113525390625, + "learning_rate": 0.02877632861380098, + "loss": 0.8079, + "num_input_tokens_seen": 16870128, + "step": 29075 + }, + { + "epoch": 4.331248138218648, + "grad_norm": 0.0260009765625, + "learning_rate": 0.028775557210224782, + "loss": 0.8021, + "num_input_tokens_seen": 16873072, + "step": 29080 + }, + { + "epoch": 4.331992850759606, + "grad_norm": 0.0263671875, + "learning_rate": 0.028774785573924544, + "loss": 0.8014, + "num_input_tokens_seen": 16875984, + "step": 29085 + }, + { + "epoch": 4.332737563300566, + "grad_norm": 0.012939453125, + "learning_rate": 0.02877401370491331, + "loss": 0.7911, + "num_input_tokens_seen": 16879056, + "step": 29090 + }, + { + "epoch": 4.333482275841525, + "grad_norm": 0.038330078125, + "learning_rate": 0.02877324160320411, + "loss": 0.8162, + "num_input_tokens_seen": 16881776, + "step": 29095 + }, + { + "epoch": 4.334226988382484, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02877246926880999, + "loss": 0.7956, + "num_input_tokens_seen": 16884592, + "step": 29100 + }, + { + "epoch": 4.334971700923443, + "grad_norm": 0.018798828125, + "learning_rate": 0.028771696701744003, + "loss": 0.7967, + "num_input_tokens_seen": 16887344, + "step": 29105 + }, + { + "epoch": 4.335716413464403, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0287709239020192, + "loss": 0.7903, + "num_input_tokens_seen": 16890576, + "step": 29110 + }, + { + "epoch": 4.336461126005362, + "grad_norm": 0.0208740234375, + "learning_rate": 0.028770150869648625, + "loss": 0.7816, + "num_input_tokens_seen": 16893488, + "step": 29115 + }, + { + "epoch": 4.337205838546321, + "grad_norm": 0.01953125, + "learning_rate": 0.028769377604645353, + "loss": 0.8034, + "num_input_tokens_seen": 16896528, + "step": 29120 + }, + { + "epoch": 4.33795055108728, + "grad_norm": 0.021240234375, + "learning_rate": 0.028768604107022436, + "loss": 0.8105, + "num_input_tokens_seen": 16899568, + "step": 29125 + }, + { + "epoch": 4.33869526362824, + "grad_norm": 0.019775390625, + "learning_rate": 0.028767830376792952, + "loss": 0.8048, + "num_input_tokens_seen": 16902256, + "step": 29130 + }, + { + "epoch": 4.339439976169198, + "grad_norm": 0.01171875, + "learning_rate": 0.028767056413969965, + "loss": 0.8034, + "num_input_tokens_seen": 16904976, + "step": 29135 + }, + { + "epoch": 4.340184688710158, + "grad_norm": 0.0250244140625, + "learning_rate": 0.02876628221856655, + "loss": 0.8054, + "num_input_tokens_seen": 16907760, + "step": 29140 + }, + { + "epoch": 4.340929401251117, + "grad_norm": 0.01318359375, + "learning_rate": 0.028765507790595786, + "loss": 0.812, + "num_input_tokens_seen": 16910640, + "step": 29145 + }, + { + "epoch": 4.3416741137920765, + "grad_norm": 0.0244140625, + "learning_rate": 0.028764733130070767, + "loss": 0.8362, + "num_input_tokens_seen": 16913296, + "step": 29150 + }, + { + "epoch": 4.342418826333035, + "grad_norm": 0.0201416015625, + "learning_rate": 0.028763958237004568, + "loss": 0.7985, + "num_input_tokens_seen": 16916112, + "step": 29155 + }, + { + "epoch": 4.343163538873995, + "grad_norm": 0.021484375, + "learning_rate": 0.028763183111410278, + "loss": 0.8081, + "num_input_tokens_seen": 16918960, + "step": 29160 + }, + { + "epoch": 4.343908251414954, + "grad_norm": 0.0191650390625, + "learning_rate": 0.028762407753301, + "loss": 0.8219, + "num_input_tokens_seen": 16922064, + "step": 29165 + }, + { + "epoch": 4.344652963955913, + "grad_norm": 0.01336669921875, + "learning_rate": 0.028761632162689836, + "loss": 0.8112, + "num_input_tokens_seen": 16924912, + "step": 29170 + }, + { + "epoch": 4.345397676496872, + "grad_norm": 0.029296875, + "learning_rate": 0.028760856339589882, + "loss": 0.8045, + "num_input_tokens_seen": 16927760, + "step": 29175 + }, + { + "epoch": 4.346142389037832, + "grad_norm": 0.014404296875, + "learning_rate": 0.02876008028401424, + "loss": 0.7986, + "num_input_tokens_seen": 16930352, + "step": 29180 + }, + { + "epoch": 4.34688710157879, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02875930399597603, + "loss": 0.7969, + "num_input_tokens_seen": 16933040, + "step": 29185 + }, + { + "epoch": 4.34763181411975, + "grad_norm": 0.0205078125, + "learning_rate": 0.028758527475488363, + "loss": 0.8027, + "num_input_tokens_seen": 16935632, + "step": 29190 + }, + { + "epoch": 4.348376526660709, + "grad_norm": 0.042236328125, + "learning_rate": 0.02875775072256436, + "loss": 0.8265, + "num_input_tokens_seen": 16938768, + "step": 29195 + }, + { + "epoch": 4.3491212392016685, + "grad_norm": 0.033447265625, + "learning_rate": 0.02875697373721714, + "loss": 0.7933, + "num_input_tokens_seen": 16941936, + "step": 29200 + }, + { + "epoch": 4.349865951742627, + "grad_norm": 0.0247802734375, + "learning_rate": 0.02875619651945983, + "loss": 0.8157, + "num_input_tokens_seen": 16944816, + "step": 29205 + }, + { + "epoch": 4.350610664283587, + "grad_norm": 0.0247802734375, + "learning_rate": 0.028755419069305562, + "loss": 0.806, + "num_input_tokens_seen": 16947760, + "step": 29210 + }, + { + "epoch": 4.351355376824546, + "grad_norm": 0.021240234375, + "learning_rate": 0.02875464138676747, + "loss": 0.8233, + "num_input_tokens_seen": 16950480, + "step": 29215 + }, + { + "epoch": 4.352100089365505, + "grad_norm": 0.022216796875, + "learning_rate": 0.028753863471858692, + "loss": 0.8094, + "num_input_tokens_seen": 16953296, + "step": 29220 + }, + { + "epoch": 4.352844801906464, + "grad_norm": 0.0208740234375, + "learning_rate": 0.028753085324592368, + "loss": 0.7987, + "num_input_tokens_seen": 16956112, + "step": 29225 + }, + { + "epoch": 4.353589514447424, + "grad_norm": 0.0269775390625, + "learning_rate": 0.02875230694498164, + "loss": 0.8042, + "num_input_tokens_seen": 16958992, + "step": 29230 + }, + { + "epoch": 4.3543342269883825, + "grad_norm": 0.02490234375, + "learning_rate": 0.02875152833303967, + "loss": 0.8111, + "num_input_tokens_seen": 16961904, + "step": 29235 + }, + { + "epoch": 4.355078939529342, + "grad_norm": 0.01318359375, + "learning_rate": 0.028750749488779606, + "loss": 0.8109, + "num_input_tokens_seen": 16964784, + "step": 29240 + }, + { + "epoch": 4.355823652070301, + "grad_norm": 0.031494140625, + "learning_rate": 0.028749970412214602, + "loss": 0.8132, + "num_input_tokens_seen": 16967664, + "step": 29245 + }, + { + "epoch": 4.35656836461126, + "grad_norm": 0.012939453125, + "learning_rate": 0.028749191103357823, + "loss": 0.7905, + "num_input_tokens_seen": 16970384, + "step": 29250 + }, + { + "epoch": 4.357313077152219, + "grad_norm": 0.029052734375, + "learning_rate": 0.028748411562222437, + "loss": 0.8069, + "num_input_tokens_seen": 16973328, + "step": 29255 + }, + { + "epoch": 4.358057789693178, + "grad_norm": 0.0194091796875, + "learning_rate": 0.028747631788821608, + "loss": 0.8083, + "num_input_tokens_seen": 16976176, + "step": 29260 + }, + { + "epoch": 4.358802502234138, + "grad_norm": 0.012939453125, + "learning_rate": 0.028746851783168513, + "loss": 0.7915, + "num_input_tokens_seen": 16979024, + "step": 29265 + }, + { + "epoch": 4.359547214775096, + "grad_norm": 0.0189208984375, + "learning_rate": 0.028746071545276326, + "loss": 0.8021, + "num_input_tokens_seen": 16982192, + "step": 29270 + }, + { + "epoch": 4.360291927316056, + "grad_norm": 0.027587890625, + "learning_rate": 0.028745291075158236, + "loss": 0.8125, + "num_input_tokens_seen": 16985104, + "step": 29275 + }, + { + "epoch": 4.361036639857015, + "grad_norm": 0.020263671875, + "learning_rate": 0.028744510372827423, + "loss": 0.8092, + "num_input_tokens_seen": 16988112, + "step": 29280 + }, + { + "epoch": 4.3617813523979745, + "grad_norm": 0.011962890625, + "learning_rate": 0.028743729438297075, + "loss": 0.7869, + "num_input_tokens_seen": 16990960, + "step": 29285 + }, + { + "epoch": 4.362526064938933, + "grad_norm": 0.038818359375, + "learning_rate": 0.028742948271580384, + "loss": 0.7845, + "num_input_tokens_seen": 16993840, + "step": 29290 + }, + { + "epoch": 4.363270777479893, + "grad_norm": 0.024658203125, + "learning_rate": 0.028742166872690558, + "loss": 0.7816, + "num_input_tokens_seen": 16996816, + "step": 29295 + }, + { + "epoch": 4.364015490020852, + "grad_norm": 0.01226806640625, + "learning_rate": 0.028741385241640778, + "loss": 0.7909, + "num_input_tokens_seen": 16999920, + "step": 29300 + }, + { + "epoch": 4.364760202561811, + "grad_norm": 0.0208740234375, + "learning_rate": 0.028740603378444265, + "loss": 0.7952, + "num_input_tokens_seen": 17002704, + "step": 29305 + }, + { + "epoch": 4.36550491510277, + "grad_norm": 0.02001953125, + "learning_rate": 0.02873982128311423, + "loss": 0.7948, + "num_input_tokens_seen": 17005552, + "step": 29310 + }, + { + "epoch": 4.36624962764373, + "grad_norm": 0.0250244140625, + "learning_rate": 0.028739038955663872, + "loss": 0.8091, + "num_input_tokens_seen": 17008528, + "step": 29315 + }, + { + "epoch": 4.3669943401846885, + "grad_norm": 0.0264892578125, + "learning_rate": 0.02873825639610642, + "loss": 0.7983, + "num_input_tokens_seen": 17011216, + "step": 29320 + }, + { + "epoch": 4.367739052725648, + "grad_norm": 0.0211181640625, + "learning_rate": 0.02873747360445508, + "loss": 0.794, + "num_input_tokens_seen": 17014192, + "step": 29325 + }, + { + "epoch": 4.368483765266607, + "grad_norm": 0.029296875, + "learning_rate": 0.02873669058072309, + "loss": 0.8125, + "num_input_tokens_seen": 17017072, + "step": 29330 + }, + { + "epoch": 4.3692284778075665, + "grad_norm": 0.017578125, + "learning_rate": 0.02873590732492368, + "loss": 0.7691, + "num_input_tokens_seen": 17019952, + "step": 29335 + }, + { + "epoch": 4.369973190348525, + "grad_norm": 0.0208740234375, + "learning_rate": 0.02873512383707007, + "loss": 0.7743, + "num_input_tokens_seen": 17022864, + "step": 29340 + }, + { + "epoch": 4.370717902889485, + "grad_norm": 0.0303955078125, + "learning_rate": 0.028734340117175503, + "loss": 0.8227, + "num_input_tokens_seen": 17025616, + "step": 29345 + }, + { + "epoch": 4.371462615430444, + "grad_norm": 0.01904296875, + "learning_rate": 0.028733556165253218, + "loss": 0.7928, + "num_input_tokens_seen": 17028464, + "step": 29350 + }, + { + "epoch": 4.372207327971403, + "grad_norm": 0.0179443359375, + "learning_rate": 0.02873277198131646, + "loss": 0.7917, + "num_input_tokens_seen": 17031504, + "step": 29355 + }, + { + "epoch": 4.372952040512362, + "grad_norm": 0.01953125, + "learning_rate": 0.02873198756537848, + "loss": 0.7817, + "num_input_tokens_seen": 17034384, + "step": 29360 + }, + { + "epoch": 4.373696753053322, + "grad_norm": 0.0230712890625, + "learning_rate": 0.028731202917452524, + "loss": 0.7882, + "num_input_tokens_seen": 17037296, + "step": 29365 + }, + { + "epoch": 4.3744414655942805, + "grad_norm": 0.025634765625, + "learning_rate": 0.02873041803755185, + "loss": 0.7914, + "num_input_tokens_seen": 17040144, + "step": 29370 + }, + { + "epoch": 4.37518617813524, + "grad_norm": 0.0224609375, + "learning_rate": 0.028729632925689717, + "loss": 0.8113, + "num_input_tokens_seen": 17043120, + "step": 29375 + }, + { + "epoch": 4.375930890676199, + "grad_norm": 0.02294921875, + "learning_rate": 0.028728847581879394, + "loss": 0.7939, + "num_input_tokens_seen": 17045904, + "step": 29380 + }, + { + "epoch": 4.3766756032171585, + "grad_norm": 0.0189208984375, + "learning_rate": 0.028728062006134144, + "loss": 0.7757, + "num_input_tokens_seen": 17048496, + "step": 29385 + }, + { + "epoch": 4.377420315758117, + "grad_norm": 0.02490234375, + "learning_rate": 0.028727276198467235, + "loss": 0.7878, + "num_input_tokens_seen": 17051600, + "step": 29390 + }, + { + "epoch": 4.378165028299077, + "grad_norm": 0.0233154296875, + "learning_rate": 0.028726490158891952, + "loss": 0.8034, + "num_input_tokens_seen": 17054448, + "step": 29395 + }, + { + "epoch": 4.378909740840036, + "grad_norm": 0.035888671875, + "learning_rate": 0.028725703887421565, + "loss": 0.8164, + "num_input_tokens_seen": 17057424, + "step": 29400 + }, + { + "epoch": 4.379654453380995, + "grad_norm": 0.0247802734375, + "learning_rate": 0.028724917384069362, + "loss": 0.7801, + "num_input_tokens_seen": 17060336, + "step": 29405 + }, + { + "epoch": 4.380399165921954, + "grad_norm": 0.033447265625, + "learning_rate": 0.028724130648848628, + "loss": 0.8119, + "num_input_tokens_seen": 17063216, + "step": 29410 + }, + { + "epoch": 4.381143878462913, + "grad_norm": 0.041748046875, + "learning_rate": 0.028723343681772658, + "loss": 0.7975, + "num_input_tokens_seen": 17066320, + "step": 29415 + }, + { + "epoch": 4.3818885910038725, + "grad_norm": 0.0238037109375, + "learning_rate": 0.028722556482854742, + "loss": 0.8062, + "num_input_tokens_seen": 17069456, + "step": 29420 + }, + { + "epoch": 4.382633303544832, + "grad_norm": 0.02001953125, + "learning_rate": 0.028721769052108184, + "loss": 0.8106, + "num_input_tokens_seen": 17072176, + "step": 29425 + }, + { + "epoch": 4.383378016085791, + "grad_norm": 0.012451171875, + "learning_rate": 0.028720981389546277, + "loss": 0.8139, + "num_input_tokens_seen": 17075056, + "step": 29430 + }, + { + "epoch": 4.38412272862675, + "grad_norm": 0.01409912109375, + "learning_rate": 0.028720193495182345, + "loss": 0.8292, + "num_input_tokens_seen": 17077712, + "step": 29435 + }, + { + "epoch": 4.384867441167709, + "grad_norm": 0.02880859375, + "learning_rate": 0.02871940536902968, + "loss": 0.8245, + "num_input_tokens_seen": 17080688, + "step": 29440 + }, + { + "epoch": 4.385612153708668, + "grad_norm": 0.0172119140625, + "learning_rate": 0.028718617011101608, + "loss": 0.8232, + "num_input_tokens_seen": 17083792, + "step": 29445 + }, + { + "epoch": 4.386356866249628, + "grad_norm": 0.026123046875, + "learning_rate": 0.028717828421411445, + "loss": 0.8151, + "num_input_tokens_seen": 17086800, + "step": 29450 + }, + { + "epoch": 4.3871015787905865, + "grad_norm": 0.01055908203125, + "learning_rate": 0.028717039599972512, + "loss": 0.7856, + "num_input_tokens_seen": 17089616, + "step": 29455 + }, + { + "epoch": 4.387846291331546, + "grad_norm": 0.0186767578125, + "learning_rate": 0.028716250546798137, + "loss": 0.8003, + "num_input_tokens_seen": 17092592, + "step": 29460 + }, + { + "epoch": 4.388591003872505, + "grad_norm": 0.0186767578125, + "learning_rate": 0.028715461261901647, + "loss": 0.8048, + "num_input_tokens_seen": 17095440, + "step": 29465 + }, + { + "epoch": 4.3893357164134645, + "grad_norm": 0.0252685546875, + "learning_rate": 0.02871467174529638, + "loss": 0.7855, + "num_input_tokens_seen": 17098000, + "step": 29470 + }, + { + "epoch": 4.390080428954423, + "grad_norm": 0.0123291015625, + "learning_rate": 0.02871388199699568, + "loss": 0.7998, + "num_input_tokens_seen": 17100816, + "step": 29475 + }, + { + "epoch": 4.390825141495383, + "grad_norm": 0.0185546875, + "learning_rate": 0.028713092017012873, + "loss": 0.7971, + "num_input_tokens_seen": 17103600, + "step": 29480 + }, + { + "epoch": 4.391569854036342, + "grad_norm": 0.026123046875, + "learning_rate": 0.028712301805361315, + "loss": 0.7998, + "num_input_tokens_seen": 17106416, + "step": 29485 + }, + { + "epoch": 4.392314566577301, + "grad_norm": 0.0179443359375, + "learning_rate": 0.028711511362054357, + "loss": 0.8124, + "num_input_tokens_seen": 17109648, + "step": 29490 + }, + { + "epoch": 4.39305927911826, + "grad_norm": 0.029052734375, + "learning_rate": 0.02871072068710535, + "loss": 0.8286, + "num_input_tokens_seen": 17112464, + "step": 29495 + }, + { + "epoch": 4.39380399165922, + "grad_norm": 0.0146484375, + "learning_rate": 0.02870992978052765, + "loss": 0.8333, + "num_input_tokens_seen": 17115568, + "step": 29500 + }, + { + "epoch": 4.3945487042001785, + "grad_norm": 0.018798828125, + "learning_rate": 0.02870913864233463, + "loss": 0.8117, + "num_input_tokens_seen": 17118736, + "step": 29505 + }, + { + "epoch": 4.395293416741138, + "grad_norm": 0.012451171875, + "learning_rate": 0.028708347272539632, + "loss": 0.8091, + "num_input_tokens_seen": 17121296, + "step": 29510 + }, + { + "epoch": 4.396038129282097, + "grad_norm": 0.01904296875, + "learning_rate": 0.028707555671156047, + "loss": 0.8005, + "num_input_tokens_seen": 17123952, + "step": 29515 + }, + { + "epoch": 4.396782841823057, + "grad_norm": 0.0179443359375, + "learning_rate": 0.02870676383819724, + "loss": 0.7965, + "num_input_tokens_seen": 17126896, + "step": 29520 + }, + { + "epoch": 4.397527554364015, + "grad_norm": 0.01324462890625, + "learning_rate": 0.028705971773676585, + "loss": 0.814, + "num_input_tokens_seen": 17129744, + "step": 29525 + }, + { + "epoch": 4.398272266904975, + "grad_norm": 0.0247802734375, + "learning_rate": 0.028705179477607472, + "loss": 0.7872, + "num_input_tokens_seen": 17132656, + "step": 29530 + }, + { + "epoch": 4.399016979445934, + "grad_norm": 0.02392578125, + "learning_rate": 0.028704386950003283, + "loss": 0.8159, + "num_input_tokens_seen": 17135408, + "step": 29535 + }, + { + "epoch": 4.399761691986893, + "grad_norm": 0.010986328125, + "learning_rate": 0.028703594190877396, + "loss": 0.8018, + "num_input_tokens_seen": 17138352, + "step": 29540 + }, + { + "epoch": 4.400506404527852, + "grad_norm": 0.0220947265625, + "learning_rate": 0.028702801200243218, + "loss": 0.8103, + "num_input_tokens_seen": 17141168, + "step": 29545 + }, + { + "epoch": 4.401251117068812, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02870200797811414, + "loss": 0.8181, + "num_input_tokens_seen": 17144144, + "step": 29550 + }, + { + "epoch": 4.4019958296097705, + "grad_norm": 0.0137939453125, + "learning_rate": 0.028701214524503565, + "loss": 0.7945, + "num_input_tokens_seen": 17147024, + "step": 29555 + }, + { + "epoch": 4.40274054215073, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0287004208394249, + "loss": 0.7948, + "num_input_tokens_seen": 17149744, + "step": 29560 + }, + { + "epoch": 4.403485254691689, + "grad_norm": 0.01092529296875, + "learning_rate": 0.02869962692289154, + "loss": 0.8005, + "num_input_tokens_seen": 17152784, + "step": 29565 + }, + { + "epoch": 4.404229967232649, + "grad_norm": 0.01904296875, + "learning_rate": 0.02869883277491691, + "loss": 0.802, + "num_input_tokens_seen": 17155760, + "step": 29570 + }, + { + "epoch": 4.404974679773607, + "grad_norm": 0.0194091796875, + "learning_rate": 0.028698038395514422, + "loss": 0.7959, + "num_input_tokens_seen": 17158384, + "step": 29575 + }, + { + "epoch": 4.405719392314566, + "grad_norm": 0.01116943359375, + "learning_rate": 0.028697243784697502, + "loss": 0.8069, + "num_input_tokens_seen": 17161328, + "step": 29580 + }, + { + "epoch": 4.406464104855526, + "grad_norm": 0.0201416015625, + "learning_rate": 0.028696448942479566, + "loss": 0.8041, + "num_input_tokens_seen": 17163984, + "step": 29585 + }, + { + "epoch": 4.407208817396485, + "grad_norm": 0.0179443359375, + "learning_rate": 0.028695653868874047, + "loss": 0.8186, + "num_input_tokens_seen": 17166864, + "step": 29590 + }, + { + "epoch": 4.407953529937444, + "grad_norm": 0.0269775390625, + "learning_rate": 0.028694858563894376, + "loss": 0.8093, + "num_input_tokens_seen": 17169584, + "step": 29595 + }, + { + "epoch": 4.408698242478403, + "grad_norm": 0.0203857421875, + "learning_rate": 0.028694063027553986, + "loss": 0.8034, + "num_input_tokens_seen": 17172304, + "step": 29600 + }, + { + "epoch": 4.409442955019363, + "grad_norm": 0.018798828125, + "learning_rate": 0.02869326725986632, + "loss": 0.7937, + "num_input_tokens_seen": 17175408, + "step": 29605 + }, + { + "epoch": 4.410187667560321, + "grad_norm": 0.0281982421875, + "learning_rate": 0.028692471260844826, + "loss": 0.8139, + "num_input_tokens_seen": 17178352, + "step": 29610 + }, + { + "epoch": 4.410932380101281, + "grad_norm": 0.01251220703125, + "learning_rate": 0.028691675030502937, + "loss": 0.816, + "num_input_tokens_seen": 17181392, + "step": 29615 + }, + { + "epoch": 4.41167709264224, + "grad_norm": 0.019287109375, + "learning_rate": 0.02869087856885412, + "loss": 0.8085, + "num_input_tokens_seen": 17184240, + "step": 29620 + }, + { + "epoch": 4.412421805183199, + "grad_norm": 0.011962890625, + "learning_rate": 0.028690081875911825, + "loss": 0.803, + "num_input_tokens_seen": 17187344, + "step": 29625 + }, + { + "epoch": 4.413166517724158, + "grad_norm": 0.02490234375, + "learning_rate": 0.028689284951689507, + "loss": 0.807, + "num_input_tokens_seen": 17190064, + "step": 29630 + }, + { + "epoch": 4.413911230265118, + "grad_norm": 0.0286865234375, + "learning_rate": 0.028688487796200635, + "loss": 0.8036, + "num_input_tokens_seen": 17193136, + "step": 29635 + }, + { + "epoch": 4.4146559428060765, + "grad_norm": 0.0184326171875, + "learning_rate": 0.028687690409458674, + "loss": 0.796, + "num_input_tokens_seen": 17196048, + "step": 29640 + }, + { + "epoch": 4.415400655347036, + "grad_norm": 0.0238037109375, + "learning_rate": 0.028686892791477094, + "loss": 0.7981, + "num_input_tokens_seen": 17198992, + "step": 29645 + }, + { + "epoch": 4.416145367887995, + "grad_norm": 0.021484375, + "learning_rate": 0.028686094942269372, + "loss": 0.8038, + "num_input_tokens_seen": 17201872, + "step": 29650 + }, + { + "epoch": 4.416890080428955, + "grad_norm": 0.026123046875, + "learning_rate": 0.028685296861848987, + "loss": 0.7947, + "num_input_tokens_seen": 17204720, + "step": 29655 + }, + { + "epoch": 4.417634792969913, + "grad_norm": 0.0322265625, + "learning_rate": 0.02868449855022942, + "loss": 0.8242, + "num_input_tokens_seen": 17207568, + "step": 29660 + }, + { + "epoch": 4.418379505510873, + "grad_norm": 0.018798828125, + "learning_rate": 0.028683700007424158, + "loss": 0.7825, + "num_input_tokens_seen": 17210384, + "step": 29665 + }, + { + "epoch": 4.419124218051832, + "grad_norm": 0.0245361328125, + "learning_rate": 0.028682901233446693, + "loss": 0.8128, + "num_input_tokens_seen": 17213392, + "step": 29670 + }, + { + "epoch": 4.419868930592791, + "grad_norm": 0.0224609375, + "learning_rate": 0.028682102228310514, + "loss": 0.8033, + "num_input_tokens_seen": 17216368, + "step": 29675 + }, + { + "epoch": 4.42061364313375, + "grad_norm": 0.0191650390625, + "learning_rate": 0.028681302992029128, + "loss": 0.7882, + "num_input_tokens_seen": 17219280, + "step": 29680 + }, + { + "epoch": 4.42135835567471, + "grad_norm": 0.021240234375, + "learning_rate": 0.028680503524616034, + "loss": 0.7868, + "num_input_tokens_seen": 17222128, + "step": 29685 + }, + { + "epoch": 4.422103068215669, + "grad_norm": 0.0311279296875, + "learning_rate": 0.028679703826084733, + "loss": 0.7985, + "num_input_tokens_seen": 17224688, + "step": 29690 + }, + { + "epoch": 4.422847780756628, + "grad_norm": 0.0228271484375, + "learning_rate": 0.02867890389644874, + "loss": 0.8133, + "num_input_tokens_seen": 17227792, + "step": 29695 + }, + { + "epoch": 4.423592493297587, + "grad_norm": 0.01446533203125, + "learning_rate": 0.02867810373572157, + "loss": 0.7984, + "num_input_tokens_seen": 17230672, + "step": 29700 + }, + { + "epoch": 4.424337205838547, + "grad_norm": 0.0269775390625, + "learning_rate": 0.028677303343916737, + "loss": 0.8323, + "num_input_tokens_seen": 17233584, + "step": 29705 + }, + { + "epoch": 4.425081918379505, + "grad_norm": 0.0260009765625, + "learning_rate": 0.02867650272104777, + "loss": 0.8331, + "num_input_tokens_seen": 17236240, + "step": 29710 + }, + { + "epoch": 4.425826630920465, + "grad_norm": 0.0179443359375, + "learning_rate": 0.028675701867128187, + "loss": 0.827, + "num_input_tokens_seen": 17239216, + "step": 29715 + }, + { + "epoch": 4.426571343461424, + "grad_norm": 0.0186767578125, + "learning_rate": 0.028674900782171518, + "loss": 0.8114, + "num_input_tokens_seen": 17241840, + "step": 29720 + }, + { + "epoch": 4.427316056002383, + "grad_norm": 0.0181884765625, + "learning_rate": 0.028674099466191302, + "loss": 0.8122, + "num_input_tokens_seen": 17245136, + "step": 29725 + }, + { + "epoch": 4.428060768543342, + "grad_norm": 0.0279541015625, + "learning_rate": 0.02867329791920107, + "loss": 0.7976, + "num_input_tokens_seen": 17247984, + "step": 29730 + }, + { + "epoch": 4.428805481084302, + "grad_norm": 0.02685546875, + "learning_rate": 0.02867249614121437, + "loss": 0.8037, + "num_input_tokens_seen": 17251056, + "step": 29735 + }, + { + "epoch": 4.429550193625261, + "grad_norm": 0.017578125, + "learning_rate": 0.02867169413224474, + "loss": 0.7934, + "num_input_tokens_seen": 17254000, + "step": 29740 + }, + { + "epoch": 4.43029490616622, + "grad_norm": 0.026611328125, + "learning_rate": 0.028670891892305737, + "loss": 0.8015, + "num_input_tokens_seen": 17256944, + "step": 29745 + }, + { + "epoch": 4.431039618707179, + "grad_norm": 0.0269775390625, + "learning_rate": 0.02867008942141091, + "loss": 0.8073, + "num_input_tokens_seen": 17259888, + "step": 29750 + }, + { + "epoch": 4.431784331248139, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02866928671957381, + "loss": 0.7995, + "num_input_tokens_seen": 17262704, + "step": 29755 + }, + { + "epoch": 4.432529043789097, + "grad_norm": 0.01904296875, + "learning_rate": 0.028668483786808006, + "loss": 0.7938, + "num_input_tokens_seen": 17265520, + "step": 29760 + }, + { + "epoch": 4.433273756330056, + "grad_norm": 0.01287841796875, + "learning_rate": 0.028667680623127065, + "loss": 0.8085, + "num_input_tokens_seen": 17268496, + "step": 29765 + }, + { + "epoch": 4.434018468871016, + "grad_norm": 0.0184326171875, + "learning_rate": 0.028666877228544548, + "loss": 0.8093, + "num_input_tokens_seen": 17271280, + "step": 29770 + }, + { + "epoch": 4.434763181411975, + "grad_norm": 0.01385498046875, + "learning_rate": 0.02866607360307403, + "loss": 0.8154, + "num_input_tokens_seen": 17274256, + "step": 29775 + }, + { + "epoch": 4.435507893952934, + "grad_norm": 0.0125732421875, + "learning_rate": 0.02866526974672909, + "loss": 0.8069, + "num_input_tokens_seen": 17277360, + "step": 29780 + }, + { + "epoch": 4.436252606493893, + "grad_norm": 0.0220947265625, + "learning_rate": 0.028664465659523304, + "loss": 0.7846, + "num_input_tokens_seen": 17280464, + "step": 29785 + }, + { + "epoch": 4.436997319034853, + "grad_norm": 0.0108642578125, + "learning_rate": 0.028663661341470262, + "loss": 0.792, + "num_input_tokens_seen": 17282928, + "step": 29790 + }, + { + "epoch": 4.437742031575811, + "grad_norm": 0.02490234375, + "learning_rate": 0.028662856792583544, + "loss": 0.814, + "num_input_tokens_seen": 17285712, + "step": 29795 + }, + { + "epoch": 4.438486744116771, + "grad_norm": 0.030029296875, + "learning_rate": 0.028662052012876752, + "loss": 0.7939, + "num_input_tokens_seen": 17288560, + "step": 29800 + }, + { + "epoch": 4.43923145665773, + "grad_norm": 0.01708984375, + "learning_rate": 0.02866124700236347, + "loss": 0.7977, + "num_input_tokens_seen": 17291824, + "step": 29805 + }, + { + "epoch": 4.439976169198689, + "grad_norm": 0.0113525390625, + "learning_rate": 0.028660441761057313, + "loss": 0.8224, + "num_input_tokens_seen": 17294832, + "step": 29810 + }, + { + "epoch": 4.440720881739648, + "grad_norm": 0.0184326171875, + "learning_rate": 0.028659636288971873, + "loss": 0.7906, + "num_input_tokens_seen": 17297968, + "step": 29815 + }, + { + "epoch": 4.441465594280608, + "grad_norm": 0.013427734375, + "learning_rate": 0.02865883058612076, + "loss": 0.8062, + "num_input_tokens_seen": 17300912, + "step": 29820 + }, + { + "epoch": 4.442210306821567, + "grad_norm": 0.01806640625, + "learning_rate": 0.028658024652517586, + "loss": 0.8051, + "num_input_tokens_seen": 17303760, + "step": 29825 + }, + { + "epoch": 4.442955019362526, + "grad_norm": 0.0186767578125, + "learning_rate": 0.028657218488175967, + "loss": 0.8172, + "num_input_tokens_seen": 17306704, + "step": 29830 + }, + { + "epoch": 4.443699731903485, + "grad_norm": 0.01507568359375, + "learning_rate": 0.02865641209310952, + "loss": 0.8009, + "num_input_tokens_seen": 17309680, + "step": 29835 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.0283203125, + "learning_rate": 0.028655605467331872, + "loss": 0.7941, + "num_input_tokens_seen": 17312432, + "step": 29840 + }, + { + "epoch": 4.445189156985403, + "grad_norm": 0.01806640625, + "learning_rate": 0.028654798610856652, + "loss": 0.8076, + "num_input_tokens_seen": 17315248, + "step": 29845 + }, + { + "epoch": 4.445933869526363, + "grad_norm": 0.01177978515625, + "learning_rate": 0.028653991523697485, + "loss": 0.7852, + "num_input_tokens_seen": 17318000, + "step": 29850 + }, + { + "epoch": 4.446678582067322, + "grad_norm": 0.0303955078125, + "learning_rate": 0.028653184205868005, + "loss": 0.8191, + "num_input_tokens_seen": 17320688, + "step": 29855 + }, + { + "epoch": 4.4474232946082815, + "grad_norm": 0.01287841796875, + "learning_rate": 0.02865237665738186, + "loss": 0.8143, + "num_input_tokens_seen": 17323312, + "step": 29860 + }, + { + "epoch": 4.44816800714924, + "grad_norm": 0.0244140625, + "learning_rate": 0.028651568878252685, + "loss": 0.8014, + "num_input_tokens_seen": 17326320, + "step": 29865 + }, + { + "epoch": 4.4489127196902, + "grad_norm": 0.03125, + "learning_rate": 0.02865076086849413, + "loss": 0.7912, + "num_input_tokens_seen": 17329328, + "step": 29870 + }, + { + "epoch": 4.449657432231159, + "grad_norm": 0.02099609375, + "learning_rate": 0.028649952628119844, + "loss": 0.7981, + "num_input_tokens_seen": 17332368, + "step": 29875 + }, + { + "epoch": 4.450402144772118, + "grad_norm": 0.0235595703125, + "learning_rate": 0.028649144157143477, + "loss": 0.8073, + "num_input_tokens_seen": 17335184, + "step": 29880 + }, + { + "epoch": 4.451146857313077, + "grad_norm": 0.018310546875, + "learning_rate": 0.028648335455578697, + "loss": 0.7834, + "num_input_tokens_seen": 17337872, + "step": 29885 + }, + { + "epoch": 4.451891569854037, + "grad_norm": 0.018798828125, + "learning_rate": 0.028647526523439156, + "loss": 0.7742, + "num_input_tokens_seen": 17340848, + "step": 29890 + }, + { + "epoch": 4.452636282394995, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02864671736073853, + "loss": 0.8094, + "num_input_tokens_seen": 17343984, + "step": 29895 + }, + { + "epoch": 4.453380994935955, + "grad_norm": 0.0230712890625, + "learning_rate": 0.028645907967490482, + "loss": 0.793, + "num_input_tokens_seen": 17347152, + "step": 29900 + }, + { + "epoch": 4.454125707476914, + "grad_norm": 0.03466796875, + "learning_rate": 0.02864509834370869, + "loss": 0.8088, + "num_input_tokens_seen": 17350224, + "step": 29905 + }, + { + "epoch": 4.4548704200178735, + "grad_norm": 0.0230712890625, + "learning_rate": 0.028644288489406828, + "loss": 0.8129, + "num_input_tokens_seen": 17353008, + "step": 29910 + }, + { + "epoch": 4.455615132558832, + "grad_norm": 0.014404296875, + "learning_rate": 0.028643478404598577, + "loss": 0.7928, + "num_input_tokens_seen": 17356240, + "step": 29915 + }, + { + "epoch": 4.456359845099792, + "grad_norm": 0.028564453125, + "learning_rate": 0.028642668089297628, + "loss": 0.8264, + "num_input_tokens_seen": 17359248, + "step": 29920 + }, + { + "epoch": 4.457104557640751, + "grad_norm": 0.02001953125, + "learning_rate": 0.028641857543517668, + "loss": 0.8164, + "num_input_tokens_seen": 17362384, + "step": 29925 + }, + { + "epoch": 4.457849270181709, + "grad_norm": 0.01385498046875, + "learning_rate": 0.02864104676727239, + "loss": 0.7989, + "num_input_tokens_seen": 17365456, + "step": 29930 + }, + { + "epoch": 4.458593982722669, + "grad_norm": 0.021484375, + "learning_rate": 0.02864023576057549, + "loss": 0.7892, + "num_input_tokens_seen": 17368368, + "step": 29935 + }, + { + "epoch": 4.459338695263629, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02863942452344067, + "loss": 0.8131, + "num_input_tokens_seen": 17371312, + "step": 29940 + }, + { + "epoch": 4.4600834078045875, + "grad_norm": 0.0177001953125, + "learning_rate": 0.028638613055881636, + "loss": 0.7873, + "num_input_tokens_seen": 17374000, + "step": 29945 + }, + { + "epoch": 4.460828120345546, + "grad_norm": 0.032470703125, + "learning_rate": 0.02863780135791209, + "loss": 0.8117, + "num_input_tokens_seen": 17376976, + "step": 29950 + }, + { + "epoch": 4.461572832886506, + "grad_norm": 0.017333984375, + "learning_rate": 0.028636989429545758, + "loss": 0.791, + "num_input_tokens_seen": 17380112, + "step": 29955 + }, + { + "epoch": 4.462317545427465, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02863617727079635, + "loss": 0.7993, + "num_input_tokens_seen": 17383120, + "step": 29960 + }, + { + "epoch": 4.463062257968424, + "grad_norm": 0.0126953125, + "learning_rate": 0.02863536488167758, + "loss": 0.8171, + "num_input_tokens_seen": 17386064, + "step": 29965 + }, + { + "epoch": 4.463806970509383, + "grad_norm": 0.024169921875, + "learning_rate": 0.028634552262203183, + "loss": 0.819, + "num_input_tokens_seen": 17388560, + "step": 29970 + }, + { + "epoch": 4.464551683050343, + "grad_norm": 0.018798828125, + "learning_rate": 0.02863373941238688, + "loss": 0.7904, + "num_input_tokens_seen": 17391504, + "step": 29975 + }, + { + "epoch": 4.465296395591301, + "grad_norm": 0.022705078125, + "learning_rate": 0.028632926332242414, + "loss": 0.8185, + "num_input_tokens_seen": 17394288, + "step": 29980 + }, + { + "epoch": 4.466041108132261, + "grad_norm": 0.01806640625, + "learning_rate": 0.02863211302178351, + "loss": 0.7705, + "num_input_tokens_seen": 17397360, + "step": 29985 + }, + { + "epoch": 4.46678582067322, + "grad_norm": 0.0194091796875, + "learning_rate": 0.028631299481023907, + "loss": 0.7935, + "num_input_tokens_seen": 17400432, + "step": 29990 + }, + { + "epoch": 4.4675305332141795, + "grad_norm": 0.02001953125, + "learning_rate": 0.028630485709977355, + "loss": 0.8127, + "num_input_tokens_seen": 17403408, + "step": 29995 + }, + { + "epoch": 4.468275245755138, + "grad_norm": 0.0191650390625, + "learning_rate": 0.028629671708657607, + "loss": 0.7707, + "num_input_tokens_seen": 17406608, + "step": 30000 + }, + { + "epoch": 4.469019958296098, + "grad_norm": 0.0216064453125, + "learning_rate": 0.028628857477078398, + "loss": 0.7969, + "num_input_tokens_seen": 17409360, + "step": 30005 + }, + { + "epoch": 4.469764670837057, + "grad_norm": 0.0145263671875, + "learning_rate": 0.0286280430152535, + "loss": 0.8239, + "num_input_tokens_seen": 17412464, + "step": 30010 + }, + { + "epoch": 4.470509383378016, + "grad_norm": 0.0303955078125, + "learning_rate": 0.028627228323196665, + "loss": 0.8087, + "num_input_tokens_seen": 17415344, + "step": 30015 + }, + { + "epoch": 4.471254095918975, + "grad_norm": 0.01202392578125, + "learning_rate": 0.02862641340092166, + "loss": 0.7876, + "num_input_tokens_seen": 17418224, + "step": 30020 + }, + { + "epoch": 4.471998808459935, + "grad_norm": 0.0269775390625, + "learning_rate": 0.02862559824844224, + "loss": 0.8228, + "num_input_tokens_seen": 17421232, + "step": 30025 + }, + { + "epoch": 4.4727435210008935, + "grad_norm": 0.01458740234375, + "learning_rate": 0.028624782865772195, + "loss": 0.8014, + "num_input_tokens_seen": 17423920, + "step": 30030 + }, + { + "epoch": 4.473488233541853, + "grad_norm": 0.0233154296875, + "learning_rate": 0.028623967252925292, + "loss": 0.8014, + "num_input_tokens_seen": 17426736, + "step": 30035 + }, + { + "epoch": 4.474232946082812, + "grad_norm": 0.01953125, + "learning_rate": 0.028623151409915303, + "loss": 0.8074, + "num_input_tokens_seen": 17429712, + "step": 30040 + }, + { + "epoch": 4.4749776586237715, + "grad_norm": 0.01409912109375, + "learning_rate": 0.02862233533675602, + "loss": 0.8086, + "num_input_tokens_seen": 17432656, + "step": 30045 + }, + { + "epoch": 4.47572237116473, + "grad_norm": 0.020263671875, + "learning_rate": 0.02862151903346123, + "loss": 0.829, + "num_input_tokens_seen": 17435696, + "step": 30050 + }, + { + "epoch": 4.47646708370569, + "grad_norm": 0.018310546875, + "learning_rate": 0.028620702500044712, + "loss": 0.7985, + "num_input_tokens_seen": 17438768, + "step": 30055 + }, + { + "epoch": 4.477211796246649, + "grad_norm": 0.02734375, + "learning_rate": 0.02861988573652028, + "loss": 0.7832, + "num_input_tokens_seen": 17441648, + "step": 30060 + }, + { + "epoch": 4.477956508787608, + "grad_norm": 0.0152587890625, + "learning_rate": 0.028619068742901713, + "loss": 0.8283, + "num_input_tokens_seen": 17444464, + "step": 30065 + }, + { + "epoch": 4.478701221328567, + "grad_norm": 0.0272216796875, + "learning_rate": 0.028618251519202824, + "loss": 0.7999, + "num_input_tokens_seen": 17447184, + "step": 30070 + }, + { + "epoch": 4.479445933869527, + "grad_norm": 0.0157470703125, + "learning_rate": 0.028617434065437417, + "loss": 0.8181, + "num_input_tokens_seen": 17450160, + "step": 30075 + }, + { + "epoch": 4.4801906464104855, + "grad_norm": 0.029296875, + "learning_rate": 0.028616616381619296, + "loss": 0.7969, + "num_input_tokens_seen": 17453360, + "step": 30080 + }, + { + "epoch": 4.480935358951445, + "grad_norm": 0.01904296875, + "learning_rate": 0.028615798467762285, + "loss": 0.8062, + "num_input_tokens_seen": 17456144, + "step": 30085 + }, + { + "epoch": 4.481680071492404, + "grad_norm": 0.01336669921875, + "learning_rate": 0.0286149803238802, + "loss": 0.8042, + "num_input_tokens_seen": 17459152, + "step": 30090 + }, + { + "epoch": 4.4824247840333635, + "grad_norm": 0.0206298828125, + "learning_rate": 0.028614161949986856, + "loss": 0.7962, + "num_input_tokens_seen": 17462032, + "step": 30095 + }, + { + "epoch": 4.483169496574322, + "grad_norm": 0.02880859375, + "learning_rate": 0.028613343346096085, + "loss": 0.8153, + "num_input_tokens_seen": 17464848, + "step": 30100 + }, + { + "epoch": 4.483914209115282, + "grad_norm": 0.011962890625, + "learning_rate": 0.028612524512221713, + "loss": 0.7971, + "num_input_tokens_seen": 17467696, + "step": 30105 + }, + { + "epoch": 4.484658921656241, + "grad_norm": 0.01708984375, + "learning_rate": 0.028611705448377572, + "loss": 0.8003, + "num_input_tokens_seen": 17470480, + "step": 30110 + }, + { + "epoch": 4.4854036341971995, + "grad_norm": 0.024169921875, + "learning_rate": 0.028610886154577506, + "loss": 0.7916, + "num_input_tokens_seen": 17473232, + "step": 30115 + }, + { + "epoch": 4.486148346738159, + "grad_norm": 0.01153564453125, + "learning_rate": 0.028610066630835353, + "loss": 0.791, + "num_input_tokens_seen": 17475984, + "step": 30120 + }, + { + "epoch": 4.486893059279118, + "grad_norm": 0.0191650390625, + "learning_rate": 0.028609246877164953, + "loss": 0.7847, + "num_input_tokens_seen": 17479056, + "step": 30125 + }, + { + "epoch": 4.4876377718200775, + "grad_norm": 0.01239013671875, + "learning_rate": 0.028608426893580157, + "loss": 0.7859, + "num_input_tokens_seen": 17482032, + "step": 30130 + }, + { + "epoch": 4.488382484361036, + "grad_norm": 0.01068115234375, + "learning_rate": 0.02860760668009483, + "loss": 0.8001, + "num_input_tokens_seen": 17484880, + "step": 30135 + }, + { + "epoch": 4.489127196901996, + "grad_norm": 0.02294921875, + "learning_rate": 0.028606786236722807, + "loss": 0.803, + "num_input_tokens_seen": 17487664, + "step": 30140 + }, + { + "epoch": 4.489871909442955, + "grad_norm": 0.017578125, + "learning_rate": 0.028605965563477966, + "loss": 0.7945, + "num_input_tokens_seen": 17490576, + "step": 30145 + }, + { + "epoch": 4.490616621983914, + "grad_norm": 0.0137939453125, + "learning_rate": 0.028605144660374157, + "loss": 0.7855, + "num_input_tokens_seen": 17493488, + "step": 30150 + }, + { + "epoch": 4.491361334524873, + "grad_norm": 0.01263427734375, + "learning_rate": 0.028604323527425268, + "loss": 0.7929, + "num_input_tokens_seen": 17496240, + "step": 30155 + }, + { + "epoch": 4.492106047065833, + "grad_norm": 0.0125732421875, + "learning_rate": 0.02860350216464515, + "loss": 0.782, + "num_input_tokens_seen": 17498800, + "step": 30160 + }, + { + "epoch": 4.4928507596067915, + "grad_norm": 0.018310546875, + "learning_rate": 0.028602680572047696, + "loss": 0.7869, + "num_input_tokens_seen": 17501456, + "step": 30165 + }, + { + "epoch": 4.493595472147751, + "grad_norm": 0.0262451171875, + "learning_rate": 0.02860185874964678, + "loss": 0.8151, + "num_input_tokens_seen": 17504368, + "step": 30170 + }, + { + "epoch": 4.49434018468871, + "grad_norm": 0.03466796875, + "learning_rate": 0.02860103669745628, + "loss": 0.7877, + "num_input_tokens_seen": 17507536, + "step": 30175 + }, + { + "epoch": 4.4950848972296695, + "grad_norm": 0.021728515625, + "learning_rate": 0.02860021441549009, + "loss": 0.8154, + "num_input_tokens_seen": 17510768, + "step": 30180 + }, + { + "epoch": 4.495829609770628, + "grad_norm": 0.020263671875, + "learning_rate": 0.028599391903762095, + "loss": 0.8078, + "num_input_tokens_seen": 17513744, + "step": 30185 + }, + { + "epoch": 4.496574322311588, + "grad_norm": 0.0213623046875, + "learning_rate": 0.028598569162286203, + "loss": 0.7986, + "num_input_tokens_seen": 17516976, + "step": 30190 + }, + { + "epoch": 4.497319034852547, + "grad_norm": 0.0224609375, + "learning_rate": 0.0285977461910763, + "loss": 0.8207, + "num_input_tokens_seen": 17520112, + "step": 30195 + }, + { + "epoch": 4.498063747393506, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0285969229901463, + "loss": 0.802, + "num_input_tokens_seen": 17522832, + "step": 30200 + }, + { + "epoch": 4.498808459934465, + "grad_norm": 0.01904296875, + "learning_rate": 0.028596099559510105, + "loss": 0.7754, + "num_input_tokens_seen": 17525744, + "step": 30205 + }, + { + "epoch": 4.499553172475425, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02859527589918163, + "loss": 0.7869, + "num_input_tokens_seen": 17528464, + "step": 30210 + }, + { + "epoch": 4.5002978850163835, + "grad_norm": 0.02734375, + "learning_rate": 0.02859445200917478, + "loss": 0.7848, + "num_input_tokens_seen": 17531408, + "step": 30215 + }, + { + "epoch": 4.501042597557343, + "grad_norm": 0.0120849609375, + "learning_rate": 0.028593627889503478, + "loss": 0.7982, + "num_input_tokens_seen": 17534224, + "step": 30220 + }, + { + "epoch": 4.501787310098302, + "grad_norm": 0.013427734375, + "learning_rate": 0.028592803540181653, + "loss": 0.8071, + "num_input_tokens_seen": 17537200, + "step": 30225 + }, + { + "epoch": 4.5025320226392616, + "grad_norm": 0.035400390625, + "learning_rate": 0.028591978961223227, + "loss": 0.8049, + "num_input_tokens_seen": 17540272, + "step": 30230 + }, + { + "epoch": 4.50327673518022, + "grad_norm": 0.0191650390625, + "learning_rate": 0.028591154152642132, + "loss": 0.8054, + "num_input_tokens_seen": 17543280, + "step": 30235 + }, + { + "epoch": 4.50402144772118, + "grad_norm": 0.018798828125, + "learning_rate": 0.0285903291144523, + "loss": 0.8058, + "num_input_tokens_seen": 17546288, + "step": 30240 + }, + { + "epoch": 4.504766160262139, + "grad_norm": 0.0322265625, + "learning_rate": 0.02858950384666767, + "loss": 0.7946, + "num_input_tokens_seen": 17549520, + "step": 30245 + }, + { + "epoch": 4.505510872803098, + "grad_norm": 0.0177001953125, + "learning_rate": 0.02858867834930218, + "loss": 0.7838, + "num_input_tokens_seen": 17552368, + "step": 30250 + }, + { + "epoch": 4.506255585344057, + "grad_norm": 0.022705078125, + "learning_rate": 0.028587852622369788, + "loss": 0.8291, + "num_input_tokens_seen": 17555312, + "step": 30255 + }, + { + "epoch": 4.507000297885017, + "grad_norm": 0.01165771484375, + "learning_rate": 0.02858702666588443, + "loss": 0.8086, + "num_input_tokens_seen": 17558416, + "step": 30260 + }, + { + "epoch": 4.5077450104259755, + "grad_norm": 0.0198974609375, + "learning_rate": 0.028586200479860067, + "loss": 0.827, + "num_input_tokens_seen": 17561328, + "step": 30265 + }, + { + "epoch": 4.508489722966935, + "grad_norm": 0.0308837890625, + "learning_rate": 0.028585374064310654, + "loss": 0.7812, + "num_input_tokens_seen": 17564432, + "step": 30270 + }, + { + "epoch": 4.509234435507894, + "grad_norm": 0.036865234375, + "learning_rate": 0.028584547419250157, + "loss": 0.796, + "num_input_tokens_seen": 17567376, + "step": 30275 + }, + { + "epoch": 4.509979148048853, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02858372054469254, + "loss": 0.8057, + "num_input_tokens_seen": 17570288, + "step": 30280 + }, + { + "epoch": 4.510723860589812, + "grad_norm": 0.038818359375, + "learning_rate": 0.028582893440651767, + "loss": 0.8202, + "num_input_tokens_seen": 17573456, + "step": 30285 + }, + { + "epoch": 4.511468573130772, + "grad_norm": 0.0206298828125, + "learning_rate": 0.028582066107141816, + "loss": 0.7792, + "num_input_tokens_seen": 17576176, + "step": 30290 + }, + { + "epoch": 4.512213285671731, + "grad_norm": 0.018798828125, + "learning_rate": 0.02858123854417666, + "loss": 0.777, + "num_input_tokens_seen": 17578800, + "step": 30295 + }, + { + "epoch": 4.5129579982126895, + "grad_norm": 0.0205078125, + "learning_rate": 0.028580410751770282, + "loss": 0.7737, + "num_input_tokens_seen": 17581712, + "step": 30300 + }, + { + "epoch": 4.513702710753649, + "grad_norm": 0.01318359375, + "learning_rate": 0.028579582729936666, + "loss": 0.8357, + "num_input_tokens_seen": 17584688, + "step": 30305 + }, + { + "epoch": 4.514447423294608, + "grad_norm": 0.02197265625, + "learning_rate": 0.028578754478689808, + "loss": 0.7916, + "num_input_tokens_seen": 17587504, + "step": 30310 + }, + { + "epoch": 4.5151921358355676, + "grad_norm": 0.0211181640625, + "learning_rate": 0.028577925998043686, + "loss": 0.7901, + "num_input_tokens_seen": 17590128, + "step": 30315 + }, + { + "epoch": 4.515936848376526, + "grad_norm": 0.0301513671875, + "learning_rate": 0.028577097288012308, + "loss": 0.8093, + "num_input_tokens_seen": 17592976, + "step": 30320 + }, + { + "epoch": 4.516681560917486, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02857626834860967, + "loss": 0.802, + "num_input_tokens_seen": 17595984, + "step": 30325 + }, + { + "epoch": 4.517426273458445, + "grad_norm": 0.0218505859375, + "learning_rate": 0.028575439179849775, + "loss": 0.8222, + "num_input_tokens_seen": 17598832, + "step": 30330 + }, + { + "epoch": 4.518170985999404, + "grad_norm": 0.0224609375, + "learning_rate": 0.028574609781746637, + "loss": 0.7898, + "num_input_tokens_seen": 17601552, + "step": 30335 + }, + { + "epoch": 4.518915698540363, + "grad_norm": 0.029296875, + "learning_rate": 0.02857378015431426, + "loss": 0.8201, + "num_input_tokens_seen": 17604592, + "step": 30340 + }, + { + "epoch": 4.519660411081323, + "grad_norm": 0.033935546875, + "learning_rate": 0.028572950297566662, + "loss": 0.8155, + "num_input_tokens_seen": 17607536, + "step": 30345 + }, + { + "epoch": 4.5204051236222815, + "grad_norm": 0.033935546875, + "learning_rate": 0.028572120211517865, + "loss": 0.7808, + "num_input_tokens_seen": 17610672, + "step": 30350 + }, + { + "epoch": 4.521149836163241, + "grad_norm": 0.0274658203125, + "learning_rate": 0.028571289896181892, + "loss": 0.7976, + "num_input_tokens_seen": 17613520, + "step": 30355 + }, + { + "epoch": 4.5218945487042, + "grad_norm": 0.0147705078125, + "learning_rate": 0.02857045935157277, + "loss": 0.8259, + "num_input_tokens_seen": 17616624, + "step": 30360 + }, + { + "epoch": 4.52263926124516, + "grad_norm": 0.040283203125, + "learning_rate": 0.02856962857770453, + "loss": 0.8207, + "num_input_tokens_seen": 17619792, + "step": 30365 + }, + { + "epoch": 4.523383973786118, + "grad_norm": 0.03173828125, + "learning_rate": 0.0285687975745912, + "loss": 0.8173, + "num_input_tokens_seen": 17622992, + "step": 30370 + }, + { + "epoch": 4.524128686327078, + "grad_norm": 0.0303955078125, + "learning_rate": 0.028567966342246832, + "loss": 0.7878, + "num_input_tokens_seen": 17625840, + "step": 30375 + }, + { + "epoch": 4.524873398868037, + "grad_norm": 0.015380859375, + "learning_rate": 0.028567134880685463, + "loss": 0.8064, + "num_input_tokens_seen": 17628496, + "step": 30380 + }, + { + "epoch": 4.525618111408996, + "grad_norm": 0.0283203125, + "learning_rate": 0.02856630318992114, + "loss": 0.8006, + "num_input_tokens_seen": 17631248, + "step": 30385 + }, + { + "epoch": 4.526362823949955, + "grad_norm": 0.01519775390625, + "learning_rate": 0.028565471269967908, + "loss": 0.8231, + "num_input_tokens_seen": 17634032, + "step": 30390 + }, + { + "epoch": 4.527107536490915, + "grad_norm": 0.021728515625, + "learning_rate": 0.028564639120839827, + "loss": 0.7963, + "num_input_tokens_seen": 17637328, + "step": 30395 + }, + { + "epoch": 4.5278522490318736, + "grad_norm": 0.019775390625, + "learning_rate": 0.028563806742550955, + "loss": 0.8014, + "num_input_tokens_seen": 17640432, + "step": 30400 + }, + { + "epoch": 4.528596961572833, + "grad_norm": 0.0205078125, + "learning_rate": 0.028562974135115358, + "loss": 0.8045, + "num_input_tokens_seen": 17643152, + "step": 30405 + }, + { + "epoch": 4.529341674113792, + "grad_norm": 0.02099609375, + "learning_rate": 0.028562141298547093, + "loss": 0.8, + "num_input_tokens_seen": 17646000, + "step": 30410 + }, + { + "epoch": 4.530086386654752, + "grad_norm": 0.0194091796875, + "learning_rate": 0.028561308232860235, + "loss": 0.8149, + "num_input_tokens_seen": 17648880, + "step": 30415 + }, + { + "epoch": 4.53083109919571, + "grad_norm": 0.031005859375, + "learning_rate": 0.02856047493806886, + "loss": 0.7717, + "num_input_tokens_seen": 17651632, + "step": 30420 + }, + { + "epoch": 4.53157581173667, + "grad_norm": 0.0233154296875, + "learning_rate": 0.028559641414187042, + "loss": 0.7844, + "num_input_tokens_seen": 17654448, + "step": 30425 + }, + { + "epoch": 4.532320524277629, + "grad_norm": 0.017578125, + "learning_rate": 0.028558807661228865, + "loss": 0.8059, + "num_input_tokens_seen": 17657680, + "step": 30430 + }, + { + "epoch": 4.533065236818588, + "grad_norm": 0.021484375, + "learning_rate": 0.028557973679208412, + "loss": 0.815, + "num_input_tokens_seen": 17660528, + "step": 30435 + }, + { + "epoch": 4.533809949359547, + "grad_norm": 0.0206298828125, + "learning_rate": 0.028557139468139772, + "loss": 0.8027, + "num_input_tokens_seen": 17663600, + "step": 30440 + }, + { + "epoch": 4.534554661900506, + "grad_norm": 0.0225830078125, + "learning_rate": 0.02855630502803704, + "loss": 0.7953, + "num_input_tokens_seen": 17666480, + "step": 30445 + }, + { + "epoch": 4.535299374441466, + "grad_norm": 0.029541015625, + "learning_rate": 0.028555470358914314, + "loss": 0.817, + "num_input_tokens_seen": 17669520, + "step": 30450 + }, + { + "epoch": 4.536044086982425, + "grad_norm": 0.0203857421875, + "learning_rate": 0.028554635460785693, + "loss": 0.7987, + "num_input_tokens_seen": 17672208, + "step": 30455 + }, + { + "epoch": 4.536788799523384, + "grad_norm": 0.0206298828125, + "learning_rate": 0.028553800333665284, + "loss": 0.8005, + "num_input_tokens_seen": 17675152, + "step": 30460 + }, + { + "epoch": 4.537533512064343, + "grad_norm": 0.01470947265625, + "learning_rate": 0.02855296497756719, + "loss": 0.8007, + "num_input_tokens_seen": 17678448, + "step": 30465 + }, + { + "epoch": 4.538278224605302, + "grad_norm": 0.017333984375, + "learning_rate": 0.028552129392505527, + "loss": 0.7935, + "num_input_tokens_seen": 17681488, + "step": 30470 + }, + { + "epoch": 4.539022937146262, + "grad_norm": 0.0257568359375, + "learning_rate": 0.028551293578494414, + "loss": 0.8112, + "num_input_tokens_seen": 17684432, + "step": 30475 + }, + { + "epoch": 4.539767649687221, + "grad_norm": 0.0196533203125, + "learning_rate": 0.028550457535547972, + "loss": 0.805, + "num_input_tokens_seen": 17687280, + "step": 30480 + }, + { + "epoch": 4.5405123622281796, + "grad_norm": 0.012451171875, + "learning_rate": 0.028549621263680318, + "loss": 0.8167, + "num_input_tokens_seen": 17690160, + "step": 30485 + }, + { + "epoch": 4.541257074769139, + "grad_norm": 0.01080322265625, + "learning_rate": 0.028548784762905583, + "loss": 0.7753, + "num_input_tokens_seen": 17693008, + "step": 30490 + }, + { + "epoch": 4.542001787310098, + "grad_norm": 0.01275634765625, + "learning_rate": 0.028547948033237902, + "loss": 0.7862, + "num_input_tokens_seen": 17695952, + "step": 30495 + }, + { + "epoch": 4.542746499851058, + "grad_norm": 0.029541015625, + "learning_rate": 0.02854711107469141, + "loss": 0.809, + "num_input_tokens_seen": 17698736, + "step": 30500 + }, + { + "epoch": 4.543491212392016, + "grad_norm": 0.0223388671875, + "learning_rate": 0.028546273887280243, + "loss": 0.7808, + "num_input_tokens_seen": 17701552, + "step": 30505 + }, + { + "epoch": 4.544235924932976, + "grad_norm": 0.030517578125, + "learning_rate": 0.028545436471018548, + "loss": 0.8111, + "num_input_tokens_seen": 17704528, + "step": 30510 + }, + { + "epoch": 4.544980637473935, + "grad_norm": 0.0228271484375, + "learning_rate": 0.028544598825920465, + "loss": 0.7816, + "num_input_tokens_seen": 17707408, + "step": 30515 + }, + { + "epoch": 4.545725350014894, + "grad_norm": 0.01708984375, + "learning_rate": 0.028543760952000157, + "loss": 0.8389, + "num_input_tokens_seen": 17710128, + "step": 30520 + }, + { + "epoch": 4.546470062555853, + "grad_norm": 0.019287109375, + "learning_rate": 0.02854292284927177, + "loss": 0.7794, + "num_input_tokens_seen": 17712976, + "step": 30525 + }, + { + "epoch": 4.547214775096813, + "grad_norm": 0.0191650390625, + "learning_rate": 0.028542084517749467, + "loss": 0.811, + "num_input_tokens_seen": 17715984, + "step": 30530 + }, + { + "epoch": 4.547959487637772, + "grad_norm": 0.01361083984375, + "learning_rate": 0.02854124595744741, + "loss": 0.8053, + "num_input_tokens_seen": 17718928, + "step": 30535 + }, + { + "epoch": 4.548704200178731, + "grad_norm": 0.0213623046875, + "learning_rate": 0.028540407168379765, + "loss": 0.7817, + "num_input_tokens_seen": 17721680, + "step": 30540 + }, + { + "epoch": 4.54944891271969, + "grad_norm": 0.019287109375, + "learning_rate": 0.028539568150560704, + "loss": 0.8037, + "num_input_tokens_seen": 17724400, + "step": 30545 + }, + { + "epoch": 4.55019362526065, + "grad_norm": 0.018798828125, + "learning_rate": 0.0285387289040044, + "loss": 0.7961, + "num_input_tokens_seen": 17727280, + "step": 30550 + }, + { + "epoch": 4.550938337801608, + "grad_norm": 0.029052734375, + "learning_rate": 0.02853788942872503, + "loss": 0.8266, + "num_input_tokens_seen": 17730448, + "step": 30555 + }, + { + "epoch": 4.551683050342568, + "grad_norm": 0.0206298828125, + "learning_rate": 0.028537049724736776, + "loss": 0.7899, + "num_input_tokens_seen": 17733328, + "step": 30560 + }, + { + "epoch": 4.552427762883527, + "grad_norm": 0.0211181640625, + "learning_rate": 0.02853620979205383, + "loss": 0.815, + "num_input_tokens_seen": 17736144, + "step": 30565 + }, + { + "epoch": 4.553172475424486, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02853536963069037, + "loss": 0.8089, + "num_input_tokens_seen": 17739152, + "step": 30570 + }, + { + "epoch": 4.553917187965445, + "grad_norm": 0.02197265625, + "learning_rate": 0.028534529240660602, + "loss": 0.7972, + "num_input_tokens_seen": 17742128, + "step": 30575 + }, + { + "epoch": 4.554661900506405, + "grad_norm": 0.01324462890625, + "learning_rate": 0.028533688621978717, + "loss": 0.797, + "num_input_tokens_seen": 17744976, + "step": 30580 + }, + { + "epoch": 4.555406613047364, + "grad_norm": 0.0208740234375, + "learning_rate": 0.028532847774658917, + "loss": 0.8091, + "num_input_tokens_seen": 17747984, + "step": 30585 + }, + { + "epoch": 4.556151325588323, + "grad_norm": 0.01611328125, + "learning_rate": 0.02853200669871541, + "loss": 0.801, + "num_input_tokens_seen": 17750992, + "step": 30590 + }, + { + "epoch": 4.556896038129282, + "grad_norm": 0.033935546875, + "learning_rate": 0.0285311653941624, + "loss": 0.8026, + "num_input_tokens_seen": 17753968, + "step": 30595 + }, + { + "epoch": 4.557640750670242, + "grad_norm": 0.03369140625, + "learning_rate": 0.028530323861014107, + "loss": 0.7823, + "num_input_tokens_seen": 17756784, + "step": 30600 + }, + { + "epoch": 4.5583854632112, + "grad_norm": 0.0439453125, + "learning_rate": 0.028529482099284742, + "loss": 0.8356, + "num_input_tokens_seen": 17759984, + "step": 30605 + }, + { + "epoch": 4.559130175752159, + "grad_norm": 0.0233154296875, + "learning_rate": 0.028528640108988527, + "loss": 0.7953, + "num_input_tokens_seen": 17763280, + "step": 30610 + }, + { + "epoch": 4.559874888293119, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02852779789013969, + "loss": 0.7899, + "num_input_tokens_seen": 17765872, + "step": 30615 + }, + { + "epoch": 4.5606196008340785, + "grad_norm": 0.021484375, + "learning_rate": 0.028526955442752456, + "loss": 0.7978, + "num_input_tokens_seen": 17768816, + "step": 30620 + }, + { + "epoch": 4.561364313375037, + "grad_norm": 0.02099609375, + "learning_rate": 0.028526112766841055, + "loss": 0.7827, + "num_input_tokens_seen": 17771920, + "step": 30625 + }, + { + "epoch": 4.562109025915996, + "grad_norm": 0.0255126953125, + "learning_rate": 0.02852526986241973, + "loss": 0.79, + "num_input_tokens_seen": 17774640, + "step": 30630 + }, + { + "epoch": 4.562853738456956, + "grad_norm": 0.031494140625, + "learning_rate": 0.028524426729502714, + "loss": 0.8062, + "num_input_tokens_seen": 17777264, + "step": 30635 + }, + { + "epoch": 4.563598450997915, + "grad_norm": 0.0322265625, + "learning_rate": 0.028523583368104254, + "loss": 0.8222, + "num_input_tokens_seen": 17780432, + "step": 30640 + }, + { + "epoch": 4.564343163538874, + "grad_norm": 0.014892578125, + "learning_rate": 0.028522739778238604, + "loss": 0.8046, + "num_input_tokens_seen": 17783312, + "step": 30645 + }, + { + "epoch": 4.565087876079833, + "grad_norm": 0.01483154296875, + "learning_rate": 0.02852189595992, + "loss": 0.8094, + "num_input_tokens_seen": 17786064, + "step": 30650 + }, + { + "epoch": 4.565832588620792, + "grad_norm": 0.01507568359375, + "learning_rate": 0.02852105191316271, + "loss": 0.807, + "num_input_tokens_seen": 17788912, + "step": 30655 + }, + { + "epoch": 4.566577301161751, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02852020763798099, + "loss": 0.7752, + "num_input_tokens_seen": 17791376, + "step": 30660 + }, + { + "epoch": 4.567322013702711, + "grad_norm": 0.02685546875, + "learning_rate": 0.028519363134389106, + "loss": 0.8466, + "num_input_tokens_seen": 17794320, + "step": 30665 + }, + { + "epoch": 4.56806672624367, + "grad_norm": 0.0281982421875, + "learning_rate": 0.02851851840240132, + "loss": 0.8426, + "num_input_tokens_seen": 17797488, + "step": 30670 + }, + { + "epoch": 4.568811438784629, + "grad_norm": 0.0203857421875, + "learning_rate": 0.028517673442031908, + "loss": 0.7943, + "num_input_tokens_seen": 17800208, + "step": 30675 + }, + { + "epoch": 4.569556151325588, + "grad_norm": 0.020263671875, + "learning_rate": 0.02851682825329514, + "loss": 0.7839, + "num_input_tokens_seen": 17802960, + "step": 30680 + }, + { + "epoch": 4.570300863866548, + "grad_norm": 0.02587890625, + "learning_rate": 0.028515982836205298, + "loss": 0.7995, + "num_input_tokens_seen": 17805488, + "step": 30685 + }, + { + "epoch": 4.571045576407506, + "grad_norm": 0.01446533203125, + "learning_rate": 0.02851513719077666, + "loss": 0.823, + "num_input_tokens_seen": 17808208, + "step": 30690 + }, + { + "epoch": 4.571790288948466, + "grad_norm": 0.030517578125, + "learning_rate": 0.028514291317023516, + "loss": 0.8132, + "num_input_tokens_seen": 17810896, + "step": 30695 + }, + { + "epoch": 4.572535001489425, + "grad_norm": 0.027587890625, + "learning_rate": 0.028513445214960156, + "loss": 0.7962, + "num_input_tokens_seen": 17813648, + "step": 30700 + }, + { + "epoch": 4.5732797140303845, + "grad_norm": 0.0208740234375, + "learning_rate": 0.028512598884600875, + "loss": 0.8187, + "num_input_tokens_seen": 17816272, + "step": 30705 + }, + { + "epoch": 4.574024426571343, + "grad_norm": 0.0242919921875, + "learning_rate": 0.028511752325959965, + "loss": 0.8145, + "num_input_tokens_seen": 17819248, + "step": 30710 + }, + { + "epoch": 4.574769139112303, + "grad_norm": 0.0244140625, + "learning_rate": 0.02851090553905174, + "loss": 0.7957, + "num_input_tokens_seen": 17822064, + "step": 30715 + }, + { + "epoch": 4.575513851653262, + "grad_norm": 0.026611328125, + "learning_rate": 0.02851005852389049, + "loss": 0.8134, + "num_input_tokens_seen": 17824816, + "step": 30720 + }, + { + "epoch": 4.576258564194221, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02850921128049054, + "loss": 0.813, + "num_input_tokens_seen": 17827600, + "step": 30725 + }, + { + "epoch": 4.57700327673518, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02850836380886619, + "loss": 0.8114, + "num_input_tokens_seen": 17830768, + "step": 30730 + }, + { + "epoch": 4.57774798927614, + "grad_norm": 0.0216064453125, + "learning_rate": 0.028507516109031763, + "loss": 0.8131, + "num_input_tokens_seen": 17833616, + "step": 30735 + }, + { + "epoch": 4.578492701817098, + "grad_norm": 0.0208740234375, + "learning_rate": 0.028506668181001577, + "loss": 0.7904, + "num_input_tokens_seen": 17836464, + "step": 30740 + }, + { + "epoch": 4.579237414358058, + "grad_norm": 0.01336669921875, + "learning_rate": 0.02850582002478997, + "loss": 0.8038, + "num_input_tokens_seen": 17839280, + "step": 30745 + }, + { + "epoch": 4.579982126899017, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02850497164041125, + "loss": 0.8007, + "num_input_tokens_seen": 17842768, + "step": 30750 + }, + { + "epoch": 4.5807268394399765, + "grad_norm": 0.02783203125, + "learning_rate": 0.02850412302787976, + "loss": 0.8075, + "num_input_tokens_seen": 17845488, + "step": 30755 + }, + { + "epoch": 4.581471551980935, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02850327418720984, + "loss": 0.7873, + "num_input_tokens_seen": 17848304, + "step": 30760 + }, + { + "epoch": 4.582216264521895, + "grad_norm": 0.029541015625, + "learning_rate": 0.02850242511841583, + "loss": 0.82, + "num_input_tokens_seen": 17851312, + "step": 30765 + }, + { + "epoch": 4.582960977062854, + "grad_norm": 0.018798828125, + "learning_rate": 0.028501575821512068, + "loss": 0.8098, + "num_input_tokens_seen": 17854000, + "step": 30770 + }, + { + "epoch": 4.583705689603813, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0285007262965129, + "loss": 0.8209, + "num_input_tokens_seen": 17856848, + "step": 30775 + }, + { + "epoch": 4.584450402144772, + "grad_norm": 0.01226806640625, + "learning_rate": 0.02849987654343269, + "loss": 0.7888, + "num_input_tokens_seen": 17859760, + "step": 30780 + }, + { + "epoch": 4.585195114685732, + "grad_norm": 0.026611328125, + "learning_rate": 0.028499026562285784, + "loss": 0.7996, + "num_input_tokens_seen": 17862448, + "step": 30785 + }, + { + "epoch": 4.5859398272266905, + "grad_norm": 0.0247802734375, + "learning_rate": 0.028498176353086546, + "loss": 0.8158, + "num_input_tokens_seen": 17865392, + "step": 30790 + }, + { + "epoch": 4.586684539767649, + "grad_norm": 0.0272216796875, + "learning_rate": 0.028497325915849334, + "loss": 0.8002, + "num_input_tokens_seen": 17868048, + "step": 30795 + }, + { + "epoch": 4.587429252308609, + "grad_norm": 0.0196533203125, + "learning_rate": 0.028496475250588522, + "loss": 0.7907, + "num_input_tokens_seen": 17870768, + "step": 30800 + }, + { + "epoch": 4.5881739648495685, + "grad_norm": 0.011962890625, + "learning_rate": 0.028495624357318474, + "loss": 0.8201, + "num_input_tokens_seen": 17873520, + "step": 30805 + }, + { + "epoch": 4.588918677390527, + "grad_norm": 0.0137939453125, + "learning_rate": 0.02849477323605357, + "loss": 0.7987, + "num_input_tokens_seen": 17876464, + "step": 30810 + }, + { + "epoch": 4.589663389931486, + "grad_norm": 0.0306396484375, + "learning_rate": 0.02849392188680819, + "loss": 0.7898, + "num_input_tokens_seen": 17879152, + "step": 30815 + }, + { + "epoch": 4.590408102472446, + "grad_norm": 0.01385498046875, + "learning_rate": 0.028493070309596708, + "loss": 0.8048, + "num_input_tokens_seen": 17882000, + "step": 30820 + }, + { + "epoch": 4.591152815013404, + "grad_norm": 0.0189208984375, + "learning_rate": 0.028492218504433526, + "loss": 0.8125, + "num_input_tokens_seen": 17884912, + "step": 30825 + }, + { + "epoch": 4.591897527554364, + "grad_norm": 0.0208740234375, + "learning_rate": 0.02849136647133302, + "loss": 0.7947, + "num_input_tokens_seen": 17887632, + "step": 30830 + }, + { + "epoch": 4.592642240095323, + "grad_norm": 0.02392578125, + "learning_rate": 0.02849051421030959, + "loss": 0.7989, + "num_input_tokens_seen": 17890480, + "step": 30835 + }, + { + "epoch": 4.5933869526362825, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02848966172137763, + "loss": 0.821, + "num_input_tokens_seen": 17893392, + "step": 30840 + }, + { + "epoch": 4.594131665177241, + "grad_norm": 0.019775390625, + "learning_rate": 0.02848880900455155, + "loss": 0.8013, + "num_input_tokens_seen": 17896144, + "step": 30845 + }, + { + "epoch": 4.594876377718201, + "grad_norm": 0.02685546875, + "learning_rate": 0.02848795605984575, + "loss": 0.8022, + "num_input_tokens_seen": 17899248, + "step": 30850 + }, + { + "epoch": 4.59562109025916, + "grad_norm": 0.0203857421875, + "learning_rate": 0.028487102887274646, + "loss": 0.7941, + "num_input_tokens_seen": 17902000, + "step": 30855 + }, + { + "epoch": 4.596365802800119, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02848624948685264, + "loss": 0.7888, + "num_input_tokens_seen": 17904720, + "step": 30860 + }, + { + "epoch": 4.597110515341078, + "grad_norm": 0.030029296875, + "learning_rate": 0.028485395858594158, + "loss": 0.8009, + "num_input_tokens_seen": 17907472, + "step": 30865 + }, + { + "epoch": 4.597855227882038, + "grad_norm": 0.0120849609375, + "learning_rate": 0.028484542002513616, + "loss": 0.8023, + "num_input_tokens_seen": 17910032, + "step": 30870 + }, + { + "epoch": 4.5985999404229965, + "grad_norm": 0.0213623046875, + "learning_rate": 0.028483687918625447, + "loss": 0.7837, + "num_input_tokens_seen": 17913040, + "step": 30875 + }, + { + "epoch": 4.599344652963956, + "grad_norm": 0.025146484375, + "learning_rate": 0.028482833606944074, + "loss": 0.8024, + "num_input_tokens_seen": 17915984, + "step": 30880 + }, + { + "epoch": 4.600089365504915, + "grad_norm": 0.0196533203125, + "learning_rate": 0.028481979067483926, + "loss": 0.7989, + "num_input_tokens_seen": 17918640, + "step": 30885 + }, + { + "epoch": 4.6008340780458745, + "grad_norm": 0.0205078125, + "learning_rate": 0.028481124300259446, + "loss": 0.8036, + "num_input_tokens_seen": 17921808, + "step": 30890 + }, + { + "epoch": 4.601578790586833, + "grad_norm": 0.0208740234375, + "learning_rate": 0.02848026930528507, + "loss": 0.7996, + "num_input_tokens_seen": 17924816, + "step": 30895 + }, + { + "epoch": 4.602323503127793, + "grad_norm": 0.019287109375, + "learning_rate": 0.028479414082575254, + "loss": 0.7745, + "num_input_tokens_seen": 17927664, + "step": 30900 + }, + { + "epoch": 4.603068215668752, + "grad_norm": 0.020263671875, + "learning_rate": 0.028478558632144428, + "loss": 0.7827, + "num_input_tokens_seen": 17930288, + "step": 30905 + }, + { + "epoch": 4.603812928209711, + "grad_norm": 0.029052734375, + "learning_rate": 0.028477702954007058, + "loss": 0.776, + "num_input_tokens_seen": 17933072, + "step": 30910 + }, + { + "epoch": 4.60455764075067, + "grad_norm": 0.01953125, + "learning_rate": 0.028476847048177593, + "loss": 0.8337, + "num_input_tokens_seen": 17935856, + "step": 30915 + }, + { + "epoch": 4.60530235329163, + "grad_norm": 0.0213623046875, + "learning_rate": 0.028475990914670492, + "loss": 0.7831, + "num_input_tokens_seen": 17938800, + "step": 30920 + }, + { + "epoch": 4.6060470658325885, + "grad_norm": 0.02099609375, + "learning_rate": 0.028475134553500224, + "loss": 0.824, + "num_input_tokens_seen": 17942032, + "step": 30925 + }, + { + "epoch": 4.606791778373548, + "grad_norm": 0.020263671875, + "learning_rate": 0.028474277964681254, + "loss": 0.8151, + "num_input_tokens_seen": 17945072, + "step": 30930 + }, + { + "epoch": 4.607536490914507, + "grad_norm": 0.02197265625, + "learning_rate": 0.028473421148228054, + "loss": 0.8206, + "num_input_tokens_seen": 17948048, + "step": 30935 + }, + { + "epoch": 4.6082812034554665, + "grad_norm": 0.0133056640625, + "learning_rate": 0.02847256410415509, + "loss": 0.8152, + "num_input_tokens_seen": 17951216, + "step": 30940 + }, + { + "epoch": 4.609025915996425, + "grad_norm": 0.012451171875, + "learning_rate": 0.028471706832476854, + "loss": 0.8097, + "num_input_tokens_seen": 17954160, + "step": 30945 + }, + { + "epoch": 4.609770628537385, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02847084933320782, + "loss": 0.7989, + "num_input_tokens_seen": 17957104, + "step": 30950 + }, + { + "epoch": 4.610515341078344, + "grad_norm": 0.02197265625, + "learning_rate": 0.028469991606362478, + "loss": 0.7996, + "num_input_tokens_seen": 17960080, + "step": 30955 + }, + { + "epoch": 4.6112600536193025, + "grad_norm": 0.020751953125, + "learning_rate": 0.02846913365195532, + "loss": 0.7943, + "num_input_tokens_seen": 17963152, + "step": 30960 + }, + { + "epoch": 4.612004766160262, + "grad_norm": 0.0228271484375, + "learning_rate": 0.028468275470000838, + "loss": 0.8041, + "num_input_tokens_seen": 17966064, + "step": 30965 + }, + { + "epoch": 4.612749478701222, + "grad_norm": 0.0244140625, + "learning_rate": 0.028467417060513533, + "loss": 0.8024, + "num_input_tokens_seen": 17969040, + "step": 30970 + }, + { + "epoch": 4.6134941912421805, + "grad_norm": 0.03759765625, + "learning_rate": 0.028466558423507902, + "loss": 0.789, + "num_input_tokens_seen": 17972208, + "step": 30975 + }, + { + "epoch": 4.614238903783139, + "grad_norm": 0.0191650390625, + "learning_rate": 0.028465699558998452, + "loss": 0.8107, + "num_input_tokens_seen": 17974896, + "step": 30980 + }, + { + "epoch": 4.614983616324099, + "grad_norm": 0.020263671875, + "learning_rate": 0.028464840466999694, + "loss": 0.8001, + "num_input_tokens_seen": 17977776, + "step": 30985 + }, + { + "epoch": 4.615728328865059, + "grad_norm": 0.0184326171875, + "learning_rate": 0.028463981147526143, + "loss": 0.8067, + "num_input_tokens_seen": 17980368, + "step": 30990 + }, + { + "epoch": 4.616473041406017, + "grad_norm": 0.025146484375, + "learning_rate": 0.028463121600592313, + "loss": 0.823, + "num_input_tokens_seen": 17983280, + "step": 30995 + }, + { + "epoch": 4.617217753946976, + "grad_norm": 0.0135498046875, + "learning_rate": 0.028462261826212726, + "loss": 0.8116, + "num_input_tokens_seen": 17986384, + "step": 31000 + }, + { + "epoch": 4.617962466487936, + "grad_norm": 0.0130615234375, + "learning_rate": 0.028461401824401912, + "loss": 0.7903, + "num_input_tokens_seen": 17989328, + "step": 31005 + }, + { + "epoch": 4.6187071790288945, + "grad_norm": 0.02001953125, + "learning_rate": 0.028460541595174395, + "loss": 0.7838, + "num_input_tokens_seen": 17992112, + "step": 31010 + }, + { + "epoch": 4.619451891569854, + "grad_norm": 0.0211181640625, + "learning_rate": 0.028459681138544704, + "loss": 0.7948, + "num_input_tokens_seen": 17995024, + "step": 31015 + }, + { + "epoch": 4.620196604110813, + "grad_norm": 0.0211181640625, + "learning_rate": 0.028458820454527384, + "loss": 0.7971, + "num_input_tokens_seen": 17998160, + "step": 31020 + }, + { + "epoch": 4.6209413166517725, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02845795954313697, + "loss": 0.7919, + "num_input_tokens_seen": 18000496, + "step": 31025 + }, + { + "epoch": 4.621686029192731, + "grad_norm": 0.02490234375, + "learning_rate": 0.028457098404388006, + "loss": 0.8128, + "num_input_tokens_seen": 18003312, + "step": 31030 + }, + { + "epoch": 4.622430741733691, + "grad_norm": 0.054443359375, + "learning_rate": 0.028456237038295043, + "loss": 0.8273, + "num_input_tokens_seen": 18006160, + "step": 31035 + }, + { + "epoch": 4.62317545427465, + "grad_norm": 0.0306396484375, + "learning_rate": 0.028455375444872633, + "loss": 0.8074, + "num_input_tokens_seen": 18008976, + "step": 31040 + }, + { + "epoch": 4.623920166815609, + "grad_norm": 0.0277099609375, + "learning_rate": 0.028454513624135327, + "loss": 0.7967, + "num_input_tokens_seen": 18011792, + "step": 31045 + }, + { + "epoch": 4.624664879356568, + "grad_norm": 0.026123046875, + "learning_rate": 0.028453651576097694, + "loss": 0.7993, + "num_input_tokens_seen": 18014992, + "step": 31050 + }, + { + "epoch": 4.625409591897528, + "grad_norm": 0.0146484375, + "learning_rate": 0.028452789300774287, + "loss": 0.8082, + "num_input_tokens_seen": 18017904, + "step": 31055 + }, + { + "epoch": 4.6261543044384865, + "grad_norm": 0.0198974609375, + "learning_rate": 0.028451926798179678, + "loss": 0.8092, + "num_input_tokens_seen": 18020848, + "step": 31060 + }, + { + "epoch": 4.626899016979446, + "grad_norm": 0.0198974609375, + "learning_rate": 0.028451064068328438, + "loss": 0.7959, + "num_input_tokens_seen": 18023728, + "step": 31065 + }, + { + "epoch": 4.627643729520405, + "grad_norm": 0.0235595703125, + "learning_rate": 0.02845020111123514, + "loss": 0.8086, + "num_input_tokens_seen": 18026480, + "step": 31070 + }, + { + "epoch": 4.628388442061365, + "grad_norm": 0.026123046875, + "learning_rate": 0.028449337926914366, + "loss": 0.8071, + "num_input_tokens_seen": 18029104, + "step": 31075 + }, + { + "epoch": 4.629133154602323, + "grad_norm": 0.021240234375, + "learning_rate": 0.028448474515380698, + "loss": 0.8273, + "num_input_tokens_seen": 18032016, + "step": 31080 + }, + { + "epoch": 4.629877867143283, + "grad_norm": 0.0186767578125, + "learning_rate": 0.028447610876648717, + "loss": 0.8037, + "num_input_tokens_seen": 18034832, + "step": 31085 + }, + { + "epoch": 4.630622579684242, + "grad_norm": 0.01263427734375, + "learning_rate": 0.02844674701073302, + "loss": 0.8016, + "num_input_tokens_seen": 18037616, + "step": 31090 + }, + { + "epoch": 4.631367292225201, + "grad_norm": 0.0135498046875, + "learning_rate": 0.028445882917648203, + "loss": 0.8052, + "num_input_tokens_seen": 18040976, + "step": 31095 + }, + { + "epoch": 4.63211200476616, + "grad_norm": 0.021728515625, + "learning_rate": 0.028445018597408857, + "loss": 0.7901, + "num_input_tokens_seen": 18044592, + "step": 31100 + }, + { + "epoch": 4.63285671730712, + "grad_norm": 0.01348876953125, + "learning_rate": 0.028444154050029584, + "loss": 0.7956, + "num_input_tokens_seen": 18047504, + "step": 31105 + }, + { + "epoch": 4.6336014298480785, + "grad_norm": 0.0240478515625, + "learning_rate": 0.028443289275525, + "loss": 0.8323, + "num_input_tokens_seen": 18050352, + "step": 31110 + }, + { + "epoch": 4.634346142389038, + "grad_norm": 0.028076171875, + "learning_rate": 0.028442424273909694, + "loss": 0.8215, + "num_input_tokens_seen": 18053136, + "step": 31115 + }, + { + "epoch": 4.635090854929997, + "grad_norm": 0.0218505859375, + "learning_rate": 0.028441559045198305, + "loss": 0.8121, + "num_input_tokens_seen": 18056208, + "step": 31120 + }, + { + "epoch": 4.635835567470957, + "grad_norm": 0.022216796875, + "learning_rate": 0.02844069358940543, + "loss": 0.8086, + "num_input_tokens_seen": 18059344, + "step": 31125 + }, + { + "epoch": 4.636580280011915, + "grad_norm": 0.026123046875, + "learning_rate": 0.0284398279065457, + "loss": 0.8138, + "num_input_tokens_seen": 18062224, + "step": 31130 + }, + { + "epoch": 4.637324992552875, + "grad_norm": 0.02490234375, + "learning_rate": 0.028438961996633736, + "loss": 0.7965, + "num_input_tokens_seen": 18065008, + "step": 31135 + }, + { + "epoch": 4.638069705093834, + "grad_norm": 0.03271484375, + "learning_rate": 0.028438095859684168, + "loss": 0.8149, + "num_input_tokens_seen": 18067536, + "step": 31140 + }, + { + "epoch": 4.6388144176347925, + "grad_norm": 0.0274658203125, + "learning_rate": 0.02843722949571163, + "loss": 0.8109, + "num_input_tokens_seen": 18070608, + "step": 31145 + }, + { + "epoch": 4.639559130175752, + "grad_norm": 0.0225830078125, + "learning_rate": 0.02843636290473076, + "loss": 0.7888, + "num_input_tokens_seen": 18073488, + "step": 31150 + }, + { + "epoch": 4.640303842716712, + "grad_norm": 0.0208740234375, + "learning_rate": 0.028435496086756187, + "loss": 0.8113, + "num_input_tokens_seen": 18076336, + "step": 31155 + }, + { + "epoch": 4.641048555257671, + "grad_norm": 0.0260009765625, + "learning_rate": 0.028434629041802564, + "loss": 0.8079, + "num_input_tokens_seen": 18079088, + "step": 31160 + }, + { + "epoch": 4.641793267798629, + "grad_norm": 0.0140380859375, + "learning_rate": 0.02843376176988454, + "loss": 0.8183, + "num_input_tokens_seen": 18082224, + "step": 31165 + }, + { + "epoch": 4.642537980339589, + "grad_norm": 0.0247802734375, + "learning_rate": 0.028432894271016765, + "loss": 0.7915, + "num_input_tokens_seen": 18085200, + "step": 31170 + }, + { + "epoch": 4.643282692880548, + "grad_norm": 0.0174560546875, + "learning_rate": 0.02843202654521389, + "loss": 0.791, + "num_input_tokens_seen": 18087952, + "step": 31175 + }, + { + "epoch": 4.644027405421507, + "grad_norm": 0.032470703125, + "learning_rate": 0.028431158592490586, + "loss": 0.8124, + "num_input_tokens_seen": 18090992, + "step": 31180 + }, + { + "epoch": 4.644772117962466, + "grad_norm": 0.0206298828125, + "learning_rate": 0.028430290412861502, + "loss": 0.7997, + "num_input_tokens_seen": 18093872, + "step": 31185 + }, + { + "epoch": 4.645516830503426, + "grad_norm": 0.0186767578125, + "learning_rate": 0.028429422006341312, + "loss": 0.7933, + "num_input_tokens_seen": 18096624, + "step": 31190 + }, + { + "epoch": 4.6462615430443845, + "grad_norm": 0.02001953125, + "learning_rate": 0.028428553372944684, + "loss": 0.7904, + "num_input_tokens_seen": 18099504, + "step": 31195 + }, + { + "epoch": 4.647006255585344, + "grad_norm": 0.0189208984375, + "learning_rate": 0.0284276845126863, + "loss": 0.8002, + "num_input_tokens_seen": 18102448, + "step": 31200 + }, + { + "epoch": 4.647750968126303, + "grad_norm": 0.02001953125, + "learning_rate": 0.02842681542558083, + "loss": 0.7927, + "num_input_tokens_seen": 18105488, + "step": 31205 + }, + { + "epoch": 4.648495680667263, + "grad_norm": 0.046630859375, + "learning_rate": 0.028425946111642963, + "loss": 0.7972, + "num_input_tokens_seen": 18108336, + "step": 31210 + }, + { + "epoch": 4.649240393208221, + "grad_norm": 0.0201416015625, + "learning_rate": 0.028425076570887383, + "loss": 0.7844, + "num_input_tokens_seen": 18111248, + "step": 31215 + }, + { + "epoch": 4.649985105749181, + "grad_norm": 0.019775390625, + "learning_rate": 0.028424206803328776, + "loss": 0.7889, + "num_input_tokens_seen": 18114416, + "step": 31220 + }, + { + "epoch": 4.65072981829014, + "grad_norm": 0.0228271484375, + "learning_rate": 0.028423336808981837, + "loss": 0.783, + "num_input_tokens_seen": 18117360, + "step": 31225 + }, + { + "epoch": 4.651474530831099, + "grad_norm": 0.053466796875, + "learning_rate": 0.028422466587861267, + "loss": 0.7989, + "num_input_tokens_seen": 18120208, + "step": 31230 + }, + { + "epoch": 4.652219243372058, + "grad_norm": 0.0279541015625, + "learning_rate": 0.02842159613998177, + "loss": 0.7973, + "num_input_tokens_seen": 18122896, + "step": 31235 + }, + { + "epoch": 4.652963955913018, + "grad_norm": 0.022705078125, + "learning_rate": 0.028420725465358045, + "loss": 0.8019, + "num_input_tokens_seen": 18125680, + "step": 31240 + }, + { + "epoch": 4.653708668453977, + "grad_norm": 0.031005859375, + "learning_rate": 0.0284198545640048, + "loss": 0.815, + "num_input_tokens_seen": 18128688, + "step": 31245 + }, + { + "epoch": 4.654453380994936, + "grad_norm": 0.0294189453125, + "learning_rate": 0.028418983435936754, + "loss": 0.7937, + "num_input_tokens_seen": 18131536, + "step": 31250 + }, + { + "epoch": 4.655198093535895, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02841811208116862, + "loss": 0.8055, + "num_input_tokens_seen": 18134256, + "step": 31255 + }, + { + "epoch": 4.655942806076855, + "grad_norm": 0.01373291015625, + "learning_rate": 0.028417240499715123, + "loss": 0.8015, + "num_input_tokens_seen": 18137040, + "step": 31260 + }, + { + "epoch": 4.656687518617813, + "grad_norm": 0.014404296875, + "learning_rate": 0.028416368691590983, + "loss": 0.7842, + "num_input_tokens_seen": 18139920, + "step": 31265 + }, + { + "epoch": 4.657432231158773, + "grad_norm": 0.033447265625, + "learning_rate": 0.028415496656810927, + "loss": 0.8282, + "num_input_tokens_seen": 18142800, + "step": 31270 + }, + { + "epoch": 4.658176943699732, + "grad_norm": 0.019287109375, + "learning_rate": 0.028414624395389693, + "loss": 0.7818, + "num_input_tokens_seen": 18145456, + "step": 31275 + }, + { + "epoch": 4.658921656240691, + "grad_norm": 0.015625, + "learning_rate": 0.02841375190734201, + "loss": 0.8235, + "num_input_tokens_seen": 18148528, + "step": 31280 + }, + { + "epoch": 4.65966636878165, + "grad_norm": 0.018798828125, + "learning_rate": 0.02841287919268263, + "loss": 0.7916, + "num_input_tokens_seen": 18151280, + "step": 31285 + }, + { + "epoch": 4.66041108132261, + "grad_norm": 0.0279541015625, + "learning_rate": 0.02841200625142628, + "loss": 0.8464, + "num_input_tokens_seen": 18154096, + "step": 31290 + }, + { + "epoch": 4.661155793863569, + "grad_norm": 0.01708984375, + "learning_rate": 0.02841113308358772, + "loss": 0.8117, + "num_input_tokens_seen": 18157040, + "step": 31295 + }, + { + "epoch": 4.661900506404528, + "grad_norm": 0.0194091796875, + "learning_rate": 0.028410259689181695, + "loss": 0.8213, + "num_input_tokens_seen": 18159984, + "step": 31300 + }, + { + "epoch": 4.662645218945487, + "grad_norm": 0.01953125, + "learning_rate": 0.028409386068222964, + "loss": 0.7912, + "num_input_tokens_seen": 18162832, + "step": 31305 + }, + { + "epoch": 4.663389931486446, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02840851222072628, + "loss": 0.7933, + "num_input_tokens_seen": 18165872, + "step": 31310 + }, + { + "epoch": 4.664134644027405, + "grad_norm": 0.020263671875, + "learning_rate": 0.028407638146706417, + "loss": 0.808, + "num_input_tokens_seen": 18168784, + "step": 31315 + }, + { + "epoch": 4.664879356568365, + "grad_norm": 0.0186767578125, + "learning_rate": 0.02840676384617813, + "loss": 0.8088, + "num_input_tokens_seen": 18171920, + "step": 31320 + }, + { + "epoch": 4.665624069109324, + "grad_norm": 0.0255126953125, + "learning_rate": 0.02840588931915619, + "loss": 0.8085, + "num_input_tokens_seen": 18174576, + "step": 31325 + }, + { + "epoch": 4.666368781650283, + "grad_norm": 0.01806640625, + "learning_rate": 0.028405014565655383, + "loss": 0.7949, + "num_input_tokens_seen": 18177648, + "step": 31330 + }, + { + "epoch": 4.667113494191242, + "grad_norm": 0.02734375, + "learning_rate": 0.028404139585690474, + "loss": 0.7953, + "num_input_tokens_seen": 18180688, + "step": 31335 + }, + { + "epoch": 4.667858206732202, + "grad_norm": 0.0284423828125, + "learning_rate": 0.028403264379276254, + "loss": 0.8228, + "num_input_tokens_seen": 18183440, + "step": 31340 + }, + { + "epoch": 4.668602919273161, + "grad_norm": 0.0179443359375, + "learning_rate": 0.0284023889464275, + "loss": 0.7888, + "num_input_tokens_seen": 18186384, + "step": 31345 + }, + { + "epoch": 4.669347631814119, + "grad_norm": 0.014892578125, + "learning_rate": 0.028401513287159008, + "loss": 0.8037, + "num_input_tokens_seen": 18189168, + "step": 31350 + }, + { + "epoch": 4.670092344355079, + "grad_norm": 0.03759765625, + "learning_rate": 0.028400637401485566, + "loss": 0.8082, + "num_input_tokens_seen": 18192400, + "step": 31355 + }, + { + "epoch": 4.670837056896038, + "grad_norm": 0.0191650390625, + "learning_rate": 0.028399761289421985, + "loss": 0.7956, + "num_input_tokens_seen": 18195408, + "step": 31360 + }, + { + "epoch": 4.671581769436997, + "grad_norm": 0.0284423828125, + "learning_rate": 0.02839888495098305, + "loss": 0.7958, + "num_input_tokens_seen": 18198480, + "step": 31365 + }, + { + "epoch": 4.672326481977956, + "grad_norm": 0.0279541015625, + "learning_rate": 0.02839800838618357, + "loss": 0.7996, + "num_input_tokens_seen": 18201296, + "step": 31370 + }, + { + "epoch": 4.673071194518916, + "grad_norm": 0.01422119140625, + "learning_rate": 0.028397131595038354, + "loss": 0.8124, + "num_input_tokens_seen": 18204464, + "step": 31375 + }, + { + "epoch": 4.673815907059875, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02839625457756222, + "loss": 0.7968, + "num_input_tokens_seen": 18207312, + "step": 31380 + }, + { + "epoch": 4.674560619600834, + "grad_norm": 0.01287841796875, + "learning_rate": 0.028395377333769983, + "loss": 0.7964, + "num_input_tokens_seen": 18210160, + "step": 31385 + }, + { + "epoch": 4.675305332141793, + "grad_norm": 0.0194091796875, + "learning_rate": 0.028394499863676453, + "loss": 0.8232, + "num_input_tokens_seen": 18213040, + "step": 31390 + }, + { + "epoch": 4.676050044682753, + "grad_norm": 0.0277099609375, + "learning_rate": 0.02839362216729647, + "loss": 0.8126, + "num_input_tokens_seen": 18216208, + "step": 31395 + }, + { + "epoch": 4.676794757223711, + "grad_norm": 0.0107421875, + "learning_rate": 0.028392744244644847, + "loss": 0.8005, + "num_input_tokens_seen": 18218864, + "step": 31400 + }, + { + "epoch": 4.677539469764671, + "grad_norm": 0.01806640625, + "learning_rate": 0.02839186609573642, + "loss": 0.7942, + "num_input_tokens_seen": 18221680, + "step": 31405 + }, + { + "epoch": 4.67828418230563, + "grad_norm": 0.02587890625, + "learning_rate": 0.02839098772058603, + "loss": 0.7931, + "num_input_tokens_seen": 18224496, + "step": 31410 + }, + { + "epoch": 4.6790288948465895, + "grad_norm": 0.018798828125, + "learning_rate": 0.02839010911920851, + "loss": 0.7776, + "num_input_tokens_seen": 18227280, + "step": 31415 + }, + { + "epoch": 4.679773607387548, + "grad_norm": 0.01904296875, + "learning_rate": 0.02838923029161871, + "loss": 0.7789, + "num_input_tokens_seen": 18230064, + "step": 31420 + }, + { + "epoch": 4.680518319928508, + "grad_norm": 0.01239013671875, + "learning_rate": 0.02838835123783147, + "loss": 0.7918, + "num_input_tokens_seen": 18232880, + "step": 31425 + }, + { + "epoch": 4.681263032469467, + "grad_norm": 0.0400390625, + "learning_rate": 0.02838747195786164, + "loss": 0.8441, + "num_input_tokens_seen": 18235536, + "step": 31430 + }, + { + "epoch": 4.682007745010426, + "grad_norm": 0.0296630859375, + "learning_rate": 0.028386592451724084, + "loss": 0.7848, + "num_input_tokens_seen": 18238096, + "step": 31435 + }, + { + "epoch": 4.682752457551385, + "grad_norm": 0.0166015625, + "learning_rate": 0.028385712719433652, + "loss": 0.7864, + "num_input_tokens_seen": 18240912, + "step": 31440 + }, + { + "epoch": 4.683497170092345, + "grad_norm": 0.01458740234375, + "learning_rate": 0.028384832761005205, + "loss": 0.814, + "num_input_tokens_seen": 18243984, + "step": 31445 + }, + { + "epoch": 4.684241882633303, + "grad_norm": 0.02734375, + "learning_rate": 0.028383952576453614, + "loss": 0.8481, + "num_input_tokens_seen": 18246992, + "step": 31450 + }, + { + "epoch": 4.684986595174263, + "grad_norm": 0.01165771484375, + "learning_rate": 0.02838307216579375, + "loss": 0.798, + "num_input_tokens_seen": 18249776, + "step": 31455 + }, + { + "epoch": 4.685731307715222, + "grad_norm": 0.0118408203125, + "learning_rate": 0.02838219152904048, + "loss": 0.8152, + "num_input_tokens_seen": 18252688, + "step": 31460 + }, + { + "epoch": 4.6864760202561815, + "grad_norm": 0.0174560546875, + "learning_rate": 0.028381310666208682, + "loss": 0.8005, + "num_input_tokens_seen": 18255696, + "step": 31465 + }, + { + "epoch": 4.68722073279714, + "grad_norm": 0.026123046875, + "learning_rate": 0.028380429577313245, + "loss": 0.8186, + "num_input_tokens_seen": 18258672, + "step": 31470 + }, + { + "epoch": 4.687965445338099, + "grad_norm": 0.016845703125, + "learning_rate": 0.028379548262369048, + "loss": 0.7668, + "num_input_tokens_seen": 18261424, + "step": 31475 + }, + { + "epoch": 4.688710157879059, + "grad_norm": 0.01129150390625, + "learning_rate": 0.028378666721390985, + "loss": 0.7887, + "num_input_tokens_seen": 18264336, + "step": 31480 + }, + { + "epoch": 4.689454870420018, + "grad_norm": 0.02685546875, + "learning_rate": 0.028377784954393942, + "loss": 0.7948, + "num_input_tokens_seen": 18267504, + "step": 31485 + }, + { + "epoch": 4.690199582960977, + "grad_norm": 0.01202392578125, + "learning_rate": 0.028376902961392822, + "loss": 0.8087, + "num_input_tokens_seen": 18270384, + "step": 31490 + }, + { + "epoch": 4.690944295501936, + "grad_norm": 0.0172119140625, + "learning_rate": 0.02837602074240252, + "loss": 0.8056, + "num_input_tokens_seen": 18273424, + "step": 31495 + }, + { + "epoch": 4.6916890080428955, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02837513829743794, + "loss": 0.7842, + "num_input_tokens_seen": 18276400, + "step": 31500 + }, + { + "epoch": 4.692433720583855, + "grad_norm": 0.0252685546875, + "learning_rate": 0.028374255626513994, + "loss": 0.8139, + "num_input_tokens_seen": 18279216, + "step": 31505 + }, + { + "epoch": 4.693178433124814, + "grad_norm": 0.01336669921875, + "learning_rate": 0.02837337272964559, + "loss": 0.8341, + "num_input_tokens_seen": 18282256, + "step": 31510 + }, + { + "epoch": 4.693923145665773, + "grad_norm": 0.01904296875, + "learning_rate": 0.02837248960684765, + "loss": 0.7846, + "num_input_tokens_seen": 18285232, + "step": 31515 + }, + { + "epoch": 4.694667858206732, + "grad_norm": 0.0267333984375, + "learning_rate": 0.02837160625813509, + "loss": 0.7914, + "num_input_tokens_seen": 18288496, + "step": 31520 + }, + { + "epoch": 4.695412570747691, + "grad_norm": 0.0244140625, + "learning_rate": 0.02837072268352283, + "loss": 0.8165, + "num_input_tokens_seen": 18291504, + "step": 31525 + }, + { + "epoch": 4.696157283288651, + "grad_norm": 0.0140380859375, + "learning_rate": 0.028369838883025796, + "loss": 0.799, + "num_input_tokens_seen": 18294128, + "step": 31530 + }, + { + "epoch": 4.696901995829609, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02836895485665893, + "loss": 0.8018, + "num_input_tokens_seen": 18297232, + "step": 31535 + }, + { + "epoch": 4.697646708370569, + "grad_norm": 0.0301513671875, + "learning_rate": 0.028368070604437155, + "loss": 0.8123, + "num_input_tokens_seen": 18300240, + "step": 31540 + }, + { + "epoch": 4.698391420911528, + "grad_norm": 0.0174560546875, + "learning_rate": 0.028367186126375413, + "loss": 0.7912, + "num_input_tokens_seen": 18302864, + "step": 31545 + }, + { + "epoch": 4.6991361334524875, + "grad_norm": 0.028076171875, + "learning_rate": 0.028366301422488648, + "loss": 0.8112, + "num_input_tokens_seen": 18306032, + "step": 31550 + }, + { + "epoch": 4.699880845993446, + "grad_norm": 0.028564453125, + "learning_rate": 0.02836541649279181, + "loss": 0.8017, + "num_input_tokens_seen": 18308880, + "step": 31555 + }, + { + "epoch": 4.700625558534406, + "grad_norm": 0.02099609375, + "learning_rate": 0.02836453133729984, + "loss": 0.8079, + "num_input_tokens_seen": 18311952, + "step": 31560 + }, + { + "epoch": 4.701370271075365, + "grad_norm": 0.018798828125, + "learning_rate": 0.028363645956027694, + "loss": 0.8098, + "num_input_tokens_seen": 18314960, + "step": 31565 + }, + { + "epoch": 4.702114983616324, + "grad_norm": 0.01214599609375, + "learning_rate": 0.028362760348990337, + "loss": 0.7928, + "num_input_tokens_seen": 18317648, + "step": 31570 + }, + { + "epoch": 4.702859696157283, + "grad_norm": 0.01953125, + "learning_rate": 0.028361874516202718, + "loss": 0.7992, + "num_input_tokens_seen": 18320656, + "step": 31575 + }, + { + "epoch": 4.703604408698243, + "grad_norm": 0.0186767578125, + "learning_rate": 0.028360988457679818, + "loss": 0.812, + "num_input_tokens_seen": 18323504, + "step": 31580 + }, + { + "epoch": 4.7043491212392015, + "grad_norm": 0.027099609375, + "learning_rate": 0.02836010217343659, + "loss": 0.8227, + "num_input_tokens_seen": 18326320, + "step": 31585 + }, + { + "epoch": 4.705093833780161, + "grad_norm": 0.02197265625, + "learning_rate": 0.02835921566348802, + "loss": 0.8267, + "num_input_tokens_seen": 18329424, + "step": 31590 + }, + { + "epoch": 4.70583854632112, + "grad_norm": 0.0234375, + "learning_rate": 0.028358328927849077, + "loss": 0.8062, + "num_input_tokens_seen": 18332208, + "step": 31595 + }, + { + "epoch": 4.7065832588620795, + "grad_norm": 0.0294189453125, + "learning_rate": 0.02835744196653474, + "loss": 0.7957, + "num_input_tokens_seen": 18334928, + "step": 31600 + }, + { + "epoch": 4.707327971403038, + "grad_norm": 0.0296630859375, + "learning_rate": 0.028356554779560006, + "loss": 0.7982, + "num_input_tokens_seen": 18337872, + "step": 31605 + }, + { + "epoch": 4.708072683943998, + "grad_norm": 0.0128173828125, + "learning_rate": 0.028355667366939848, + "loss": 0.805, + "num_input_tokens_seen": 18340912, + "step": 31610 + }, + { + "epoch": 4.708817396484957, + "grad_norm": 0.03857421875, + "learning_rate": 0.028354779728689262, + "loss": 0.7982, + "num_input_tokens_seen": 18343600, + "step": 31615 + }, + { + "epoch": 4.709562109025916, + "grad_norm": 0.0233154296875, + "learning_rate": 0.028353891864823247, + "loss": 0.8168, + "num_input_tokens_seen": 18346352, + "step": 31620 + }, + { + "epoch": 4.710306821566875, + "grad_norm": 0.02099609375, + "learning_rate": 0.028353003775356805, + "loss": 0.8105, + "num_input_tokens_seen": 18349104, + "step": 31625 + }, + { + "epoch": 4.711051534107835, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02835211546030493, + "loss": 0.7936, + "num_input_tokens_seen": 18351920, + "step": 31630 + }, + { + "epoch": 4.7117962466487935, + "grad_norm": 0.023193359375, + "learning_rate": 0.02835122691968264, + "loss": 0.7936, + "num_input_tokens_seen": 18354736, + "step": 31635 + }, + { + "epoch": 4.712540959189753, + "grad_norm": 0.02099609375, + "learning_rate": 0.028350338153504937, + "loss": 0.7983, + "num_input_tokens_seen": 18357936, + "step": 31640 + }, + { + "epoch": 4.713285671730712, + "grad_norm": 0.023193359375, + "learning_rate": 0.028349449161786847, + "loss": 0.8164, + "num_input_tokens_seen": 18360880, + "step": 31645 + }, + { + "epoch": 4.7140303842716715, + "grad_norm": 0.0230712890625, + "learning_rate": 0.02834855994454337, + "loss": 0.8101, + "num_input_tokens_seen": 18363952, + "step": 31650 + }, + { + "epoch": 4.71477509681263, + "grad_norm": 0.02197265625, + "learning_rate": 0.028347670501789546, + "loss": 0.8084, + "num_input_tokens_seen": 18366928, + "step": 31655 + }, + { + "epoch": 4.715519809353589, + "grad_norm": 0.025146484375, + "learning_rate": 0.028346780833540395, + "loss": 0.7793, + "num_input_tokens_seen": 18369776, + "step": 31660 + }, + { + "epoch": 4.716264521894549, + "grad_norm": 0.01373291015625, + "learning_rate": 0.028345890939810943, + "loss": 0.8208, + "num_input_tokens_seen": 18372752, + "step": 31665 + }, + { + "epoch": 4.717009234435508, + "grad_norm": 0.027587890625, + "learning_rate": 0.02834500082061623, + "loss": 0.792, + "num_input_tokens_seen": 18375376, + "step": 31670 + }, + { + "epoch": 4.717753946976467, + "grad_norm": 0.02978515625, + "learning_rate": 0.028344110475971292, + "loss": 0.7879, + "num_input_tokens_seen": 18378096, + "step": 31675 + }, + { + "epoch": 4.718498659517426, + "grad_norm": 0.018798828125, + "learning_rate": 0.02834321990589117, + "loss": 0.8127, + "num_input_tokens_seen": 18380880, + "step": 31680 + }, + { + "epoch": 4.7192433720583855, + "grad_norm": 0.01318359375, + "learning_rate": 0.028342329110390908, + "loss": 0.8024, + "num_input_tokens_seen": 18384080, + "step": 31685 + }, + { + "epoch": 4.719988084599344, + "grad_norm": 0.0306396484375, + "learning_rate": 0.028341438089485558, + "loss": 0.802, + "num_input_tokens_seen": 18386992, + "step": 31690 + }, + { + "epoch": 4.720732797140304, + "grad_norm": 0.0181884765625, + "learning_rate": 0.028340546843190168, + "loss": 0.8094, + "num_input_tokens_seen": 18390128, + "step": 31695 + }, + { + "epoch": 4.721477509681263, + "grad_norm": 0.02294921875, + "learning_rate": 0.028339655371519803, + "loss": 0.8039, + "num_input_tokens_seen": 18393200, + "step": 31700 + }, + { + "epoch": 4.722222222222222, + "grad_norm": 0.01904296875, + "learning_rate": 0.028338763674489516, + "loss": 0.7831, + "num_input_tokens_seen": 18395888, + "step": 31705 + }, + { + "epoch": 4.722966934763181, + "grad_norm": 0.037353515625, + "learning_rate": 0.02833787175211437, + "loss": 0.8149, + "num_input_tokens_seen": 18398832, + "step": 31710 + }, + { + "epoch": 4.723711647304141, + "grad_norm": 0.01190185546875, + "learning_rate": 0.028336979604409437, + "loss": 0.7966, + "num_input_tokens_seen": 18401744, + "step": 31715 + }, + { + "epoch": 4.7244563598450995, + "grad_norm": 0.01220703125, + "learning_rate": 0.028336087231389792, + "loss": 0.7991, + "num_input_tokens_seen": 18404720, + "step": 31720 + }, + { + "epoch": 4.725201072386059, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02833519463307051, + "loss": 0.8064, + "num_input_tokens_seen": 18407536, + "step": 31725 + }, + { + "epoch": 4.725945784927018, + "grad_norm": 0.029541015625, + "learning_rate": 0.02833430180946666, + "loss": 0.8087, + "num_input_tokens_seen": 18410544, + "step": 31730 + }, + { + "epoch": 4.7266904974679775, + "grad_norm": 0.01708984375, + "learning_rate": 0.028333408760593343, + "loss": 0.7898, + "num_input_tokens_seen": 18413424, + "step": 31735 + }, + { + "epoch": 4.727435210008936, + "grad_norm": 0.018798828125, + "learning_rate": 0.02833251548646563, + "loss": 0.8036, + "num_input_tokens_seen": 18416208, + "step": 31740 + }, + { + "epoch": 4.728179922549896, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02833162198709862, + "loss": 0.8147, + "num_input_tokens_seen": 18419728, + "step": 31745 + }, + { + "epoch": 4.728924635090855, + "grad_norm": 0.0113525390625, + "learning_rate": 0.0283307282625074, + "loss": 0.799, + "num_input_tokens_seen": 18422512, + "step": 31750 + }, + { + "epoch": 4.729669347631814, + "grad_norm": 0.0130615234375, + "learning_rate": 0.028329834312707084, + "loss": 0.8187, + "num_input_tokens_seen": 18425232, + "step": 31755 + }, + { + "epoch": 4.730414060172773, + "grad_norm": 0.011474609375, + "learning_rate": 0.02832894013771276, + "loss": 0.8067, + "num_input_tokens_seen": 18428016, + "step": 31760 + }, + { + "epoch": 4.731158772713733, + "grad_norm": 0.02001953125, + "learning_rate": 0.028328045737539546, + "loss": 0.8044, + "num_input_tokens_seen": 18431408, + "step": 31765 + }, + { + "epoch": 4.7319034852546915, + "grad_norm": 0.01116943359375, + "learning_rate": 0.028327151112202537, + "loss": 0.8039, + "num_input_tokens_seen": 18434192, + "step": 31770 + }, + { + "epoch": 4.732648197795651, + "grad_norm": 0.0169677734375, + "learning_rate": 0.02832625626171686, + "loss": 0.7922, + "num_input_tokens_seen": 18437008, + "step": 31775 + }, + { + "epoch": 4.73339291033661, + "grad_norm": 0.0107421875, + "learning_rate": 0.028325361186097627, + "loss": 0.8024, + "num_input_tokens_seen": 18439888, + "step": 31780 + }, + { + "epoch": 4.73413762287757, + "grad_norm": 0.026123046875, + "learning_rate": 0.02832446588535996, + "loss": 0.8181, + "num_input_tokens_seen": 18442704, + "step": 31785 + }, + { + "epoch": 4.734882335418528, + "grad_norm": 0.0186767578125, + "learning_rate": 0.028323570359518976, + "loss": 0.7976, + "num_input_tokens_seen": 18445776, + "step": 31790 + }, + { + "epoch": 4.735627047959488, + "grad_norm": 0.01214599609375, + "learning_rate": 0.028322674608589823, + "loss": 0.8064, + "num_input_tokens_seen": 18448720, + "step": 31795 + }, + { + "epoch": 4.736371760500447, + "grad_norm": 0.017822265625, + "learning_rate": 0.028321778632587623, + "loss": 0.8089, + "num_input_tokens_seen": 18451376, + "step": 31800 + }, + { + "epoch": 4.737116473041406, + "grad_norm": 0.0196533203125, + "learning_rate": 0.028320882431527507, + "loss": 0.8123, + "num_input_tokens_seen": 18454448, + "step": 31805 + }, + { + "epoch": 4.737861185582365, + "grad_norm": 0.0189208984375, + "learning_rate": 0.028319986005424624, + "loss": 0.8018, + "num_input_tokens_seen": 18457456, + "step": 31810 + }, + { + "epoch": 4.738605898123325, + "grad_norm": 0.0225830078125, + "learning_rate": 0.028319089354294114, + "loss": 0.8079, + "num_input_tokens_seen": 18460144, + "step": 31815 + }, + { + "epoch": 4.7393506106642835, + "grad_norm": 0.01806640625, + "learning_rate": 0.02831819247815113, + "loss": 0.8006, + "num_input_tokens_seen": 18463280, + "step": 31820 + }, + { + "epoch": 4.740095323205242, + "grad_norm": 0.0279541015625, + "learning_rate": 0.028317295377010814, + "loss": 0.81, + "num_input_tokens_seen": 18466000, + "step": 31825 + }, + { + "epoch": 4.740840035746202, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02831639805088833, + "loss": 0.7933, + "num_input_tokens_seen": 18468816, + "step": 31830 + }, + { + "epoch": 4.741584748287162, + "grad_norm": 0.0234375, + "learning_rate": 0.028315500499798838, + "loss": 0.7921, + "num_input_tokens_seen": 18472048, + "step": 31835 + }, + { + "epoch": 4.74232946082812, + "grad_norm": 0.0284423828125, + "learning_rate": 0.028314602723757496, + "loss": 0.8061, + "num_input_tokens_seen": 18474992, + "step": 31840 + }, + { + "epoch": 4.743074173369079, + "grad_norm": 0.0179443359375, + "learning_rate": 0.02831370472277947, + "loss": 0.7914, + "num_input_tokens_seen": 18477616, + "step": 31845 + }, + { + "epoch": 4.743818885910039, + "grad_norm": 0.017822265625, + "learning_rate": 0.028312806496879936, + "loss": 0.79, + "num_input_tokens_seen": 18480560, + "step": 31850 + }, + { + "epoch": 4.744563598450998, + "grad_norm": 0.0301513671875, + "learning_rate": 0.028311908046074072, + "loss": 0.7992, + "num_input_tokens_seen": 18483152, + "step": 31855 + }, + { + "epoch": 4.745308310991957, + "grad_norm": 0.019287109375, + "learning_rate": 0.028311009370377047, + "loss": 0.7912, + "num_input_tokens_seen": 18485968, + "step": 31860 + }, + { + "epoch": 4.746053023532916, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02831011046980405, + "loss": 0.8019, + "num_input_tokens_seen": 18488944, + "step": 31865 + }, + { + "epoch": 4.746797736073876, + "grad_norm": 0.0186767578125, + "learning_rate": 0.028309211344370262, + "loss": 0.7888, + "num_input_tokens_seen": 18492048, + "step": 31870 + }, + { + "epoch": 4.747542448614834, + "grad_norm": 0.0260009765625, + "learning_rate": 0.028308311994090875, + "loss": 0.8185, + "num_input_tokens_seen": 18494768, + "step": 31875 + }, + { + "epoch": 4.748287161155794, + "grad_norm": 0.013427734375, + "learning_rate": 0.028307412418981086, + "loss": 0.8139, + "num_input_tokens_seen": 18497776, + "step": 31880 + }, + { + "epoch": 4.749031873696753, + "grad_norm": 0.01141357421875, + "learning_rate": 0.028306512619056083, + "loss": 0.7898, + "num_input_tokens_seen": 18500496, + "step": 31885 + }, + { + "epoch": 4.749776586237712, + "grad_norm": 0.019775390625, + "learning_rate": 0.028305612594331078, + "loss": 0.8191, + "num_input_tokens_seen": 18503568, + "step": 31890 + }, + { + "epoch": 4.750521298778671, + "grad_norm": 0.0181884765625, + "learning_rate": 0.028304712344821268, + "loss": 0.811, + "num_input_tokens_seen": 18506736, + "step": 31895 + }, + { + "epoch": 4.751266011319631, + "grad_norm": 0.019775390625, + "learning_rate": 0.02830381187054187, + "loss": 0.7888, + "num_input_tokens_seen": 18509872, + "step": 31900 + }, + { + "epoch": 4.7520107238605895, + "grad_norm": 0.0228271484375, + "learning_rate": 0.02830291117150809, + "loss": 0.8093, + "num_input_tokens_seen": 18513200, + "step": 31905 + }, + { + "epoch": 4.752755436401549, + "grad_norm": 0.027099609375, + "learning_rate": 0.028302010247735147, + "loss": 0.7874, + "num_input_tokens_seen": 18515856, + "step": 31910 + }, + { + "epoch": 4.753500148942508, + "grad_norm": 0.0186767578125, + "learning_rate": 0.02830110909923826, + "loss": 0.804, + "num_input_tokens_seen": 18518768, + "step": 31915 + }, + { + "epoch": 4.754244861483468, + "grad_norm": 0.01904296875, + "learning_rate": 0.02830020772603265, + "loss": 0.8191, + "num_input_tokens_seen": 18521776, + "step": 31920 + }, + { + "epoch": 4.754989574024426, + "grad_norm": 0.0133056640625, + "learning_rate": 0.028299306128133552, + "loss": 0.8044, + "num_input_tokens_seen": 18524624, + "step": 31925 + }, + { + "epoch": 4.755734286565386, + "grad_norm": 0.011474609375, + "learning_rate": 0.028298404305556194, + "loss": 0.8086, + "num_input_tokens_seen": 18527536, + "step": 31930 + }, + { + "epoch": 4.756478999106345, + "grad_norm": 0.0274658203125, + "learning_rate": 0.028297502258315812, + "loss": 0.8095, + "num_input_tokens_seen": 18530288, + "step": 31935 + }, + { + "epoch": 4.757223711647304, + "grad_norm": 0.018310546875, + "learning_rate": 0.02829659998642764, + "loss": 0.8011, + "num_input_tokens_seen": 18533264, + "step": 31940 + }, + { + "epoch": 4.757968424188263, + "grad_norm": 0.0185546875, + "learning_rate": 0.028295697489906926, + "loss": 0.7979, + "num_input_tokens_seen": 18536144, + "step": 31945 + }, + { + "epoch": 4.758713136729223, + "grad_norm": 0.0245361328125, + "learning_rate": 0.028294794768768918, + "loss": 0.8001, + "num_input_tokens_seen": 18538800, + "step": 31950 + }, + { + "epoch": 4.759457849270182, + "grad_norm": 0.018310546875, + "learning_rate": 0.028293891823028865, + "loss": 0.8055, + "num_input_tokens_seen": 18541456, + "step": 31955 + }, + { + "epoch": 4.760202561811141, + "grad_norm": 0.01202392578125, + "learning_rate": 0.02829298865270202, + "loss": 0.7872, + "num_input_tokens_seen": 18544176, + "step": 31960 + }, + { + "epoch": 4.7609472743521, + "grad_norm": 0.059814453125, + "learning_rate": 0.02829208525780364, + "loss": 0.8387, + "num_input_tokens_seen": 18548368, + "step": 31965 + }, + { + "epoch": 4.76169198689306, + "grad_norm": 0.017333984375, + "learning_rate": 0.028291181638348994, + "loss": 0.8076, + "num_input_tokens_seen": 18551216, + "step": 31970 + }, + { + "epoch": 4.762436699434018, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02829027779435334, + "loss": 0.8233, + "num_input_tokens_seen": 18553968, + "step": 31975 + }, + { + "epoch": 4.763181411974978, + "grad_norm": 0.017822265625, + "learning_rate": 0.02828937372583195, + "loss": 0.8026, + "num_input_tokens_seen": 18556816, + "step": 31980 + }, + { + "epoch": 4.763926124515937, + "grad_norm": 0.016845703125, + "learning_rate": 0.028288469432800098, + "loss": 0.7825, + "num_input_tokens_seen": 18559760, + "step": 31985 + }, + { + "epoch": 4.764670837056896, + "grad_norm": 0.0281982421875, + "learning_rate": 0.028287564915273056, + "loss": 0.7882, + "num_input_tokens_seen": 18562544, + "step": 31990 + }, + { + "epoch": 4.765415549597855, + "grad_norm": 0.018310546875, + "learning_rate": 0.02828666017326612, + "loss": 0.8033, + "num_input_tokens_seen": 18565488, + "step": 31995 + }, + { + "epoch": 4.766160262138815, + "grad_norm": 0.01348876953125, + "learning_rate": 0.028285755206794556, + "loss": 0.81, + "num_input_tokens_seen": 18568624, + "step": 32000 + }, + { + "epoch": 4.766904974679774, + "grad_norm": 0.019775390625, + "learning_rate": 0.028284850015873664, + "loss": 0.8, + "num_input_tokens_seen": 18571984, + "step": 32005 + }, + { + "epoch": 4.767649687220732, + "grad_norm": 0.019287109375, + "learning_rate": 0.028283944600518734, + "loss": 0.7962, + "num_input_tokens_seen": 18574800, + "step": 32010 + }, + { + "epoch": 4.768394399761692, + "grad_norm": 0.017578125, + "learning_rate": 0.02828303896074506, + "loss": 0.7807, + "num_input_tokens_seen": 18577552, + "step": 32015 + }, + { + "epoch": 4.769139112302652, + "grad_norm": 0.0196533203125, + "learning_rate": 0.028282133096567942, + "loss": 0.8294, + "num_input_tokens_seen": 18580688, + "step": 32020 + }, + { + "epoch": 4.76988382484361, + "grad_norm": 0.019287109375, + "learning_rate": 0.028281227008002684, + "loss": 0.7996, + "num_input_tokens_seen": 18583600, + "step": 32025 + }, + { + "epoch": 4.770628537384569, + "grad_norm": 0.02099609375, + "learning_rate": 0.028280320695064594, + "loss": 0.7954, + "num_input_tokens_seen": 18587024, + "step": 32030 + }, + { + "epoch": 4.771373249925529, + "grad_norm": 0.017822265625, + "learning_rate": 0.028279414157768988, + "loss": 0.7787, + "num_input_tokens_seen": 18589552, + "step": 32035 + }, + { + "epoch": 4.772117962466488, + "grad_norm": 0.0269775390625, + "learning_rate": 0.028278507396131174, + "loss": 0.8012, + "num_input_tokens_seen": 18592496, + "step": 32040 + }, + { + "epoch": 4.772862675007447, + "grad_norm": 0.018798828125, + "learning_rate": 0.02827760041016647, + "loss": 0.8029, + "num_input_tokens_seen": 18595248, + "step": 32045 + }, + { + "epoch": 4.773607387548406, + "grad_norm": 0.020263671875, + "learning_rate": 0.028276693199890204, + "loss": 0.8042, + "num_input_tokens_seen": 18598032, + "step": 32050 + }, + { + "epoch": 4.774352100089366, + "grad_norm": 0.0252685546875, + "learning_rate": 0.028275785765317703, + "loss": 0.7937, + "num_input_tokens_seen": 18601200, + "step": 32055 + }, + { + "epoch": 4.775096812630324, + "grad_norm": 0.031005859375, + "learning_rate": 0.028274878106464292, + "loss": 0.8054, + "num_input_tokens_seen": 18604176, + "step": 32060 + }, + { + "epoch": 4.775841525171284, + "grad_norm": 0.0322265625, + "learning_rate": 0.028273970223345306, + "loss": 0.8216, + "num_input_tokens_seen": 18607024, + "step": 32065 + }, + { + "epoch": 4.776586237712243, + "grad_norm": 0.0439453125, + "learning_rate": 0.028273062115976084, + "loss": 0.8277, + "num_input_tokens_seen": 18609648, + "step": 32070 + }, + { + "epoch": 4.777330950253202, + "grad_norm": 0.0198974609375, + "learning_rate": 0.028272153784371968, + "loss": 0.8159, + "num_input_tokens_seen": 18612720, + "step": 32075 + }, + { + "epoch": 4.778075662794161, + "grad_norm": 0.0223388671875, + "learning_rate": 0.028271245228548306, + "loss": 0.8006, + "num_input_tokens_seen": 18615408, + "step": 32080 + }, + { + "epoch": 4.778820375335121, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02827033644852044, + "loss": 0.7966, + "num_input_tokens_seen": 18618192, + "step": 32085 + }, + { + "epoch": 4.77956508787608, + "grad_norm": 0.01904296875, + "learning_rate": 0.028269427444303732, + "loss": 0.8077, + "num_input_tokens_seen": 18621104, + "step": 32090 + }, + { + "epoch": 4.780309800417039, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02826851821591353, + "loss": 0.778, + "num_input_tokens_seen": 18624112, + "step": 32095 + }, + { + "epoch": 4.781054512957998, + "grad_norm": 0.01251220703125, + "learning_rate": 0.0282676087633652, + "loss": 0.7933, + "num_input_tokens_seen": 18626864, + "step": 32100 + }, + { + "epoch": 4.781799225498958, + "grad_norm": 0.0245361328125, + "learning_rate": 0.028266699086674105, + "loss": 0.8197, + "num_input_tokens_seen": 18629936, + "step": 32105 + }, + { + "epoch": 4.782543938039916, + "grad_norm": 0.021728515625, + "learning_rate": 0.028265789185855608, + "loss": 0.8199, + "num_input_tokens_seen": 18632784, + "step": 32110 + }, + { + "epoch": 4.783288650580876, + "grad_norm": 0.0191650390625, + "learning_rate": 0.028264879060925088, + "loss": 0.8123, + "num_input_tokens_seen": 18635728, + "step": 32115 + }, + { + "epoch": 4.784033363121835, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02826396871189792, + "loss": 0.8023, + "num_input_tokens_seen": 18638448, + "step": 32120 + }, + { + "epoch": 4.7847780756627944, + "grad_norm": 0.021240234375, + "learning_rate": 0.028263058138789484, + "loss": 0.8022, + "num_input_tokens_seen": 18641296, + "step": 32125 + }, + { + "epoch": 4.785522788203753, + "grad_norm": 0.014404296875, + "learning_rate": 0.028262147341615154, + "loss": 0.7918, + "num_input_tokens_seen": 18644528, + "step": 32130 + }, + { + "epoch": 4.786267500744713, + "grad_norm": 0.021728515625, + "learning_rate": 0.028261236320390327, + "loss": 0.7685, + "num_input_tokens_seen": 18647760, + "step": 32135 + }, + { + "epoch": 4.787012213285672, + "grad_norm": 0.013427734375, + "learning_rate": 0.02826032507513039, + "loss": 0.8114, + "num_input_tokens_seen": 18650736, + "step": 32140 + }, + { + "epoch": 4.787756925826631, + "grad_norm": 0.02978515625, + "learning_rate": 0.028259413605850738, + "loss": 0.7854, + "num_input_tokens_seen": 18653712, + "step": 32145 + }, + { + "epoch": 4.78850163836759, + "grad_norm": 0.0306396484375, + "learning_rate": 0.028258501912566774, + "loss": 0.8245, + "num_input_tokens_seen": 18656752, + "step": 32150 + }, + { + "epoch": 4.78924635090855, + "grad_norm": 0.0115966796875, + "learning_rate": 0.028257589995293892, + "loss": 0.7764, + "num_input_tokens_seen": 18659632, + "step": 32155 + }, + { + "epoch": 4.789991063449508, + "grad_norm": 0.02490234375, + "learning_rate": 0.0282566778540475, + "loss": 0.8273, + "num_input_tokens_seen": 18662480, + "step": 32160 + }, + { + "epoch": 4.790735775990468, + "grad_norm": 0.018798828125, + "learning_rate": 0.02825576548884301, + "loss": 0.801, + "num_input_tokens_seen": 18665232, + "step": 32165 + }, + { + "epoch": 4.791480488531427, + "grad_norm": 0.0272216796875, + "learning_rate": 0.028254852899695832, + "loss": 0.8065, + "num_input_tokens_seen": 18668208, + "step": 32170 + }, + { + "epoch": 4.792225201072386, + "grad_norm": 0.017822265625, + "learning_rate": 0.028253940086621392, + "loss": 0.7951, + "num_input_tokens_seen": 18671280, + "step": 32175 + }, + { + "epoch": 4.792969913613345, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0282530270496351, + "loss": 0.8066, + "num_input_tokens_seen": 18674352, + "step": 32180 + }, + { + "epoch": 4.793714626154305, + "grad_norm": 0.02392578125, + "learning_rate": 0.02825211378875239, + "loss": 0.7964, + "num_input_tokens_seen": 18677296, + "step": 32185 + }, + { + "epoch": 4.794459338695264, + "grad_norm": 0.01141357421875, + "learning_rate": 0.028251200303988682, + "loss": 0.7923, + "num_input_tokens_seen": 18681008, + "step": 32190 + }, + { + "epoch": 4.795204051236222, + "grad_norm": 0.0225830078125, + "learning_rate": 0.028250286595359417, + "loss": 0.8401, + "num_input_tokens_seen": 18684112, + "step": 32195 + }, + { + "epoch": 4.795948763777182, + "grad_norm": 0.0234375, + "learning_rate": 0.02824937266288002, + "loss": 0.7895, + "num_input_tokens_seen": 18686864, + "step": 32200 + }, + { + "epoch": 4.796693476318142, + "grad_norm": 0.033447265625, + "learning_rate": 0.028248458506565946, + "loss": 0.8312, + "num_input_tokens_seen": 18689840, + "step": 32205 + }, + { + "epoch": 4.7974381888591004, + "grad_norm": 0.0194091796875, + "learning_rate": 0.028247544126432624, + "loss": 0.7992, + "num_input_tokens_seen": 18692752, + "step": 32210 + }, + { + "epoch": 4.798182901400059, + "grad_norm": 0.030517578125, + "learning_rate": 0.02824662952249551, + "loss": 0.7955, + "num_input_tokens_seen": 18695408, + "step": 32215 + }, + { + "epoch": 4.798927613941019, + "grad_norm": 0.0283203125, + "learning_rate": 0.02824571469477006, + "loss": 0.7889, + "num_input_tokens_seen": 18698224, + "step": 32220 + }, + { + "epoch": 4.799672326481978, + "grad_norm": 0.01361083984375, + "learning_rate": 0.028244799643271715, + "loss": 0.8008, + "num_input_tokens_seen": 18701072, + "step": 32225 + }, + { + "epoch": 4.800417039022937, + "grad_norm": 0.0196533203125, + "learning_rate": 0.028243884368015944, + "loss": 0.8039, + "num_input_tokens_seen": 18703824, + "step": 32230 + }, + { + "epoch": 4.801161751563896, + "grad_norm": 0.027587890625, + "learning_rate": 0.02824296886901821, + "loss": 0.8013, + "num_input_tokens_seen": 18706992, + "step": 32235 + }, + { + "epoch": 4.801906464104856, + "grad_norm": 0.025390625, + "learning_rate": 0.028242053146293973, + "loss": 0.8058, + "num_input_tokens_seen": 18709968, + "step": 32240 + }, + { + "epoch": 4.802651176645814, + "grad_norm": 0.038330078125, + "learning_rate": 0.028241137199858702, + "loss": 0.8135, + "num_input_tokens_seen": 18712816, + "step": 32245 + }, + { + "epoch": 4.803395889186774, + "grad_norm": 0.01177978515625, + "learning_rate": 0.02824022102972788, + "loss": 0.801, + "num_input_tokens_seen": 18716944, + "step": 32250 + }, + { + "epoch": 4.804140601727733, + "grad_norm": 0.01397705078125, + "learning_rate": 0.028239304635916986, + "loss": 0.798, + "num_input_tokens_seen": 18719792, + "step": 32255 + }, + { + "epoch": 4.8048853142686925, + "grad_norm": 0.0126953125, + "learning_rate": 0.02823838801844149, + "loss": 0.797, + "num_input_tokens_seen": 18722512, + "step": 32260 + }, + { + "epoch": 4.805630026809651, + "grad_norm": 0.030029296875, + "learning_rate": 0.028237471177316883, + "loss": 0.8022, + "num_input_tokens_seen": 18725424, + "step": 32265 + }, + { + "epoch": 4.806374739350611, + "grad_norm": 0.0196533203125, + "learning_rate": 0.028236554112558657, + "loss": 0.8091, + "num_input_tokens_seen": 18728304, + "step": 32270 + }, + { + "epoch": 4.80711945189157, + "grad_norm": 0.01239013671875, + "learning_rate": 0.0282356368241823, + "loss": 0.8153, + "num_input_tokens_seen": 18731152, + "step": 32275 + }, + { + "epoch": 4.807864164432529, + "grad_norm": 0.0245361328125, + "learning_rate": 0.028234719312203314, + "loss": 0.8129, + "num_input_tokens_seen": 18733968, + "step": 32280 + }, + { + "epoch": 4.808608876973488, + "grad_norm": 0.0233154296875, + "learning_rate": 0.028233801576637192, + "loss": 0.7958, + "num_input_tokens_seen": 18736816, + "step": 32285 + }, + { + "epoch": 4.809353589514448, + "grad_norm": 0.0269775390625, + "learning_rate": 0.028232883617499448, + "loss": 0.7909, + "num_input_tokens_seen": 18739696, + "step": 32290 + }, + { + "epoch": 4.8100983020554064, + "grad_norm": 0.037353515625, + "learning_rate": 0.02823196543480558, + "loss": 0.8209, + "num_input_tokens_seen": 18742672, + "step": 32295 + }, + { + "epoch": 4.810843014596366, + "grad_norm": 0.019287109375, + "learning_rate": 0.028231047028571106, + "loss": 0.7939, + "num_input_tokens_seen": 18745808, + "step": 32300 + }, + { + "epoch": 4.811587727137325, + "grad_norm": 0.01336669921875, + "learning_rate": 0.02823012839881154, + "loss": 0.7962, + "num_input_tokens_seen": 18748688, + "step": 32305 + }, + { + "epoch": 4.8123324396782845, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0282292095455424, + "loss": 0.793, + "num_input_tokens_seen": 18751664, + "step": 32310 + }, + { + "epoch": 4.813077152219243, + "grad_norm": 0.024658203125, + "learning_rate": 0.028228290468779213, + "loss": 0.7891, + "num_input_tokens_seen": 18754384, + "step": 32315 + }, + { + "epoch": 4.813821864760203, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0282273711685375, + "loss": 0.7974, + "num_input_tokens_seen": 18757168, + "step": 32320 + }, + { + "epoch": 4.814566577301162, + "grad_norm": 0.011962890625, + "learning_rate": 0.0282264516448328, + "loss": 0.7856, + "num_input_tokens_seen": 18760368, + "step": 32325 + }, + { + "epoch": 4.815311289842121, + "grad_norm": 0.0311279296875, + "learning_rate": 0.028225531897680636, + "loss": 0.8069, + "num_input_tokens_seen": 18763472, + "step": 32330 + }, + { + "epoch": 4.81605600238308, + "grad_norm": 0.0177001953125, + "learning_rate": 0.028224611927096556, + "loss": 0.7978, + "num_input_tokens_seen": 18766352, + "step": 32335 + }, + { + "epoch": 4.816800714924039, + "grad_norm": 0.0279541015625, + "learning_rate": 0.028223691733096096, + "loss": 0.7804, + "num_input_tokens_seen": 18769264, + "step": 32340 + }, + { + "epoch": 4.8175454274649985, + "grad_norm": 0.0244140625, + "learning_rate": 0.028222771315694808, + "loss": 0.8194, + "num_input_tokens_seen": 18772272, + "step": 32345 + }, + { + "epoch": 4.818290140005958, + "grad_norm": 0.0260009765625, + "learning_rate": 0.028221850674908237, + "loss": 0.805, + "num_input_tokens_seen": 18775120, + "step": 32350 + }, + { + "epoch": 4.819034852546917, + "grad_norm": 0.01904296875, + "learning_rate": 0.028220929810751933, + "loss": 0.8071, + "num_input_tokens_seen": 18778032, + "step": 32355 + }, + { + "epoch": 4.819779565087876, + "grad_norm": 0.0185546875, + "learning_rate": 0.028220008723241458, + "loss": 0.8202, + "num_input_tokens_seen": 18780624, + "step": 32360 + }, + { + "epoch": 4.820524277628835, + "grad_norm": 0.019775390625, + "learning_rate": 0.028219087412392374, + "loss": 0.7639, + "num_input_tokens_seen": 18783344, + "step": 32365 + }, + { + "epoch": 4.821268990169795, + "grad_norm": 0.02978515625, + "learning_rate": 0.028218165878220243, + "loss": 0.8264, + "num_input_tokens_seen": 18786128, + "step": 32370 + }, + { + "epoch": 4.822013702710754, + "grad_norm": 0.020751953125, + "learning_rate": 0.028217244120740632, + "loss": 0.7867, + "num_input_tokens_seen": 18789008, + "step": 32375 + }, + { + "epoch": 4.8227584152517124, + "grad_norm": 0.01611328125, + "learning_rate": 0.028216322139969115, + "loss": 0.8322, + "num_input_tokens_seen": 18792080, + "step": 32380 + }, + { + "epoch": 4.823503127792672, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02821539993592127, + "loss": 0.7997, + "num_input_tokens_seen": 18795056, + "step": 32385 + }, + { + "epoch": 4.824247840333631, + "grad_norm": 0.015869140625, + "learning_rate": 0.028214477508612675, + "loss": 0.8168, + "num_input_tokens_seen": 18797648, + "step": 32390 + }, + { + "epoch": 4.8249925528745905, + "grad_norm": 0.01226806640625, + "learning_rate": 0.02821355485805891, + "loss": 0.7977, + "num_input_tokens_seen": 18800528, + "step": 32395 + }, + { + "epoch": 4.825737265415549, + "grad_norm": 0.01544189453125, + "learning_rate": 0.02821263198427557, + "loss": 0.8006, + "num_input_tokens_seen": 18803760, + "step": 32400 + }, + { + "epoch": 4.826481977956509, + "grad_norm": 0.023681640625, + "learning_rate": 0.02821170888727824, + "loss": 0.7975, + "num_input_tokens_seen": 18806928, + "step": 32405 + }, + { + "epoch": 4.827226690497468, + "grad_norm": 0.047119140625, + "learning_rate": 0.028210785567082515, + "loss": 0.813, + "num_input_tokens_seen": 18810096, + "step": 32410 + }, + { + "epoch": 4.827971403038427, + "grad_norm": 0.0284423828125, + "learning_rate": 0.028209862023703996, + "loss": 0.8121, + "num_input_tokens_seen": 18813264, + "step": 32415 + }, + { + "epoch": 4.828716115579386, + "grad_norm": 0.0235595703125, + "learning_rate": 0.028208938257158277, + "loss": 0.8017, + "num_input_tokens_seen": 18816240, + "step": 32420 + }, + { + "epoch": 4.829460828120346, + "grad_norm": 0.0284423828125, + "learning_rate": 0.02820801426746098, + "loss": 0.809, + "num_input_tokens_seen": 18819056, + "step": 32425 + }, + { + "epoch": 4.8302055406613045, + "grad_norm": 0.020751953125, + "learning_rate": 0.0282070900546277, + "loss": 0.7784, + "num_input_tokens_seen": 18822096, + "step": 32430 + }, + { + "epoch": 4.830950253202264, + "grad_norm": 0.021728515625, + "learning_rate": 0.028206165618674058, + "loss": 0.7875, + "num_input_tokens_seen": 18825008, + "step": 32435 + }, + { + "epoch": 4.831694965743223, + "grad_norm": 0.0245361328125, + "learning_rate": 0.028205240959615675, + "loss": 0.8094, + "num_input_tokens_seen": 18827664, + "step": 32440 + }, + { + "epoch": 4.8324396782841825, + "grad_norm": 0.0299072265625, + "learning_rate": 0.028204316077468164, + "loss": 0.8, + "num_input_tokens_seen": 18830832, + "step": 32445 + }, + { + "epoch": 4.833184390825141, + "grad_norm": 0.01287841796875, + "learning_rate": 0.02820339097224715, + "loss": 0.8197, + "num_input_tokens_seen": 18833680, + "step": 32450 + }, + { + "epoch": 4.833929103366101, + "grad_norm": 0.024658203125, + "learning_rate": 0.028202465643968267, + "loss": 0.8028, + "num_input_tokens_seen": 18836752, + "step": 32455 + }, + { + "epoch": 4.83467381590706, + "grad_norm": 0.031494140625, + "learning_rate": 0.028201540092647148, + "loss": 0.7981, + "num_input_tokens_seen": 18839568, + "step": 32460 + }, + { + "epoch": 4.835418528448019, + "grad_norm": 0.046142578125, + "learning_rate": 0.02820061431829942, + "loss": 0.8101, + "num_input_tokens_seen": 18842608, + "step": 32465 + }, + { + "epoch": 4.836163240988978, + "grad_norm": 0.0810546875, + "learning_rate": 0.028199688320940736, + "loss": 0.7891, + "num_input_tokens_seen": 18845904, + "step": 32470 + }, + { + "epoch": 4.836907953529938, + "grad_norm": 0.1328125, + "learning_rate": 0.028198762100586732, + "loss": 0.7731, + "num_input_tokens_seen": 18848880, + "step": 32475 + }, + { + "epoch": 4.8376526660708965, + "grad_norm": 0.08544921875, + "learning_rate": 0.028197835657253055, + "loss": 0.8964, + "num_input_tokens_seen": 18851824, + "step": 32480 + }, + { + "epoch": 4.838397378611856, + "grad_norm": 0.0267333984375, + "learning_rate": 0.02819690899095536, + "loss": 0.8163, + "num_input_tokens_seen": 18854672, + "step": 32485 + }, + { + "epoch": 4.839142091152815, + "grad_norm": 0.0238037109375, + "learning_rate": 0.028195982101709302, + "loss": 0.8086, + "num_input_tokens_seen": 18857712, + "step": 32490 + }, + { + "epoch": 4.8398868036937746, + "grad_norm": 0.01226806640625, + "learning_rate": 0.028195054989530535, + "loss": 0.7728, + "num_input_tokens_seen": 18860528, + "step": 32495 + }, + { + "epoch": 4.840631516234733, + "grad_norm": 0.0208740234375, + "learning_rate": 0.028194127654434725, + "loss": 0.7847, + "num_input_tokens_seen": 18863312, + "step": 32500 + }, + { + "epoch": 4.841376228775693, + "grad_norm": 0.0145263671875, + "learning_rate": 0.028193200096437537, + "loss": 0.7742, + "num_input_tokens_seen": 18866192, + "step": 32505 + }, + { + "epoch": 4.842120941316652, + "grad_norm": 0.0322265625, + "learning_rate": 0.028192272315554642, + "loss": 0.7712, + "num_input_tokens_seen": 18869072, + "step": 32510 + }, + { + "epoch": 4.842865653857611, + "grad_norm": 0.025390625, + "learning_rate": 0.02819134431180172, + "loss": 0.8122, + "num_input_tokens_seen": 18872176, + "step": 32515 + }, + { + "epoch": 4.84361036639857, + "grad_norm": 0.035400390625, + "learning_rate": 0.02819041608519444, + "loss": 0.7936, + "num_input_tokens_seen": 18874896, + "step": 32520 + }, + { + "epoch": 4.844355078939529, + "grad_norm": 0.033447265625, + "learning_rate": 0.028189487635748484, + "loss": 0.7932, + "num_input_tokens_seen": 18877552, + "step": 32525 + }, + { + "epoch": 4.8450997914804885, + "grad_norm": 0.05712890625, + "learning_rate": 0.02818855896347954, + "loss": 0.815, + "num_input_tokens_seen": 18880592, + "step": 32530 + }, + { + "epoch": 4.845844504021448, + "grad_norm": 0.02490234375, + "learning_rate": 0.028187630068403302, + "loss": 0.8214, + "num_input_tokens_seen": 18883344, + "step": 32535 + }, + { + "epoch": 4.846589216562407, + "grad_norm": 0.03271484375, + "learning_rate": 0.028186700950535454, + "loss": 0.7966, + "num_input_tokens_seen": 18886512, + "step": 32540 + }, + { + "epoch": 4.847333929103366, + "grad_norm": 0.01519775390625, + "learning_rate": 0.028185771609891695, + "loss": 0.789, + "num_input_tokens_seen": 18889616, + "step": 32545 + }, + { + "epoch": 4.848078641644325, + "grad_norm": 0.019775390625, + "learning_rate": 0.02818484204648772, + "loss": 0.7863, + "num_input_tokens_seen": 18892400, + "step": 32550 + }, + { + "epoch": 4.848823354185284, + "grad_norm": 0.020263671875, + "learning_rate": 0.02818391226033925, + "loss": 0.789, + "num_input_tokens_seen": 18895184, + "step": 32555 + }, + { + "epoch": 4.849568066726244, + "grad_norm": 0.01416015625, + "learning_rate": 0.02818298225146198, + "loss": 0.7879, + "num_input_tokens_seen": 18898000, + "step": 32560 + }, + { + "epoch": 4.8503127792672025, + "grad_norm": 0.0537109375, + "learning_rate": 0.028182052019871617, + "loss": 0.8253, + "num_input_tokens_seen": 18900944, + "step": 32565 + }, + { + "epoch": 4.851057491808162, + "grad_norm": 0.021484375, + "learning_rate": 0.028181121565583885, + "loss": 0.7948, + "num_input_tokens_seen": 18904112, + "step": 32570 + }, + { + "epoch": 4.851802204349121, + "grad_norm": 0.019775390625, + "learning_rate": 0.028180190888614506, + "loss": 0.7969, + "num_input_tokens_seen": 18907024, + "step": 32575 + }, + { + "epoch": 4.8525469168900806, + "grad_norm": 0.01953125, + "learning_rate": 0.028179259988979192, + "loss": 0.8088, + "num_input_tokens_seen": 18909968, + "step": 32580 + }, + { + "epoch": 4.853291629431039, + "grad_norm": 0.0223388671875, + "learning_rate": 0.028178328866693678, + "loss": 0.8195, + "num_input_tokens_seen": 18912848, + "step": 32585 + }, + { + "epoch": 4.854036341971999, + "grad_norm": 0.0308837890625, + "learning_rate": 0.02817739752177369, + "loss": 0.7934, + "num_input_tokens_seen": 18915504, + "step": 32590 + }, + { + "epoch": 4.854781054512958, + "grad_norm": 0.01300048828125, + "learning_rate": 0.028176465954234963, + "loss": 0.8145, + "num_input_tokens_seen": 18918064, + "step": 32595 + }, + { + "epoch": 4.855525767053917, + "grad_norm": 0.01806640625, + "learning_rate": 0.028175534164093238, + "loss": 0.7842, + "num_input_tokens_seen": 18921072, + "step": 32600 + }, + { + "epoch": 4.856270479594876, + "grad_norm": 0.01409912109375, + "learning_rate": 0.028174602151364254, + "loss": 0.7709, + "num_input_tokens_seen": 18924016, + "step": 32605 + }, + { + "epoch": 4.857015192135836, + "grad_norm": 0.03369140625, + "learning_rate": 0.028173669916063755, + "loss": 0.8172, + "num_input_tokens_seen": 18926768, + "step": 32610 + }, + { + "epoch": 4.8577599046767945, + "grad_norm": 0.02294921875, + "learning_rate": 0.028172737458207493, + "loss": 0.8304, + "num_input_tokens_seen": 18929296, + "step": 32615 + }, + { + "epoch": 4.858504617217754, + "grad_norm": 0.019287109375, + "learning_rate": 0.028171804777811223, + "loss": 0.7872, + "num_input_tokens_seen": 18932624, + "step": 32620 + }, + { + "epoch": 4.859249329758713, + "grad_norm": 0.031982421875, + "learning_rate": 0.028170871874890695, + "loss": 0.8162, + "num_input_tokens_seen": 18935536, + "step": 32625 + }, + { + "epoch": 4.859994042299673, + "grad_norm": 0.0223388671875, + "learning_rate": 0.028169938749461672, + "loss": 0.8199, + "num_input_tokens_seen": 18938480, + "step": 32630 + }, + { + "epoch": 4.860738754840631, + "grad_norm": 0.0191650390625, + "learning_rate": 0.028169005401539923, + "loss": 0.8083, + "num_input_tokens_seen": 18941328, + "step": 32635 + }, + { + "epoch": 4.861483467381591, + "grad_norm": 0.021484375, + "learning_rate": 0.02816807183114121, + "loss": 0.8076, + "num_input_tokens_seen": 18944432, + "step": 32640 + }, + { + "epoch": 4.86222817992255, + "grad_norm": 0.0198974609375, + "learning_rate": 0.028167138038281306, + "loss": 0.8022, + "num_input_tokens_seen": 18947088, + "step": 32645 + }, + { + "epoch": 4.862972892463509, + "grad_norm": 0.01953125, + "learning_rate": 0.02816620402297599, + "loss": 0.804, + "num_input_tokens_seen": 18949968, + "step": 32650 + }, + { + "epoch": 4.863717605004468, + "grad_norm": 0.0142822265625, + "learning_rate": 0.028165269785241035, + "loss": 0.782, + "num_input_tokens_seen": 18952880, + "step": 32655 + }, + { + "epoch": 4.864462317545428, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02816433532509223, + "loss": 0.8237, + "num_input_tokens_seen": 18955824, + "step": 32660 + }, + { + "epoch": 4.8652070300863866, + "grad_norm": 0.02685546875, + "learning_rate": 0.028163400642545355, + "loss": 0.8142, + "num_input_tokens_seen": 18959088, + "step": 32665 + }, + { + "epoch": 4.865951742627346, + "grad_norm": 0.0303955078125, + "learning_rate": 0.02816246573761621, + "loss": 0.8109, + "num_input_tokens_seen": 18961808, + "step": 32670 + }, + { + "epoch": 4.866696455168305, + "grad_norm": 0.0128173828125, + "learning_rate": 0.028161530610320583, + "loss": 0.7953, + "num_input_tokens_seen": 18964528, + "step": 32675 + }, + { + "epoch": 4.867441167709265, + "grad_norm": 0.0301513671875, + "learning_rate": 0.02816059526067427, + "loss": 0.7945, + "num_input_tokens_seen": 18967280, + "step": 32680 + }, + { + "epoch": 4.868185880250223, + "grad_norm": 0.021240234375, + "learning_rate": 0.028159659688693082, + "loss": 0.7788, + "num_input_tokens_seen": 18969936, + "step": 32685 + }, + { + "epoch": 4.868930592791182, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02815872389439281, + "loss": 0.8082, + "num_input_tokens_seen": 18972752, + "step": 32690 + }, + { + "epoch": 4.869675305332142, + "grad_norm": 0.0174560546875, + "learning_rate": 0.02815778787778928, + "loss": 0.7887, + "num_input_tokens_seen": 18975536, + "step": 32695 + }, + { + "epoch": 4.870420017873101, + "grad_norm": 0.0115966796875, + "learning_rate": 0.028156851638898292, + "loss": 0.8109, + "num_input_tokens_seen": 18978192, + "step": 32700 + }, + { + "epoch": 4.87116473041406, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02815591517773567, + "loss": 0.8005, + "num_input_tokens_seen": 18981136, + "step": 32705 + }, + { + "epoch": 4.871909442955019, + "grad_norm": 0.01953125, + "learning_rate": 0.02815497849431723, + "loss": 0.8, + "num_input_tokens_seen": 18984048, + "step": 32710 + }, + { + "epoch": 4.872654155495979, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0281540415886588, + "loss": 0.8125, + "num_input_tokens_seen": 18986768, + "step": 32715 + }, + { + "epoch": 4.873398868036938, + "grad_norm": 0.01806640625, + "learning_rate": 0.028153104460776204, + "loss": 0.8098, + "num_input_tokens_seen": 18989456, + "step": 32720 + }, + { + "epoch": 4.874143580577897, + "grad_norm": 0.018798828125, + "learning_rate": 0.02815216711068528, + "loss": 0.7891, + "num_input_tokens_seen": 18992464, + "step": 32725 + }, + { + "epoch": 4.874888293118856, + "grad_norm": 0.02783203125, + "learning_rate": 0.028151229538401858, + "loss": 0.8087, + "num_input_tokens_seen": 18995312, + "step": 32730 + }, + { + "epoch": 4.875633005659815, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02815029174394178, + "loss": 0.7936, + "num_input_tokens_seen": 18998384, + "step": 32735 + }, + { + "epoch": 4.876377718200774, + "grad_norm": 0.01251220703125, + "learning_rate": 0.028149353727320886, + "loss": 0.8027, + "num_input_tokens_seen": 19001488, + "step": 32740 + }, + { + "epoch": 4.877122430741734, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02814841548855503, + "loss": 0.8034, + "num_input_tokens_seen": 19004560, + "step": 32745 + }, + { + "epoch": 4.8778671432826926, + "grad_norm": 0.01806640625, + "learning_rate": 0.028147477027660052, + "loss": 0.8192, + "num_input_tokens_seen": 19007504, + "step": 32750 + }, + { + "epoch": 4.878611855823652, + "grad_norm": 0.023193359375, + "learning_rate": 0.028146538344651816, + "loss": 0.7962, + "num_input_tokens_seen": 19010224, + "step": 32755 + }, + { + "epoch": 4.879356568364611, + "grad_norm": 0.021484375, + "learning_rate": 0.028145599439546178, + "loss": 0.7859, + "num_input_tokens_seen": 19012752, + "step": 32760 + }, + { + "epoch": 4.880101280905571, + "grad_norm": 0.031494140625, + "learning_rate": 0.028144660312358995, + "loss": 0.8155, + "num_input_tokens_seen": 19015952, + "step": 32765 + }, + { + "epoch": 4.880845993446529, + "grad_norm": 0.02587890625, + "learning_rate": 0.028143720963106134, + "loss": 0.7949, + "num_input_tokens_seen": 19018768, + "step": 32770 + }, + { + "epoch": 4.881590705987489, + "grad_norm": 0.019287109375, + "learning_rate": 0.02814278139180347, + "loss": 0.8108, + "num_input_tokens_seen": 19021360, + "step": 32775 + }, + { + "epoch": 4.882335418528448, + "grad_norm": 0.0196533203125, + "learning_rate": 0.028141841598466867, + "loss": 0.7756, + "num_input_tokens_seen": 19024080, + "step": 32780 + }, + { + "epoch": 4.883080131069407, + "grad_norm": 0.0255126953125, + "learning_rate": 0.02814090158311221, + "loss": 0.8099, + "num_input_tokens_seen": 19026864, + "step": 32785 + }, + { + "epoch": 4.883824843610366, + "grad_norm": 0.021240234375, + "learning_rate": 0.028139961345755377, + "loss": 0.8072, + "num_input_tokens_seen": 19029648, + "step": 32790 + }, + { + "epoch": 4.884569556151326, + "grad_norm": 0.033447265625, + "learning_rate": 0.02813902088641225, + "loss": 0.7946, + "num_input_tokens_seen": 19032592, + "step": 32795 + }, + { + "epoch": 4.885314268692285, + "grad_norm": 0.026611328125, + "learning_rate": 0.028138080205098722, + "loss": 0.8092, + "num_input_tokens_seen": 19035568, + "step": 32800 + }, + { + "epoch": 4.886058981233244, + "grad_norm": 0.025146484375, + "learning_rate": 0.02813713930183068, + "loss": 0.8013, + "num_input_tokens_seen": 19038384, + "step": 32805 + }, + { + "epoch": 4.886803693774203, + "grad_norm": 0.0191650390625, + "learning_rate": 0.028136198176624022, + "loss": 0.7963, + "num_input_tokens_seen": 19041264, + "step": 32810 + }, + { + "epoch": 4.887548406315163, + "grad_norm": 0.01806640625, + "learning_rate": 0.02813525682949465, + "loss": 0.7954, + "num_input_tokens_seen": 19044208, + "step": 32815 + }, + { + "epoch": 4.888293118856121, + "grad_norm": 0.020263671875, + "learning_rate": 0.028134315260458457, + "loss": 0.7975, + "num_input_tokens_seen": 19047408, + "step": 32820 + }, + { + "epoch": 4.889037831397081, + "grad_norm": 0.0208740234375, + "learning_rate": 0.02813337346953136, + "loss": 0.7973, + "num_input_tokens_seen": 19050032, + "step": 32825 + }, + { + "epoch": 4.88978254393804, + "grad_norm": 0.0205078125, + "learning_rate": 0.02813243145672927, + "loss": 0.7887, + "num_input_tokens_seen": 19053104, + "step": 32830 + }, + { + "epoch": 4.890527256478999, + "grad_norm": 0.01318359375, + "learning_rate": 0.028131489222068096, + "loss": 0.828, + "num_input_tokens_seen": 19055792, + "step": 32835 + }, + { + "epoch": 4.891271969019958, + "grad_norm": 0.0257568359375, + "learning_rate": 0.028130546765563754, + "loss": 0.7873, + "num_input_tokens_seen": 19058800, + "step": 32840 + }, + { + "epoch": 4.892016681560918, + "grad_norm": 0.0235595703125, + "learning_rate": 0.028129604087232172, + "loss": 0.8199, + "num_input_tokens_seen": 19061488, + "step": 32845 + }, + { + "epoch": 4.892761394101877, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02812866118708927, + "loss": 0.7845, + "num_input_tokens_seen": 19064656, + "step": 32850 + }, + { + "epoch": 4.893506106642836, + "grad_norm": 0.0213623046875, + "learning_rate": 0.028127718065150985, + "loss": 0.7891, + "num_input_tokens_seen": 19067632, + "step": 32855 + }, + { + "epoch": 4.894250819183795, + "grad_norm": 0.0224609375, + "learning_rate": 0.028126774721433243, + "loss": 0.8083, + "num_input_tokens_seen": 19070320, + "step": 32860 + }, + { + "epoch": 4.894995531724755, + "grad_norm": 0.028564453125, + "learning_rate": 0.028125831155951988, + "loss": 0.8111, + "num_input_tokens_seen": 19073424, + "step": 32865 + }, + { + "epoch": 4.895740244265713, + "grad_norm": 0.0299072265625, + "learning_rate": 0.028124887368723154, + "loss": 0.7903, + "num_input_tokens_seen": 19076400, + "step": 32870 + }, + { + "epoch": 4.896484956806672, + "grad_norm": 0.012939453125, + "learning_rate": 0.028123943359762687, + "loss": 0.8127, + "num_input_tokens_seen": 19079152, + "step": 32875 + }, + { + "epoch": 4.897229669347632, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02812299912908654, + "loss": 0.7906, + "num_input_tokens_seen": 19081936, + "step": 32880 + }, + { + "epoch": 4.8979743818885915, + "grad_norm": 0.013427734375, + "learning_rate": 0.028122054676710653, + "loss": 0.8124, + "num_input_tokens_seen": 19084656, + "step": 32885 + }, + { + "epoch": 4.89871909442955, + "grad_norm": 0.020263671875, + "learning_rate": 0.028121110002650992, + "loss": 0.806, + "num_input_tokens_seen": 19087696, + "step": 32890 + }, + { + "epoch": 4.899463806970509, + "grad_norm": 0.0234375, + "learning_rate": 0.028120165106923514, + "loss": 0.7971, + "num_input_tokens_seen": 19090672, + "step": 32895 + }, + { + "epoch": 4.900208519511469, + "grad_norm": 0.0228271484375, + "learning_rate": 0.028119219989544177, + "loss": 0.8185, + "num_input_tokens_seen": 19093392, + "step": 32900 + }, + { + "epoch": 4.900953232052427, + "grad_norm": 0.0223388671875, + "learning_rate": 0.028118274650528956, + "loss": 0.8013, + "num_input_tokens_seen": 19096112, + "step": 32905 + }, + { + "epoch": 4.901697944593387, + "grad_norm": 0.02001953125, + "learning_rate": 0.028117329089893815, + "loss": 0.7918, + "num_input_tokens_seen": 19098960, + "step": 32910 + }, + { + "epoch": 4.902442657134346, + "grad_norm": 0.01318359375, + "learning_rate": 0.028116383307654733, + "loss": 0.8329, + "num_input_tokens_seen": 19101680, + "step": 32915 + }, + { + "epoch": 4.903187369675305, + "grad_norm": 0.031005859375, + "learning_rate": 0.028115437303827685, + "loss": 0.8178, + "num_input_tokens_seen": 19104272, + "step": 32920 + }, + { + "epoch": 4.903932082216264, + "grad_norm": 0.022216796875, + "learning_rate": 0.028114491078428654, + "loss": 0.7893, + "num_input_tokens_seen": 19107216, + "step": 32925 + }, + { + "epoch": 4.904676794757224, + "grad_norm": 0.02587890625, + "learning_rate": 0.028113544631473628, + "loss": 0.8108, + "num_input_tokens_seen": 19110064, + "step": 32930 + }, + { + "epoch": 4.905421507298183, + "grad_norm": 0.019287109375, + "learning_rate": 0.02811259796297859, + "loss": 0.7889, + "num_input_tokens_seen": 19113072, + "step": 32935 + }, + { + "epoch": 4.906166219839142, + "grad_norm": 0.0205078125, + "learning_rate": 0.028111651072959536, + "loss": 0.8209, + "num_input_tokens_seen": 19116048, + "step": 32940 + }, + { + "epoch": 4.906910932380101, + "grad_norm": 0.01513671875, + "learning_rate": 0.028110703961432466, + "loss": 0.812, + "num_input_tokens_seen": 19119152, + "step": 32945 + }, + { + "epoch": 4.907655644921061, + "grad_norm": 0.0179443359375, + "learning_rate": 0.028109756628413377, + "loss": 0.8114, + "num_input_tokens_seen": 19121968, + "step": 32950 + }, + { + "epoch": 4.908400357462019, + "grad_norm": 0.018798828125, + "learning_rate": 0.02810880907391827, + "loss": 0.8108, + "num_input_tokens_seen": 19124784, + "step": 32955 + }, + { + "epoch": 4.909145070002979, + "grad_norm": 0.0196533203125, + "learning_rate": 0.028107861297963162, + "loss": 0.8127, + "num_input_tokens_seen": 19127888, + "step": 32960 + }, + { + "epoch": 4.909889782543938, + "grad_norm": 0.0191650390625, + "learning_rate": 0.028106913300564057, + "loss": 0.7876, + "num_input_tokens_seen": 19130960, + "step": 32965 + }, + { + "epoch": 4.9106344950848975, + "grad_norm": 0.01434326171875, + "learning_rate": 0.028105965081736975, + "loss": 0.7925, + "num_input_tokens_seen": 19133872, + "step": 32970 + }, + { + "epoch": 4.911379207625856, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02810501664149793, + "loss": 0.8045, + "num_input_tokens_seen": 19136752, + "step": 32975 + }, + { + "epoch": 4.912123920166816, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02810406797986295, + "loss": 0.8046, + "num_input_tokens_seen": 19139440, + "step": 32980 + }, + { + "epoch": 4.912868632707775, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02810311909684806, + "loss": 0.8032, + "num_input_tokens_seen": 19142544, + "step": 32985 + }, + { + "epoch": 4.913613345248734, + "grad_norm": 0.0245361328125, + "learning_rate": 0.028102169992469294, + "loss": 0.7853, + "num_input_tokens_seen": 19145360, + "step": 32990 + }, + { + "epoch": 4.914358057789693, + "grad_norm": 0.0252685546875, + "learning_rate": 0.028101220666742678, + "loss": 0.8063, + "num_input_tokens_seen": 19149008, + "step": 32995 + }, + { + "epoch": 4.915102770330653, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02810027111968425, + "loss": 0.7978, + "num_input_tokens_seen": 19151920, + "step": 33000 + }, + { + "epoch": 4.915847482871611, + "grad_norm": 0.01434326171875, + "learning_rate": 0.028099321351310064, + "loss": 0.7905, + "num_input_tokens_seen": 19154864, + "step": 33005 + }, + { + "epoch": 4.916592195412571, + "grad_norm": 0.0230712890625, + "learning_rate": 0.028098371361636156, + "loss": 0.8088, + "num_input_tokens_seen": 19157744, + "step": 33010 + }, + { + "epoch": 4.91733690795353, + "grad_norm": 0.046630859375, + "learning_rate": 0.028097421150678575, + "loss": 0.8192, + "num_input_tokens_seen": 19160752, + "step": 33015 + }, + { + "epoch": 4.9180816204944895, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02809647071845337, + "loss": 0.8322, + "num_input_tokens_seen": 19163824, + "step": 33020 + }, + { + "epoch": 4.918826333035448, + "grad_norm": 0.018798828125, + "learning_rate": 0.028095520064976613, + "loss": 0.8019, + "num_input_tokens_seen": 19166672, + "step": 33025 + }, + { + "epoch": 4.919571045576408, + "grad_norm": 0.026611328125, + "learning_rate": 0.028094569190264346, + "loss": 0.828, + "num_input_tokens_seen": 19169584, + "step": 33030 + }, + { + "epoch": 4.920315758117367, + "grad_norm": 0.0185546875, + "learning_rate": 0.028093618094332647, + "loss": 0.7896, + "num_input_tokens_seen": 19172720, + "step": 33035 + }, + { + "epoch": 4.921060470658325, + "grad_norm": 0.020751953125, + "learning_rate": 0.028092666777197576, + "loss": 0.7961, + "num_input_tokens_seen": 19175568, + "step": 33040 + }, + { + "epoch": 4.921805183199285, + "grad_norm": 0.03125, + "learning_rate": 0.028091715238875204, + "loss": 0.8196, + "num_input_tokens_seen": 19178704, + "step": 33045 + }, + { + "epoch": 4.922549895740245, + "grad_norm": 0.011474609375, + "learning_rate": 0.028090763479381612, + "loss": 0.797, + "num_input_tokens_seen": 19181776, + "step": 33050 + }, + { + "epoch": 4.9232946082812035, + "grad_norm": 0.012451171875, + "learning_rate": 0.028089811498732874, + "loss": 0.787, + "num_input_tokens_seen": 19184592, + "step": 33055 + }, + { + "epoch": 4.924039320822162, + "grad_norm": 0.01226806640625, + "learning_rate": 0.028088859296945077, + "loss": 0.796, + "num_input_tokens_seen": 19187600, + "step": 33060 + }, + { + "epoch": 4.924784033363122, + "grad_norm": 0.0201416015625, + "learning_rate": 0.028087906874034303, + "loss": 0.8184, + "num_input_tokens_seen": 19190224, + "step": 33065 + }, + { + "epoch": 4.9255287459040815, + "grad_norm": 0.017822265625, + "learning_rate": 0.028086954230016642, + "loss": 0.788, + "num_input_tokens_seen": 19193392, + "step": 33070 + }, + { + "epoch": 4.92627345844504, + "grad_norm": 0.0302734375, + "learning_rate": 0.028086001364908197, + "loss": 0.7804, + "num_input_tokens_seen": 19196336, + "step": 33075 + }, + { + "epoch": 4.927018170985999, + "grad_norm": 0.0196533203125, + "learning_rate": 0.028085048278725055, + "loss": 0.7906, + "num_input_tokens_seen": 19199184, + "step": 33080 + }, + { + "epoch": 4.927762883526959, + "grad_norm": 0.013671875, + "learning_rate": 0.02808409497148332, + "loss": 0.7896, + "num_input_tokens_seen": 19202064, + "step": 33085 + }, + { + "epoch": 4.928507596067917, + "grad_norm": 0.01141357421875, + "learning_rate": 0.028083141443199098, + "loss": 0.7769, + "num_input_tokens_seen": 19205072, + "step": 33090 + }, + { + "epoch": 4.929252308608877, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0280821876938885, + "loss": 0.8146, + "num_input_tokens_seen": 19207792, + "step": 33095 + }, + { + "epoch": 4.929997021149836, + "grad_norm": 0.044677734375, + "learning_rate": 0.028081233723567637, + "loss": 0.8148, + "num_input_tokens_seen": 19210480, + "step": 33100 + }, + { + "epoch": 4.9307417336907955, + "grad_norm": 0.02197265625, + "learning_rate": 0.02808027953225263, + "loss": 0.807, + "num_input_tokens_seen": 19213360, + "step": 33105 + }, + { + "epoch": 4.931486446231754, + "grad_norm": 0.027587890625, + "learning_rate": 0.02807932511995959, + "loss": 0.7982, + "num_input_tokens_seen": 19216112, + "step": 33110 + }, + { + "epoch": 4.932231158772714, + "grad_norm": 0.020263671875, + "learning_rate": 0.028078370486704644, + "loss": 0.788, + "num_input_tokens_seen": 19219024, + "step": 33115 + }, + { + "epoch": 4.932975871313673, + "grad_norm": 0.044189453125, + "learning_rate": 0.028077415632503922, + "loss": 0.8194, + "num_input_tokens_seen": 19221680, + "step": 33120 + }, + { + "epoch": 4.933720583854632, + "grad_norm": 0.02001953125, + "learning_rate": 0.028076460557373555, + "loss": 0.7916, + "num_input_tokens_seen": 19224336, + "step": 33125 + }, + { + "epoch": 4.934465296395591, + "grad_norm": 0.029541015625, + "learning_rate": 0.028075505261329677, + "loss": 0.8203, + "num_input_tokens_seen": 19227120, + "step": 33130 + }, + { + "epoch": 4.935210008936551, + "grad_norm": 0.029296875, + "learning_rate": 0.028074549744388427, + "loss": 0.7985, + "num_input_tokens_seen": 19230128, + "step": 33135 + }, + { + "epoch": 4.9359547214775095, + "grad_norm": 0.018310546875, + "learning_rate": 0.028073594006565946, + "loss": 0.7838, + "num_input_tokens_seen": 19232944, + "step": 33140 + }, + { + "epoch": 4.936699434018469, + "grad_norm": 0.0286865234375, + "learning_rate": 0.028072638047878382, + "loss": 0.8176, + "num_input_tokens_seen": 19235664, + "step": 33145 + }, + { + "epoch": 4.937444146559428, + "grad_norm": 0.01953125, + "learning_rate": 0.02807168186834189, + "loss": 0.8101, + "num_input_tokens_seen": 19238608, + "step": 33150 + }, + { + "epoch": 4.9381888591003875, + "grad_norm": 0.0289306640625, + "learning_rate": 0.028070725467972613, + "loss": 0.7973, + "num_input_tokens_seen": 19241584, + "step": 33155 + }, + { + "epoch": 4.938933571641346, + "grad_norm": 0.02392578125, + "learning_rate": 0.028069768846786716, + "loss": 0.8143, + "num_input_tokens_seen": 19244496, + "step": 33160 + }, + { + "epoch": 4.939678284182306, + "grad_norm": 0.0223388671875, + "learning_rate": 0.028068812004800353, + "loss": 0.8304, + "num_input_tokens_seen": 19247664, + "step": 33165 + }, + { + "epoch": 4.940422996723265, + "grad_norm": 0.0294189453125, + "learning_rate": 0.028067854942029697, + "loss": 0.8133, + "num_input_tokens_seen": 19250608, + "step": 33170 + }, + { + "epoch": 4.941167709264224, + "grad_norm": 0.0185546875, + "learning_rate": 0.028066897658490915, + "loss": 0.797, + "num_input_tokens_seen": 19253776, + "step": 33175 + }, + { + "epoch": 4.941912421805183, + "grad_norm": 0.029296875, + "learning_rate": 0.028065940154200178, + "loss": 0.7896, + "num_input_tokens_seen": 19256400, + "step": 33180 + }, + { + "epoch": 4.942657134346143, + "grad_norm": 0.027099609375, + "learning_rate": 0.028064982429173657, + "loss": 0.7978, + "num_input_tokens_seen": 19259152, + "step": 33185 + }, + { + "epoch": 4.9434018468871015, + "grad_norm": 0.018310546875, + "learning_rate": 0.028064024483427537, + "loss": 0.7795, + "num_input_tokens_seen": 19261968, + "step": 33190 + }, + { + "epoch": 4.944146559428061, + "grad_norm": 0.026611328125, + "learning_rate": 0.028063066316978007, + "loss": 0.8044, + "num_input_tokens_seen": 19265136, + "step": 33195 + }, + { + "epoch": 4.94489127196902, + "grad_norm": 0.041259765625, + "learning_rate": 0.028062107929841245, + "loss": 0.8221, + "num_input_tokens_seen": 19268080, + "step": 33200 + }, + { + "epoch": 4.945635984509979, + "grad_norm": 0.0260009765625, + "learning_rate": 0.028061149322033446, + "loss": 0.8113, + "num_input_tokens_seen": 19270960, + "step": 33205 + }, + { + "epoch": 4.946380697050938, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0280601904935708, + "loss": 0.8036, + "num_input_tokens_seen": 19273616, + "step": 33210 + }, + { + "epoch": 4.947125409591898, + "grad_norm": 0.019287109375, + "learning_rate": 0.028059231444469517, + "loss": 0.79, + "num_input_tokens_seen": 19276400, + "step": 33215 + }, + { + "epoch": 4.947870122132857, + "grad_norm": 0.032958984375, + "learning_rate": 0.028058272174745783, + "loss": 0.8052, + "num_input_tokens_seen": 19279056, + "step": 33220 + }, + { + "epoch": 4.9486148346738155, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02805731268441582, + "loss": 0.7877, + "num_input_tokens_seen": 19281904, + "step": 33225 + }, + { + "epoch": 4.949359547214775, + "grad_norm": 0.01220703125, + "learning_rate": 0.028056352973495825, + "loss": 0.7759, + "num_input_tokens_seen": 19284784, + "step": 33230 + }, + { + "epoch": 4.950104259755735, + "grad_norm": 0.01141357421875, + "learning_rate": 0.028055393042002018, + "loss": 0.8038, + "num_input_tokens_seen": 19287952, + "step": 33235 + }, + { + "epoch": 4.9508489722966935, + "grad_norm": 0.0191650390625, + "learning_rate": 0.028054432889950614, + "loss": 0.7976, + "num_input_tokens_seen": 19290896, + "step": 33240 + }, + { + "epoch": 4.951593684837652, + "grad_norm": 0.014892578125, + "learning_rate": 0.028053472517357837, + "loss": 0.8234, + "num_input_tokens_seen": 19293648, + "step": 33245 + }, + { + "epoch": 4.952338397378612, + "grad_norm": 0.02001953125, + "learning_rate": 0.028052511924239902, + "loss": 0.7856, + "num_input_tokens_seen": 19296592, + "step": 33250 + }, + { + "epoch": 4.953083109919571, + "grad_norm": 0.025390625, + "learning_rate": 0.028051551110613047, + "loss": 0.7999, + "num_input_tokens_seen": 19299504, + "step": 33255 + }, + { + "epoch": 4.95382782246053, + "grad_norm": 0.0306396484375, + "learning_rate": 0.028050590076493503, + "loss": 0.7952, + "num_input_tokens_seen": 19302704, + "step": 33260 + }, + { + "epoch": 4.954572535001489, + "grad_norm": 0.0206298828125, + "learning_rate": 0.028049628821897505, + "loss": 0.8183, + "num_input_tokens_seen": 19305936, + "step": 33265 + }, + { + "epoch": 4.955317247542449, + "grad_norm": 0.0189208984375, + "learning_rate": 0.028048667346841288, + "loss": 0.8173, + "num_input_tokens_seen": 19308720, + "step": 33270 + }, + { + "epoch": 4.9560619600834075, + "grad_norm": 0.01300048828125, + "learning_rate": 0.028047705651341102, + "loss": 0.7924, + "num_input_tokens_seen": 19311440, + "step": 33275 + }, + { + "epoch": 4.956806672624367, + "grad_norm": 0.0220947265625, + "learning_rate": 0.028046743735413184, + "loss": 0.8261, + "num_input_tokens_seen": 19314768, + "step": 33280 + }, + { + "epoch": 4.957551385165326, + "grad_norm": 0.0286865234375, + "learning_rate": 0.028045781599073796, + "loss": 0.808, + "num_input_tokens_seen": 19317584, + "step": 33285 + }, + { + "epoch": 4.9582960977062855, + "grad_norm": 0.024169921875, + "learning_rate": 0.028044819242339182, + "loss": 0.8093, + "num_input_tokens_seen": 19320496, + "step": 33290 + }, + { + "epoch": 4.959040810247244, + "grad_norm": 0.0238037109375, + "learning_rate": 0.028043856665225613, + "loss": 0.799, + "num_input_tokens_seen": 19323760, + "step": 33295 + }, + { + "epoch": 4.959785522788204, + "grad_norm": 0.020263671875, + "learning_rate": 0.028042893867749333, + "loss": 0.8097, + "num_input_tokens_seen": 19326512, + "step": 33300 + }, + { + "epoch": 4.960530235329163, + "grad_norm": 0.0230712890625, + "learning_rate": 0.028041930849926625, + "loss": 0.8062, + "num_input_tokens_seen": 19329200, + "step": 33305 + }, + { + "epoch": 4.961274947870122, + "grad_norm": 0.020751953125, + "learning_rate": 0.028040967611773746, + "loss": 0.8096, + "num_input_tokens_seen": 19331920, + "step": 33310 + }, + { + "epoch": 4.962019660411081, + "grad_norm": 0.0264892578125, + "learning_rate": 0.02804000415330698, + "loss": 0.8074, + "num_input_tokens_seen": 19334608, + "step": 33315 + }, + { + "epoch": 4.962764372952041, + "grad_norm": 0.019775390625, + "learning_rate": 0.02803904047454259, + "loss": 0.7969, + "num_input_tokens_seen": 19337456, + "step": 33320 + }, + { + "epoch": 4.9635090854929995, + "grad_norm": 0.0279541015625, + "learning_rate": 0.028038076575496864, + "loss": 0.8008, + "num_input_tokens_seen": 19340528, + "step": 33325 + }, + { + "epoch": 4.964253798033959, + "grad_norm": 0.0205078125, + "learning_rate": 0.02803711245618609, + "loss": 0.7927, + "num_input_tokens_seen": 19343248, + "step": 33330 + }, + { + "epoch": 4.964998510574918, + "grad_norm": 0.01361083984375, + "learning_rate": 0.028036148116626546, + "loss": 0.8157, + "num_input_tokens_seen": 19345776, + "step": 33335 + }, + { + "epoch": 4.965743223115878, + "grad_norm": 0.01177978515625, + "learning_rate": 0.028035183556834535, + "loss": 0.8192, + "num_input_tokens_seen": 19348624, + "step": 33340 + }, + { + "epoch": 4.966487935656836, + "grad_norm": 0.019287109375, + "learning_rate": 0.028034218776826342, + "loss": 0.783, + "num_input_tokens_seen": 19351664, + "step": 33345 + }, + { + "epoch": 4.967232648197796, + "grad_norm": 0.01953125, + "learning_rate": 0.02803325377661827, + "loss": 0.8119, + "num_input_tokens_seen": 19354128, + "step": 33350 + }, + { + "epoch": 4.967977360738755, + "grad_norm": 0.0245361328125, + "learning_rate": 0.028032288556226623, + "loss": 0.8249, + "num_input_tokens_seen": 19357008, + "step": 33355 + }, + { + "epoch": 4.968722073279714, + "grad_norm": 0.0126953125, + "learning_rate": 0.028031323115667705, + "loss": 0.7983, + "num_input_tokens_seen": 19359952, + "step": 33360 + }, + { + "epoch": 4.969466785820673, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02803035745495783, + "loss": 0.8037, + "num_input_tokens_seen": 19363088, + "step": 33365 + }, + { + "epoch": 4.970211498361633, + "grad_norm": 0.0240478515625, + "learning_rate": 0.02802939157411331, + "loss": 0.7887, + "num_input_tokens_seen": 19365840, + "step": 33370 + }, + { + "epoch": 4.9709562109025915, + "grad_norm": 0.0255126953125, + "learning_rate": 0.02802842547315046, + "loss": 0.8115, + "num_input_tokens_seen": 19368912, + "step": 33375 + }, + { + "epoch": 4.971700923443551, + "grad_norm": 0.01373291015625, + "learning_rate": 0.028027459152085602, + "loss": 0.8273, + "num_input_tokens_seen": 19371792, + "step": 33380 + }, + { + "epoch": 4.97244563598451, + "grad_norm": 0.037109375, + "learning_rate": 0.02802649261093506, + "loss": 0.7886, + "num_input_tokens_seen": 19374704, + "step": 33385 + }, + { + "epoch": 4.973190348525469, + "grad_norm": 0.01177978515625, + "learning_rate": 0.02802552584971517, + "loss": 0.8136, + "num_input_tokens_seen": 19377456, + "step": 33390 + }, + { + "epoch": 4.973935061066428, + "grad_norm": 0.0291748046875, + "learning_rate": 0.028024558868442257, + "loss": 0.8006, + "num_input_tokens_seen": 19380496, + "step": 33395 + }, + { + "epoch": 4.974679773607388, + "grad_norm": 0.01312255859375, + "learning_rate": 0.02802359166713266, + "loss": 0.8157, + "num_input_tokens_seen": 19383504, + "step": 33400 + }, + { + "epoch": 4.975424486148347, + "grad_norm": 0.01348876953125, + "learning_rate": 0.02802262424580272, + "loss": 0.8142, + "num_input_tokens_seen": 19386288, + "step": 33405 + }, + { + "epoch": 4.9761691986893055, + "grad_norm": 0.0213623046875, + "learning_rate": 0.028021656604468773, + "loss": 0.7967, + "num_input_tokens_seen": 19388912, + "step": 33410 + }, + { + "epoch": 4.976913911230265, + "grad_norm": 0.0230712890625, + "learning_rate": 0.028020688743147178, + "loss": 0.8071, + "num_input_tokens_seen": 19392368, + "step": 33415 + }, + { + "epoch": 4.977658623771224, + "grad_norm": 0.0211181640625, + "learning_rate": 0.02801972066185428, + "loss": 0.8058, + "num_input_tokens_seen": 19395184, + "step": 33420 + }, + { + "epoch": 4.978403336312184, + "grad_norm": 0.0267333984375, + "learning_rate": 0.02801875236060643, + "loss": 0.7896, + "num_input_tokens_seen": 19398032, + "step": 33425 + }, + { + "epoch": 4.979148048853142, + "grad_norm": 0.0198974609375, + "learning_rate": 0.028017783839419996, + "loss": 0.8035, + "num_input_tokens_seen": 19400976, + "step": 33430 + }, + { + "epoch": 4.979892761394102, + "grad_norm": 0.025146484375, + "learning_rate": 0.028016815098311332, + "loss": 0.8204, + "num_input_tokens_seen": 19403760, + "step": 33435 + }, + { + "epoch": 4.980637473935061, + "grad_norm": 0.0279541015625, + "learning_rate": 0.028015846137296806, + "loss": 0.8054, + "num_input_tokens_seen": 19406640, + "step": 33440 + }, + { + "epoch": 4.98138218647602, + "grad_norm": 0.034912109375, + "learning_rate": 0.02801487695639279, + "loss": 0.7935, + "num_input_tokens_seen": 19410032, + "step": 33445 + }, + { + "epoch": 4.982126899016979, + "grad_norm": 0.0186767578125, + "learning_rate": 0.028013907555615655, + "loss": 0.8087, + "num_input_tokens_seen": 19412752, + "step": 33450 + }, + { + "epoch": 4.982871611557939, + "grad_norm": 0.0234375, + "learning_rate": 0.028012937934981778, + "loss": 0.8043, + "num_input_tokens_seen": 19415728, + "step": 33455 + }, + { + "epoch": 4.9836163240988975, + "grad_norm": 0.027099609375, + "learning_rate": 0.028011968094507535, + "loss": 0.8085, + "num_input_tokens_seen": 19418992, + "step": 33460 + }, + { + "epoch": 4.984361036639857, + "grad_norm": 0.027587890625, + "learning_rate": 0.028010998034209325, + "loss": 0.8268, + "num_input_tokens_seen": 19422032, + "step": 33465 + }, + { + "epoch": 4.985105749180816, + "grad_norm": 0.026611328125, + "learning_rate": 0.028010027754103523, + "loss": 0.7991, + "num_input_tokens_seen": 19425328, + "step": 33470 + }, + { + "epoch": 4.985850461721776, + "grad_norm": 0.0250244140625, + "learning_rate": 0.028009057254206524, + "loss": 0.7892, + "num_input_tokens_seen": 19428304, + "step": 33475 + }, + { + "epoch": 4.986595174262734, + "grad_norm": 0.0206298828125, + "learning_rate": 0.028008086534534727, + "loss": 0.8062, + "num_input_tokens_seen": 19431312, + "step": 33480 + }, + { + "epoch": 4.987339886803694, + "grad_norm": 0.01177978515625, + "learning_rate": 0.028007115595104527, + "loss": 0.8032, + "num_input_tokens_seen": 19433968, + "step": 33485 + }, + { + "epoch": 4.988084599344653, + "grad_norm": 0.021484375, + "learning_rate": 0.02800614443593233, + "loss": 0.8002, + "num_input_tokens_seen": 19436528, + "step": 33490 + }, + { + "epoch": 4.988829311885612, + "grad_norm": 0.0137939453125, + "learning_rate": 0.02800517305703454, + "loss": 0.7929, + "num_input_tokens_seen": 19439248, + "step": 33495 + }, + { + "epoch": 4.989574024426571, + "grad_norm": 0.0194091796875, + "learning_rate": 0.028004201458427573, + "loss": 0.8063, + "num_input_tokens_seen": 19442096, + "step": 33500 + }, + { + "epoch": 4.990318736967531, + "grad_norm": 0.0184326171875, + "learning_rate": 0.028003229640127834, + "loss": 0.8126, + "num_input_tokens_seen": 19445136, + "step": 33505 + }, + { + "epoch": 4.99106344950849, + "grad_norm": 0.02734375, + "learning_rate": 0.028002257602151748, + "loss": 0.8089, + "num_input_tokens_seen": 19448112, + "step": 33510 + }, + { + "epoch": 4.991808162049449, + "grad_norm": 0.0264892578125, + "learning_rate": 0.028001285344515736, + "loss": 0.8148, + "num_input_tokens_seen": 19451120, + "step": 33515 + }, + { + "epoch": 4.992552874590408, + "grad_norm": 0.01904296875, + "learning_rate": 0.028000312867236216, + "loss": 0.8033, + "num_input_tokens_seen": 19454000, + "step": 33520 + }, + { + "epoch": 4.993297587131368, + "grad_norm": 0.01953125, + "learning_rate": 0.02799934017032963, + "loss": 0.8052, + "num_input_tokens_seen": 19456848, + "step": 33525 + }, + { + "epoch": 4.994042299672326, + "grad_norm": 0.032470703125, + "learning_rate": 0.027998367253812405, + "loss": 0.8031, + "num_input_tokens_seen": 19459536, + "step": 33530 + }, + { + "epoch": 4.994787012213286, + "grad_norm": 0.0257568359375, + "learning_rate": 0.027997394117700972, + "loss": 0.8093, + "num_input_tokens_seen": 19462384, + "step": 33535 + }, + { + "epoch": 4.995531724754245, + "grad_norm": 0.0213623046875, + "learning_rate": 0.027996420762011774, + "loss": 0.8, + "num_input_tokens_seen": 19465296, + "step": 33540 + }, + { + "epoch": 4.996276437295204, + "grad_norm": 0.0184326171875, + "learning_rate": 0.027995447186761262, + "loss": 0.7924, + "num_input_tokens_seen": 19468336, + "step": 33545 + }, + { + "epoch": 4.997021149836163, + "grad_norm": 0.02197265625, + "learning_rate": 0.027994473391965875, + "loss": 0.8032, + "num_input_tokens_seen": 19471216, + "step": 33550 + }, + { + "epoch": 4.997765862377122, + "grad_norm": 0.0211181640625, + "learning_rate": 0.027993499377642065, + "loss": 0.8091, + "num_input_tokens_seen": 19474160, + "step": 33555 + }, + { + "epoch": 4.998510574918082, + "grad_norm": 0.011962890625, + "learning_rate": 0.027992525143806287, + "loss": 0.8196, + "num_input_tokens_seen": 19476912, + "step": 33560 + }, + { + "epoch": 4.999255287459041, + "grad_norm": 0.019287109375, + "learning_rate": 0.027991550690475007, + "loss": 0.8019, + "num_input_tokens_seen": 19479824, + "step": 33565 + }, + { + "epoch": 5.0, + "grad_norm": 0.036376953125, + "learning_rate": 0.027990576017664682, + "loss": 0.8031, + "num_input_tokens_seen": 19482128, + "step": 33570 + }, + { + "epoch": 5.0, + "eval_loss": 0.8017240166664124, + "eval_runtime": 70.6032, + "eval_samples_per_second": 42.264, + "eval_steps_per_second": 10.566, + "num_input_tokens_seen": 19482128, + "step": 33570 + }, + { + "epoch": 5.000744712540959, + "grad_norm": 0.019287109375, + "learning_rate": 0.027989601125391773, + "loss": 0.7931, + "num_input_tokens_seen": 19484976, + "step": 33575 + }, + { + "epoch": 5.001489425081918, + "grad_norm": 0.024658203125, + "learning_rate": 0.02798862601367276, + "loss": 0.7849, + "num_input_tokens_seen": 19487792, + "step": 33580 + }, + { + "epoch": 5.002234137622877, + "grad_norm": 0.021728515625, + "learning_rate": 0.02798765068252411, + "loss": 0.8129, + "num_input_tokens_seen": 19490448, + "step": 33585 + }, + { + "epoch": 5.002978850163837, + "grad_norm": 0.0174560546875, + "learning_rate": 0.027986675131962305, + "loss": 0.8053, + "num_input_tokens_seen": 19493136, + "step": 33590 + }, + { + "epoch": 5.003723562704796, + "grad_norm": 0.01348876953125, + "learning_rate": 0.027985699362003815, + "loss": 0.7911, + "num_input_tokens_seen": 19496144, + "step": 33595 + }, + { + "epoch": 5.004468275245755, + "grad_norm": 0.0198974609375, + "learning_rate": 0.027984723372665138, + "loss": 0.8019, + "num_input_tokens_seen": 19499056, + "step": 33600 + }, + { + "epoch": 5.005212987786714, + "grad_norm": 0.018798828125, + "learning_rate": 0.027983747163962753, + "loss": 0.8098, + "num_input_tokens_seen": 19502064, + "step": 33605 + }, + { + "epoch": 5.005957700327674, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02798277073591316, + "loss": 0.8001, + "num_input_tokens_seen": 19504752, + "step": 33610 + }, + { + "epoch": 5.006702412868632, + "grad_norm": 0.0186767578125, + "learning_rate": 0.027981794088532852, + "loss": 0.8156, + "num_input_tokens_seen": 19507696, + "step": 33615 + }, + { + "epoch": 5.007447125409592, + "grad_norm": 0.0186767578125, + "learning_rate": 0.027980817221838324, + "loss": 0.7885, + "num_input_tokens_seen": 19510672, + "step": 33620 + }, + { + "epoch": 5.008191837950551, + "grad_norm": 0.0230712890625, + "learning_rate": 0.02797984013584608, + "loss": 0.7814, + "num_input_tokens_seen": 19513424, + "step": 33625 + }, + { + "epoch": 5.00893655049151, + "grad_norm": 0.0208740234375, + "learning_rate": 0.02797886283057263, + "loss": 0.813, + "num_input_tokens_seen": 19516240, + "step": 33630 + }, + { + "epoch": 5.009681263032469, + "grad_norm": 0.022216796875, + "learning_rate": 0.027977885306034483, + "loss": 0.8179, + "num_input_tokens_seen": 19518896, + "step": 33635 + }, + { + "epoch": 5.010425975573429, + "grad_norm": 0.01904296875, + "learning_rate": 0.027976907562248157, + "loss": 0.7961, + "num_input_tokens_seen": 19521584, + "step": 33640 + }, + { + "epoch": 5.011170688114388, + "grad_norm": 0.042724609375, + "learning_rate": 0.02797592959923017, + "loss": 0.8232, + "num_input_tokens_seen": 19524880, + "step": 33645 + }, + { + "epoch": 5.011915400655347, + "grad_norm": 0.0250244140625, + "learning_rate": 0.027974951416997035, + "loss": 0.7902, + "num_input_tokens_seen": 19527536, + "step": 33650 + }, + { + "epoch": 5.012660113196306, + "grad_norm": 0.019775390625, + "learning_rate": 0.02797397301556528, + "loss": 0.8124, + "num_input_tokens_seen": 19530128, + "step": 33655 + }, + { + "epoch": 5.013404825737266, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02797299439495144, + "loss": 0.804, + "num_input_tokens_seen": 19533328, + "step": 33660 + }, + { + "epoch": 5.014149538278224, + "grad_norm": 0.01263427734375, + "learning_rate": 0.027972015555172047, + "loss": 0.8053, + "num_input_tokens_seen": 19536272, + "step": 33665 + }, + { + "epoch": 5.014894250819184, + "grad_norm": 0.021484375, + "learning_rate": 0.027971036496243638, + "loss": 0.7858, + "num_input_tokens_seen": 19539184, + "step": 33670 + }, + { + "epoch": 5.015638963360143, + "grad_norm": 0.02001953125, + "learning_rate": 0.027970057218182742, + "loss": 0.801, + "num_input_tokens_seen": 19541904, + "step": 33675 + }, + { + "epoch": 5.0163836759011025, + "grad_norm": 0.026611328125, + "learning_rate": 0.027969077721005917, + "loss": 0.8027, + "num_input_tokens_seen": 19545008, + "step": 33680 + }, + { + "epoch": 5.017128388442061, + "grad_norm": 0.0213623046875, + "learning_rate": 0.027968098004729703, + "loss": 0.7936, + "num_input_tokens_seen": 19547600, + "step": 33685 + }, + { + "epoch": 5.017873100983021, + "grad_norm": 0.0191650390625, + "learning_rate": 0.027967118069370658, + "loss": 0.8085, + "num_input_tokens_seen": 19550640, + "step": 33690 + }, + { + "epoch": 5.01861781352398, + "grad_norm": 0.020751953125, + "learning_rate": 0.027966137914945326, + "loss": 0.7859, + "num_input_tokens_seen": 19553744, + "step": 33695 + }, + { + "epoch": 5.019362526064939, + "grad_norm": 0.02685546875, + "learning_rate": 0.027965157541470276, + "loss": 0.8065, + "num_input_tokens_seen": 19556784, + "step": 33700 + }, + { + "epoch": 5.020107238605898, + "grad_norm": 0.02001953125, + "learning_rate": 0.027964176948962065, + "loss": 0.7913, + "num_input_tokens_seen": 19559792, + "step": 33705 + }, + { + "epoch": 5.020851951146858, + "grad_norm": 0.0179443359375, + "learning_rate": 0.027963196137437262, + "loss": 0.7921, + "num_input_tokens_seen": 19562896, + "step": 33710 + }, + { + "epoch": 5.021596663687816, + "grad_norm": 0.020751953125, + "learning_rate": 0.027962215106912436, + "loss": 0.7724, + "num_input_tokens_seen": 19565488, + "step": 33715 + }, + { + "epoch": 5.022341376228776, + "grad_norm": 0.03466796875, + "learning_rate": 0.02796123385740416, + "loss": 0.7951, + "num_input_tokens_seen": 19568176, + "step": 33720 + }, + { + "epoch": 5.023086088769735, + "grad_norm": 0.035888671875, + "learning_rate": 0.02796025238892901, + "loss": 0.8217, + "num_input_tokens_seen": 19570960, + "step": 33725 + }, + { + "epoch": 5.0238308013106945, + "grad_norm": 0.0162353515625, + "learning_rate": 0.02795927070150357, + "loss": 0.7933, + "num_input_tokens_seen": 19574320, + "step": 33730 + }, + { + "epoch": 5.024575513851653, + "grad_norm": 0.0225830078125, + "learning_rate": 0.027958288795144417, + "loss": 0.8, + "num_input_tokens_seen": 19577040, + "step": 33735 + }, + { + "epoch": 5.025320226392613, + "grad_norm": 0.0224609375, + "learning_rate": 0.027957306669868152, + "loss": 0.8001, + "num_input_tokens_seen": 19580080, + "step": 33740 + }, + { + "epoch": 5.026064938933572, + "grad_norm": 0.01312255859375, + "learning_rate": 0.027956324325691354, + "loss": 0.8345, + "num_input_tokens_seen": 19582960, + "step": 33745 + }, + { + "epoch": 5.02680965147453, + "grad_norm": 0.0120849609375, + "learning_rate": 0.027955341762630627, + "loss": 0.8043, + "num_input_tokens_seen": 19585552, + "step": 33750 + }, + { + "epoch": 5.02755436401549, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02795435898070257, + "loss": 0.8001, + "num_input_tokens_seen": 19588560, + "step": 33755 + }, + { + "epoch": 5.028299076556449, + "grad_norm": 0.0194091796875, + "learning_rate": 0.027953375979923784, + "loss": 0.812, + "num_input_tokens_seen": 19591248, + "step": 33760 + }, + { + "epoch": 5.0290437890974085, + "grad_norm": 0.0206298828125, + "learning_rate": 0.027952392760310873, + "loss": 0.8163, + "num_input_tokens_seen": 19594160, + "step": 33765 + }, + { + "epoch": 5.029788501638367, + "grad_norm": 0.027099609375, + "learning_rate": 0.027951409321880456, + "loss": 0.7814, + "num_input_tokens_seen": 19596912, + "step": 33770 + }, + { + "epoch": 5.030533214179327, + "grad_norm": 0.020263671875, + "learning_rate": 0.027950425664649132, + "loss": 0.7828, + "num_input_tokens_seen": 19599664, + "step": 33775 + }, + { + "epoch": 5.031277926720286, + "grad_norm": 0.0225830078125, + "learning_rate": 0.02794944178863354, + "loss": 0.8075, + "num_input_tokens_seen": 19602416, + "step": 33780 + }, + { + "epoch": 5.032022639261245, + "grad_norm": 0.0191650390625, + "learning_rate": 0.027948457693850286, + "loss": 0.8106, + "num_input_tokens_seen": 19605360, + "step": 33785 + }, + { + "epoch": 5.032767351802204, + "grad_norm": 0.03271484375, + "learning_rate": 0.027947473380315998, + "loss": 0.8008, + "num_input_tokens_seen": 19608304, + "step": 33790 + }, + { + "epoch": 5.033512064343164, + "grad_norm": 0.0211181640625, + "learning_rate": 0.027946488848047304, + "loss": 0.7911, + "num_input_tokens_seen": 19611184, + "step": 33795 + }, + { + "epoch": 5.034256776884122, + "grad_norm": 0.018798828125, + "learning_rate": 0.02794550409706084, + "loss": 0.7841, + "num_input_tokens_seen": 19614224, + "step": 33800 + }, + { + "epoch": 5.035001489425082, + "grad_norm": 0.0240478515625, + "learning_rate": 0.02794451912737325, + "loss": 0.8261, + "num_input_tokens_seen": 19617424, + "step": 33805 + }, + { + "epoch": 5.035746201966041, + "grad_norm": 0.0269775390625, + "learning_rate": 0.027943533939001154, + "loss": 0.8123, + "num_input_tokens_seen": 19620016, + "step": 33810 + }, + { + "epoch": 5.0364909145070005, + "grad_norm": 0.02294921875, + "learning_rate": 0.027942548531961216, + "loss": 0.7998, + "num_input_tokens_seen": 19622992, + "step": 33815 + }, + { + "epoch": 5.037235627047959, + "grad_norm": 0.01953125, + "learning_rate": 0.02794156290627007, + "loss": 0.7908, + "num_input_tokens_seen": 19625872, + "step": 33820 + }, + { + "epoch": 5.037980339588919, + "grad_norm": 0.01953125, + "learning_rate": 0.027940577061944374, + "loss": 0.812, + "num_input_tokens_seen": 19628784, + "step": 33825 + }, + { + "epoch": 5.038725052129878, + "grad_norm": 0.0191650390625, + "learning_rate": 0.027939590999000778, + "loss": 0.8011, + "num_input_tokens_seen": 19631600, + "step": 33830 + }, + { + "epoch": 5.039469764670837, + "grad_norm": 0.018798828125, + "learning_rate": 0.027938604717455948, + "loss": 0.8197, + "num_input_tokens_seen": 19634576, + "step": 33835 + }, + { + "epoch": 5.040214477211796, + "grad_norm": 0.0189208984375, + "learning_rate": 0.027937618217326536, + "loss": 0.7883, + "num_input_tokens_seen": 19637648, + "step": 33840 + }, + { + "epoch": 5.040959189752756, + "grad_norm": 0.0206298828125, + "learning_rate": 0.027936631498629214, + "loss": 0.7914, + "num_input_tokens_seen": 19640464, + "step": 33845 + }, + { + "epoch": 5.0417039022937145, + "grad_norm": 0.0272216796875, + "learning_rate": 0.027935644561380656, + "loss": 0.8413, + "num_input_tokens_seen": 19643056, + "step": 33850 + }, + { + "epoch": 5.042448614834674, + "grad_norm": 0.01361083984375, + "learning_rate": 0.027934657405597526, + "loss": 0.8116, + "num_input_tokens_seen": 19645936, + "step": 33855 + }, + { + "epoch": 5.043193327375633, + "grad_norm": 0.0255126953125, + "learning_rate": 0.027933670031296508, + "loss": 0.804, + "num_input_tokens_seen": 19648976, + "step": 33860 + }, + { + "epoch": 5.0439380399165925, + "grad_norm": 0.0137939453125, + "learning_rate": 0.027932682438494283, + "loss": 0.8072, + "num_input_tokens_seen": 19651984, + "step": 33865 + }, + { + "epoch": 5.044682752457551, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02793169462720753, + "loss": 0.8135, + "num_input_tokens_seen": 19654864, + "step": 33870 + }, + { + "epoch": 5.045427464998511, + "grad_norm": 0.026611328125, + "learning_rate": 0.027930706597452937, + "loss": 0.8117, + "num_input_tokens_seen": 19657744, + "step": 33875 + }, + { + "epoch": 5.04617217753947, + "grad_norm": 0.0206298828125, + "learning_rate": 0.027929718349247193, + "loss": 0.7927, + "num_input_tokens_seen": 19660560, + "step": 33880 + }, + { + "epoch": 5.046916890080429, + "grad_norm": 0.0123291015625, + "learning_rate": 0.027928729882607008, + "loss": 0.7796, + "num_input_tokens_seen": 19663152, + "step": 33885 + }, + { + "epoch": 5.047661602621388, + "grad_norm": 0.02734375, + "learning_rate": 0.027927741197549066, + "loss": 0.8082, + "num_input_tokens_seen": 19665904, + "step": 33890 + }, + { + "epoch": 5.048406315162348, + "grad_norm": 0.031005859375, + "learning_rate": 0.02792675229409008, + "loss": 0.8117, + "num_input_tokens_seen": 19668624, + "step": 33895 + }, + { + "epoch": 5.0491510277033065, + "grad_norm": 0.023193359375, + "learning_rate": 0.027925763172246754, + "loss": 0.7907, + "num_input_tokens_seen": 19671504, + "step": 33900 + }, + { + "epoch": 5.049895740244266, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02792477383203579, + "loss": 0.8283, + "num_input_tokens_seen": 19674416, + "step": 33905 + }, + { + "epoch": 5.050640452785225, + "grad_norm": 0.0242919921875, + "learning_rate": 0.02792378427347391, + "loss": 0.8055, + "num_input_tokens_seen": 19677328, + "step": 33910 + }, + { + "epoch": 5.0513851653261845, + "grad_norm": 0.0211181640625, + "learning_rate": 0.02792279449657783, + "loss": 0.8148, + "num_input_tokens_seen": 19680240, + "step": 33915 + }, + { + "epoch": 5.052129877867143, + "grad_norm": 0.01287841796875, + "learning_rate": 0.027921804501364274, + "loss": 0.8257, + "num_input_tokens_seen": 19683344, + "step": 33920 + }, + { + "epoch": 5.052874590408102, + "grad_norm": 0.0115966796875, + "learning_rate": 0.02792081428784996, + "loss": 0.803, + "num_input_tokens_seen": 19686576, + "step": 33925 + }, + { + "epoch": 5.053619302949062, + "grad_norm": 0.033203125, + "learning_rate": 0.027919823856051622, + "loss": 0.7834, + "num_input_tokens_seen": 19689616, + "step": 33930 + }, + { + "epoch": 5.0543640154900205, + "grad_norm": 0.013671875, + "learning_rate": 0.027918833205985996, + "loss": 0.8148, + "num_input_tokens_seen": 19692432, + "step": 33935 + }, + { + "epoch": 5.05510872803098, + "grad_norm": 0.020263671875, + "learning_rate": 0.027917842337669804, + "loss": 0.7892, + "num_input_tokens_seen": 19695408, + "step": 33940 + }, + { + "epoch": 5.055853440571939, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0279168512511198, + "loss": 0.8113, + "num_input_tokens_seen": 19698352, + "step": 33945 + }, + { + "epoch": 5.0565981531128985, + "grad_norm": 0.0191650390625, + "learning_rate": 0.027915859946352723, + "loss": 0.8033, + "num_input_tokens_seen": 19701712, + "step": 33950 + }, + { + "epoch": 5.057342865653857, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02791486842338532, + "loss": 0.7891, + "num_input_tokens_seen": 19704592, + "step": 33955 + }, + { + "epoch": 5.058087578194817, + "grad_norm": 0.0191650390625, + "learning_rate": 0.027913876682234336, + "loss": 0.8102, + "num_input_tokens_seen": 19707664, + "step": 33960 + }, + { + "epoch": 5.058832290735776, + "grad_norm": 0.0196533203125, + "learning_rate": 0.027912884722916538, + "loss": 0.8213, + "num_input_tokens_seen": 19710704, + "step": 33965 + }, + { + "epoch": 5.059577003276735, + "grad_norm": 0.0198974609375, + "learning_rate": 0.027911892545448673, + "loss": 0.7907, + "num_input_tokens_seen": 19713520, + "step": 33970 + }, + { + "epoch": 5.060321715817694, + "grad_norm": 0.0185546875, + "learning_rate": 0.0279109001498475, + "loss": 0.7813, + "num_input_tokens_seen": 19716560, + "step": 33975 + }, + { + "epoch": 5.061066428358654, + "grad_norm": 0.032958984375, + "learning_rate": 0.0279099075361298, + "loss": 0.821, + "num_input_tokens_seen": 19719504, + "step": 33980 + }, + { + "epoch": 5.0618111408996125, + "grad_norm": 0.0341796875, + "learning_rate": 0.02790891470431233, + "loss": 0.8386, + "num_input_tokens_seen": 19722352, + "step": 33985 + }, + { + "epoch": 5.062555853440572, + "grad_norm": 0.018798828125, + "learning_rate": 0.027907921654411867, + "loss": 0.7928, + "num_input_tokens_seen": 19725104, + "step": 33990 + }, + { + "epoch": 5.063300565981531, + "grad_norm": 0.01312255859375, + "learning_rate": 0.027906928386445186, + "loss": 0.7779, + "num_input_tokens_seen": 19727952, + "step": 33995 + }, + { + "epoch": 5.0640452785224905, + "grad_norm": 0.03515625, + "learning_rate": 0.02790593490042907, + "loss": 0.7963, + "num_input_tokens_seen": 19730992, + "step": 34000 + }, + { + "epoch": 5.064789991063449, + "grad_norm": 0.0137939453125, + "learning_rate": 0.027904941196380297, + "loss": 0.8033, + "num_input_tokens_seen": 19734288, + "step": 34005 + }, + { + "epoch": 5.065534703604409, + "grad_norm": 0.0264892578125, + "learning_rate": 0.02790394727431566, + "loss": 0.7951, + "num_input_tokens_seen": 19737264, + "step": 34010 + }, + { + "epoch": 5.066279416145368, + "grad_norm": 0.02294921875, + "learning_rate": 0.027902953134251948, + "loss": 0.8253, + "num_input_tokens_seen": 19740048, + "step": 34015 + }, + { + "epoch": 5.067024128686327, + "grad_norm": 0.0264892578125, + "learning_rate": 0.027901958776205958, + "loss": 0.8234, + "num_input_tokens_seen": 19742736, + "step": 34020 + }, + { + "epoch": 5.067768841227286, + "grad_norm": 0.0185546875, + "learning_rate": 0.02790096420019449, + "loss": 0.806, + "num_input_tokens_seen": 19745424, + "step": 34025 + }, + { + "epoch": 5.068513553768246, + "grad_norm": 0.017822265625, + "learning_rate": 0.027899969406234338, + "loss": 0.7949, + "num_input_tokens_seen": 19748176, + "step": 34030 + }, + { + "epoch": 5.0692582663092045, + "grad_norm": 0.01226806640625, + "learning_rate": 0.02789897439434232, + "loss": 0.7925, + "num_input_tokens_seen": 19750928, + "step": 34035 + }, + { + "epoch": 5.070002978850164, + "grad_norm": 0.032470703125, + "learning_rate": 0.027897979164535236, + "loss": 0.8217, + "num_input_tokens_seen": 19754000, + "step": 34040 + }, + { + "epoch": 5.070747691391123, + "grad_norm": 0.0289306640625, + "learning_rate": 0.027896983716829905, + "loss": 0.8315, + "num_input_tokens_seen": 19756816, + "step": 34045 + }, + { + "epoch": 5.071492403932083, + "grad_norm": 0.01953125, + "learning_rate": 0.027895988051243138, + "loss": 0.8, + "num_input_tokens_seen": 19759888, + "step": 34050 + }, + { + "epoch": 5.072237116473041, + "grad_norm": 0.01129150390625, + "learning_rate": 0.027894992167791764, + "loss": 0.8027, + "num_input_tokens_seen": 19762608, + "step": 34055 + }, + { + "epoch": 5.072981829014001, + "grad_norm": 0.024658203125, + "learning_rate": 0.027893996066492597, + "loss": 0.8062, + "num_input_tokens_seen": 19765392, + "step": 34060 + }, + { + "epoch": 5.07372654155496, + "grad_norm": 0.0208740234375, + "learning_rate": 0.02789299974736248, + "loss": 0.8053, + "num_input_tokens_seen": 19768336, + "step": 34065 + }, + { + "epoch": 5.074471254095919, + "grad_norm": 0.0184326171875, + "learning_rate": 0.02789200321041823, + "loss": 0.8144, + "num_input_tokens_seen": 19771152, + "step": 34070 + }, + { + "epoch": 5.075215966636878, + "grad_norm": 0.01953125, + "learning_rate": 0.02789100645567669, + "loss": 0.7894, + "num_input_tokens_seen": 19773712, + "step": 34075 + }, + { + "epoch": 5.075960679177838, + "grad_norm": 0.018310546875, + "learning_rate": 0.0278900094831547, + "loss": 0.7844, + "num_input_tokens_seen": 19776784, + "step": 34080 + }, + { + "epoch": 5.0767053917187965, + "grad_norm": 0.0191650390625, + "learning_rate": 0.027889012292869098, + "loss": 0.8, + "num_input_tokens_seen": 19779568, + "step": 34085 + }, + { + "epoch": 5.077450104259755, + "grad_norm": 0.0189208984375, + "learning_rate": 0.027888014884836734, + "loss": 0.7854, + "num_input_tokens_seen": 19782256, + "step": 34090 + }, + { + "epoch": 5.078194816800715, + "grad_norm": 0.028564453125, + "learning_rate": 0.02788701725907446, + "loss": 0.7847, + "num_input_tokens_seen": 19784976, + "step": 34095 + }, + { + "epoch": 5.078939529341674, + "grad_norm": 0.0186767578125, + "learning_rate": 0.02788601941559912, + "loss": 0.8148, + "num_input_tokens_seen": 19787824, + "step": 34100 + }, + { + "epoch": 5.079684241882633, + "grad_norm": 0.031494140625, + "learning_rate": 0.027885021354427584, + "loss": 0.8061, + "num_input_tokens_seen": 19790640, + "step": 34105 + }, + { + "epoch": 5.080428954423592, + "grad_norm": 0.0186767578125, + "learning_rate": 0.02788402307557671, + "loss": 0.8355, + "num_input_tokens_seen": 19793552, + "step": 34110 + }, + { + "epoch": 5.081173666964552, + "grad_norm": 0.0130615234375, + "learning_rate": 0.027883024579063356, + "loss": 0.7816, + "num_input_tokens_seen": 19796528, + "step": 34115 + }, + { + "epoch": 5.0819183795055105, + "grad_norm": 0.01055908203125, + "learning_rate": 0.027882025864904396, + "loss": 0.808, + "num_input_tokens_seen": 19799216, + "step": 34120 + }, + { + "epoch": 5.08266309204647, + "grad_norm": 0.02587890625, + "learning_rate": 0.0278810269331167, + "loss": 0.8138, + "num_input_tokens_seen": 19801904, + "step": 34125 + }, + { + "epoch": 5.083407804587429, + "grad_norm": 0.0240478515625, + "learning_rate": 0.027880027783717152, + "loss": 0.8011, + "num_input_tokens_seen": 19804784, + "step": 34130 + }, + { + "epoch": 5.084152517128389, + "grad_norm": 0.01287841796875, + "learning_rate": 0.027879028416722625, + "loss": 0.7846, + "num_input_tokens_seen": 19807728, + "step": 34135 + }, + { + "epoch": 5.084897229669347, + "grad_norm": 0.02099609375, + "learning_rate": 0.02787802883215, + "loss": 0.7965, + "num_input_tokens_seen": 19810384, + "step": 34140 + }, + { + "epoch": 5.085641942210307, + "grad_norm": 0.01904296875, + "learning_rate": 0.027877029030016164, + "loss": 0.7893, + "num_input_tokens_seen": 19813232, + "step": 34145 + }, + { + "epoch": 5.086386654751266, + "grad_norm": 0.01483154296875, + "learning_rate": 0.027876029010338012, + "loss": 0.842, + "num_input_tokens_seen": 19815984, + "step": 34150 + }, + { + "epoch": 5.087131367292225, + "grad_norm": 0.0218505859375, + "learning_rate": 0.027875028773132436, + "loss": 0.8089, + "num_input_tokens_seen": 19818992, + "step": 34155 + }, + { + "epoch": 5.087876079833184, + "grad_norm": 0.03125, + "learning_rate": 0.027874028318416337, + "loss": 0.8161, + "num_input_tokens_seen": 19821968, + "step": 34160 + }, + { + "epoch": 5.088620792374144, + "grad_norm": 0.02685546875, + "learning_rate": 0.02787302764620661, + "loss": 0.8131, + "num_input_tokens_seen": 19825040, + "step": 34165 + }, + { + "epoch": 5.0893655049151025, + "grad_norm": 0.0262451171875, + "learning_rate": 0.02787202675652017, + "loss": 0.7983, + "num_input_tokens_seen": 19828080, + "step": 34170 + }, + { + "epoch": 5.090110217456062, + "grad_norm": 0.0172119140625, + "learning_rate": 0.02787102564937392, + "loss": 0.792, + "num_input_tokens_seen": 19830896, + "step": 34175 + }, + { + "epoch": 5.090854929997021, + "grad_norm": 0.019287109375, + "learning_rate": 0.027870024324784765, + "loss": 0.8063, + "num_input_tokens_seen": 19833712, + "step": 34180 + }, + { + "epoch": 5.091599642537981, + "grad_norm": 0.026123046875, + "learning_rate": 0.027869022782769638, + "loss": 0.8042, + "num_input_tokens_seen": 19836592, + "step": 34185 + }, + { + "epoch": 5.092344355078939, + "grad_norm": 0.01385498046875, + "learning_rate": 0.027868021023345447, + "loss": 0.8114, + "num_input_tokens_seen": 19839856, + "step": 34190 + }, + { + "epoch": 5.093089067619899, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02786701904652912, + "loss": 0.7958, + "num_input_tokens_seen": 19842832, + "step": 34195 + }, + { + "epoch": 5.093833780160858, + "grad_norm": 0.0301513671875, + "learning_rate": 0.027866016852337586, + "loss": 0.8024, + "num_input_tokens_seen": 19845616, + "step": 34200 + }, + { + "epoch": 5.094578492701817, + "grad_norm": 0.031494140625, + "learning_rate": 0.02786501444078777, + "loss": 0.812, + "num_input_tokens_seen": 19848464, + "step": 34205 + }, + { + "epoch": 5.095323205242776, + "grad_norm": 0.0179443359375, + "learning_rate": 0.027864011811896613, + "loss": 0.7853, + "num_input_tokens_seen": 19851344, + "step": 34210 + }, + { + "epoch": 5.096067917783736, + "grad_norm": 0.020263671875, + "learning_rate": 0.027863008965681047, + "loss": 0.802, + "num_input_tokens_seen": 19854512, + "step": 34215 + }, + { + "epoch": 5.096812630324695, + "grad_norm": 0.011962890625, + "learning_rate": 0.027862005902158024, + "loss": 0.8045, + "num_input_tokens_seen": 19857424, + "step": 34220 + }, + { + "epoch": 5.097557342865654, + "grad_norm": 0.02099609375, + "learning_rate": 0.027861002621344475, + "loss": 0.7774, + "num_input_tokens_seen": 19860144, + "step": 34225 + }, + { + "epoch": 5.098302055406613, + "grad_norm": 0.029296875, + "learning_rate": 0.027859999123257366, + "loss": 0.7751, + "num_input_tokens_seen": 19863312, + "step": 34230 + }, + { + "epoch": 5.099046767947573, + "grad_norm": 0.01904296875, + "learning_rate": 0.02785899540791364, + "loss": 0.8063, + "num_input_tokens_seen": 19866352, + "step": 34235 + }, + { + "epoch": 5.099791480488531, + "grad_norm": 0.0216064453125, + "learning_rate": 0.027857991475330256, + "loss": 0.8, + "num_input_tokens_seen": 19869168, + "step": 34240 + }, + { + "epoch": 5.100536193029491, + "grad_norm": 0.0152587890625, + "learning_rate": 0.027856987325524173, + "loss": 0.8162, + "num_input_tokens_seen": 19872528, + "step": 34245 + }, + { + "epoch": 5.10128090557045, + "grad_norm": 0.0186767578125, + "learning_rate": 0.027855982958512354, + "loss": 0.8039, + "num_input_tokens_seen": 19875248, + "step": 34250 + }, + { + "epoch": 5.102025618111409, + "grad_norm": 0.0174560546875, + "learning_rate": 0.027854978374311775, + "loss": 0.7961, + "num_input_tokens_seen": 19877936, + "step": 34255 + }, + { + "epoch": 5.102770330652368, + "grad_norm": 0.0284423828125, + "learning_rate": 0.027853973572939398, + "loss": 0.8149, + "num_input_tokens_seen": 19880912, + "step": 34260 + }, + { + "epoch": 5.103515043193327, + "grad_norm": 0.01434326171875, + "learning_rate": 0.027852968554412202, + "loss": 0.831, + "num_input_tokens_seen": 19883856, + "step": 34265 + }, + { + "epoch": 5.104259755734287, + "grad_norm": 0.01312255859375, + "learning_rate": 0.027851963318747168, + "loss": 0.8175, + "num_input_tokens_seen": 19886960, + "step": 34270 + }, + { + "epoch": 5.105004468275245, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02785095786596127, + "loss": 0.7906, + "num_input_tokens_seen": 19889840, + "step": 34275 + }, + { + "epoch": 5.105749180816205, + "grad_norm": 0.025634765625, + "learning_rate": 0.027849952196071507, + "loss": 0.8, + "num_input_tokens_seen": 19892688, + "step": 34280 + }, + { + "epoch": 5.106493893357164, + "grad_norm": 0.0255126953125, + "learning_rate": 0.027848946309094857, + "loss": 0.8071, + "num_input_tokens_seen": 19895856, + "step": 34285 + }, + { + "epoch": 5.107238605898123, + "grad_norm": 0.01171875, + "learning_rate": 0.02784794020504832, + "loss": 0.811, + "num_input_tokens_seen": 19898576, + "step": 34290 + }, + { + "epoch": 5.107983318439082, + "grad_norm": 0.025146484375, + "learning_rate": 0.027846933883948894, + "loss": 0.8056, + "num_input_tokens_seen": 19901328, + "step": 34295 + }, + { + "epoch": 5.108728030980042, + "grad_norm": 0.027099609375, + "learning_rate": 0.027845927345813568, + "loss": 0.821, + "num_input_tokens_seen": 19903984, + "step": 34300 + }, + { + "epoch": 5.109472743521001, + "grad_norm": 0.0208740234375, + "learning_rate": 0.027844920590659364, + "loss": 0.8, + "num_input_tokens_seen": 19906864, + "step": 34305 + }, + { + "epoch": 5.11021745606196, + "grad_norm": 0.0181884765625, + "learning_rate": 0.02784391361850328, + "loss": 0.8066, + "num_input_tokens_seen": 19909680, + "step": 34310 + }, + { + "epoch": 5.110962168602919, + "grad_norm": 0.0260009765625, + "learning_rate": 0.027842906429362328, + "loss": 0.8066, + "num_input_tokens_seen": 19912464, + "step": 34315 + }, + { + "epoch": 5.111706881143879, + "grad_norm": 0.02001953125, + "learning_rate": 0.027841899023253527, + "loss": 0.8223, + "num_input_tokens_seen": 19915280, + "step": 34320 + }, + { + "epoch": 5.112451593684837, + "grad_norm": 0.0177001953125, + "learning_rate": 0.027840891400193888, + "loss": 0.7996, + "num_input_tokens_seen": 19917904, + "step": 34325 + }, + { + "epoch": 5.113196306225797, + "grad_norm": 0.01275634765625, + "learning_rate": 0.027839883560200448, + "loss": 0.8148, + "num_input_tokens_seen": 19920560, + "step": 34330 + }, + { + "epoch": 5.113941018766756, + "grad_norm": 0.0184326171875, + "learning_rate": 0.02783887550329022, + "loss": 0.8111, + "num_input_tokens_seen": 19923248, + "step": 34335 + }, + { + "epoch": 5.114685731307715, + "grad_norm": 0.01031494140625, + "learning_rate": 0.02783786722948024, + "loss": 0.7833, + "num_input_tokens_seen": 19926064, + "step": 34340 + }, + { + "epoch": 5.115430443848674, + "grad_norm": 0.0201416015625, + "learning_rate": 0.027836858738787544, + "loss": 0.8003, + "num_input_tokens_seen": 19929040, + "step": 34345 + }, + { + "epoch": 5.116175156389634, + "grad_norm": 0.01251220703125, + "learning_rate": 0.027835850031229164, + "loss": 0.8014, + "num_input_tokens_seen": 19931888, + "step": 34350 + }, + { + "epoch": 5.116919868930593, + "grad_norm": 0.021484375, + "learning_rate": 0.027834841106822144, + "loss": 0.8031, + "num_input_tokens_seen": 19934736, + "step": 34355 + }, + { + "epoch": 5.117664581471552, + "grad_norm": 0.018798828125, + "learning_rate": 0.027833831965583523, + "loss": 0.7977, + "num_input_tokens_seen": 19937392, + "step": 34360 + }, + { + "epoch": 5.118409294012511, + "grad_norm": 0.01806640625, + "learning_rate": 0.02783282260753036, + "loss": 0.7949, + "num_input_tokens_seen": 19940112, + "step": 34365 + }, + { + "epoch": 5.119154006553471, + "grad_norm": 0.0184326171875, + "learning_rate": 0.0278318130326797, + "loss": 0.7942, + "num_input_tokens_seen": 19942992, + "step": 34370 + }, + { + "epoch": 5.119898719094429, + "grad_norm": 0.024658203125, + "learning_rate": 0.027830803241048602, + "loss": 0.8076, + "num_input_tokens_seen": 19946256, + "step": 34375 + }, + { + "epoch": 5.120643431635389, + "grad_norm": 0.0166015625, + "learning_rate": 0.027829793232654127, + "loss": 0.7742, + "num_input_tokens_seen": 19949232, + "step": 34380 + }, + { + "epoch": 5.121388144176348, + "grad_norm": 0.02392578125, + "learning_rate": 0.027828783007513326, + "loss": 0.8006, + "num_input_tokens_seen": 19952208, + "step": 34385 + }, + { + "epoch": 5.1221328567173074, + "grad_norm": 0.018310546875, + "learning_rate": 0.027827772565643283, + "loss": 0.7991, + "num_input_tokens_seen": 19954960, + "step": 34390 + }, + { + "epoch": 5.122877569258266, + "grad_norm": 0.0179443359375, + "learning_rate": 0.027826761907061053, + "loss": 0.8013, + "num_input_tokens_seen": 19957904, + "step": 34395 + }, + { + "epoch": 5.123622281799226, + "grad_norm": 0.0179443359375, + "learning_rate": 0.02782575103178372, + "loss": 0.7793, + "num_input_tokens_seen": 19960720, + "step": 34400 + }, + { + "epoch": 5.124366994340185, + "grad_norm": 0.02587890625, + "learning_rate": 0.027824739939828362, + "loss": 0.8197, + "num_input_tokens_seen": 19963312, + "step": 34405 + }, + { + "epoch": 5.125111706881144, + "grad_norm": 0.01104736328125, + "learning_rate": 0.027823728631212057, + "loss": 0.7779, + "num_input_tokens_seen": 19966192, + "step": 34410 + }, + { + "epoch": 5.125856419422103, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02782271710595189, + "loss": 0.7914, + "num_input_tokens_seen": 19969296, + "step": 34415 + }, + { + "epoch": 5.126601131963063, + "grad_norm": 0.01214599609375, + "learning_rate": 0.027821705364064947, + "loss": 0.8089, + "num_input_tokens_seen": 19972208, + "step": 34420 + }, + { + "epoch": 5.127345844504021, + "grad_norm": 0.011962890625, + "learning_rate": 0.027820693405568322, + "loss": 0.7996, + "num_input_tokens_seen": 19975088, + "step": 34425 + }, + { + "epoch": 5.128090557044981, + "grad_norm": 0.0137939453125, + "learning_rate": 0.027819681230479117, + "loss": 0.8318, + "num_input_tokens_seen": 19977840, + "step": 34430 + }, + { + "epoch": 5.12883526958594, + "grad_norm": 0.022705078125, + "learning_rate": 0.027818668838814425, + "loss": 0.7898, + "num_input_tokens_seen": 19980496, + "step": 34435 + }, + { + "epoch": 5.129579982126899, + "grad_norm": 0.0306396484375, + "learning_rate": 0.02781765623059135, + "loss": 0.7927, + "num_input_tokens_seen": 19983696, + "step": 34440 + }, + { + "epoch": 5.130324694667858, + "grad_norm": 0.0186767578125, + "learning_rate": 0.027816643405827002, + "loss": 0.8158, + "num_input_tokens_seen": 19986576, + "step": 34445 + }, + { + "epoch": 5.131069407208817, + "grad_norm": 0.0260009765625, + "learning_rate": 0.027815630364538487, + "loss": 0.7994, + "num_input_tokens_seen": 19989200, + "step": 34450 + }, + { + "epoch": 5.131814119749777, + "grad_norm": 0.0194091796875, + "learning_rate": 0.027814617106742927, + "loss": 0.8187, + "num_input_tokens_seen": 19991920, + "step": 34455 + }, + { + "epoch": 5.132558832290735, + "grad_norm": 0.0263671875, + "learning_rate": 0.027813603632457433, + "loss": 0.8433, + "num_input_tokens_seen": 19994800, + "step": 34460 + }, + { + "epoch": 5.133303544831695, + "grad_norm": 0.02587890625, + "learning_rate": 0.027812589941699124, + "loss": 0.8119, + "num_input_tokens_seen": 19997680, + "step": 34465 + }, + { + "epoch": 5.134048257372654, + "grad_norm": 0.0291748046875, + "learning_rate": 0.027811576034485138, + "loss": 0.7891, + "num_input_tokens_seen": 20000368, + "step": 34470 + }, + { + "epoch": 5.1347929699136134, + "grad_norm": 0.0185546875, + "learning_rate": 0.02781056191083259, + "loss": 0.8158, + "num_input_tokens_seen": 20003152, + "step": 34475 + }, + { + "epoch": 5.135537682454572, + "grad_norm": 0.0177001953125, + "learning_rate": 0.02780954757075862, + "loss": 0.7888, + "num_input_tokens_seen": 20006064, + "step": 34480 + }, + { + "epoch": 5.136282394995532, + "grad_norm": 0.0286865234375, + "learning_rate": 0.02780853301428037, + "loss": 0.7903, + "num_input_tokens_seen": 20008432, + "step": 34485 + }, + { + "epoch": 5.137027107536491, + "grad_norm": 0.01397705078125, + "learning_rate": 0.027807518241414964, + "loss": 0.7985, + "num_input_tokens_seen": 20011440, + "step": 34490 + }, + { + "epoch": 5.13777182007745, + "grad_norm": 0.0113525390625, + "learning_rate": 0.027806503252179555, + "loss": 0.819, + "num_input_tokens_seen": 20014064, + "step": 34495 + }, + { + "epoch": 5.138516532618409, + "grad_norm": 0.026123046875, + "learning_rate": 0.027805488046591293, + "loss": 0.8089, + "num_input_tokens_seen": 20016976, + "step": 34500 + }, + { + "epoch": 5.139261245159369, + "grad_norm": 0.01171875, + "learning_rate": 0.027804472624667325, + "loss": 0.7856, + "num_input_tokens_seen": 20020016, + "step": 34505 + }, + { + "epoch": 5.140005957700327, + "grad_norm": 0.017333984375, + "learning_rate": 0.027803456986424804, + "loss": 0.8108, + "num_input_tokens_seen": 20022768, + "step": 34510 + }, + { + "epoch": 5.140750670241287, + "grad_norm": 0.0184326171875, + "learning_rate": 0.027802441131880892, + "loss": 0.8197, + "num_input_tokens_seen": 20025712, + "step": 34515 + }, + { + "epoch": 5.141495382782246, + "grad_norm": 0.0185546875, + "learning_rate": 0.027801425061052747, + "loss": 0.8128, + "num_input_tokens_seen": 20028368, + "step": 34520 + }, + { + "epoch": 5.1422400953232055, + "grad_norm": 0.010986328125, + "learning_rate": 0.02780040877395754, + "loss": 0.8111, + "num_input_tokens_seen": 20031472, + "step": 34525 + }, + { + "epoch": 5.142984807864164, + "grad_norm": 0.01904296875, + "learning_rate": 0.02779939227061243, + "loss": 0.7916, + "num_input_tokens_seen": 20034192, + "step": 34530 + }, + { + "epoch": 5.143729520405124, + "grad_norm": 0.0205078125, + "learning_rate": 0.0277983755510346, + "loss": 0.8159, + "num_input_tokens_seen": 20037264, + "step": 34535 + }, + { + "epoch": 5.144474232946083, + "grad_norm": 0.0201416015625, + "learning_rate": 0.027797358615241224, + "loss": 0.8096, + "num_input_tokens_seen": 20040304, + "step": 34540 + }, + { + "epoch": 5.145218945487042, + "grad_norm": 0.0181884765625, + "learning_rate": 0.02779634146324948, + "loss": 0.8112, + "num_input_tokens_seen": 20043248, + "step": 34545 + }, + { + "epoch": 5.145963658028001, + "grad_norm": 0.02392578125, + "learning_rate": 0.02779532409507655, + "loss": 0.8008, + "num_input_tokens_seen": 20046064, + "step": 34550 + }, + { + "epoch": 5.146708370568961, + "grad_norm": 0.01953125, + "learning_rate": 0.027794306510739625, + "loss": 0.8119, + "num_input_tokens_seen": 20049392, + "step": 34555 + }, + { + "epoch": 5.1474530831099194, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0277932887102559, + "loss": 0.7972, + "num_input_tokens_seen": 20052304, + "step": 34560 + }, + { + "epoch": 5.148197795650879, + "grad_norm": 0.0211181640625, + "learning_rate": 0.027792270693642557, + "loss": 0.8057, + "num_input_tokens_seen": 20054960, + "step": 34565 + }, + { + "epoch": 5.148942508191838, + "grad_norm": 0.01904296875, + "learning_rate": 0.027791252460916807, + "loss": 0.8091, + "num_input_tokens_seen": 20058000, + "step": 34570 + }, + { + "epoch": 5.1496872207327975, + "grad_norm": 0.01953125, + "learning_rate": 0.027790234012095846, + "loss": 0.7995, + "num_input_tokens_seen": 20061072, + "step": 34575 + }, + { + "epoch": 5.150431933273756, + "grad_norm": 0.0179443359375, + "learning_rate": 0.027789215347196885, + "loss": 0.809, + "num_input_tokens_seen": 20063824, + "step": 34580 + }, + { + "epoch": 5.151176645814716, + "grad_norm": 0.02001953125, + "learning_rate": 0.027788196466237123, + "loss": 0.7938, + "num_input_tokens_seen": 20066896, + "step": 34585 + }, + { + "epoch": 5.151921358355675, + "grad_norm": 0.0234375, + "learning_rate": 0.02778717736923378, + "loss": 0.8054, + "num_input_tokens_seen": 20069648, + "step": 34590 + }, + { + "epoch": 5.152666070896634, + "grad_norm": 0.019287109375, + "learning_rate": 0.02778615805620407, + "loss": 0.7902, + "num_input_tokens_seen": 20072528, + "step": 34595 + }, + { + "epoch": 5.153410783437593, + "grad_norm": 0.02734375, + "learning_rate": 0.027785138527165222, + "loss": 0.8064, + "num_input_tokens_seen": 20075504, + "step": 34600 + }, + { + "epoch": 5.154155495978552, + "grad_norm": 0.0263671875, + "learning_rate": 0.027784118782134444, + "loss": 0.8055, + "num_input_tokens_seen": 20078672, + "step": 34605 + }, + { + "epoch": 5.1549002085195115, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02778309882112898, + "loss": 0.7801, + "num_input_tokens_seen": 20081296, + "step": 34610 + }, + { + "epoch": 5.15564492106047, + "grad_norm": 0.0264892578125, + "learning_rate": 0.02778207864416605, + "loss": 0.7981, + "num_input_tokens_seen": 20084208, + "step": 34615 + }, + { + "epoch": 5.15638963360143, + "grad_norm": 0.0240478515625, + "learning_rate": 0.02778105825126289, + "loss": 0.8224, + "num_input_tokens_seen": 20087088, + "step": 34620 + }, + { + "epoch": 5.157134346142389, + "grad_norm": 0.01806640625, + "learning_rate": 0.027780037642436743, + "loss": 0.7984, + "num_input_tokens_seen": 20090032, + "step": 34625 + }, + { + "epoch": 5.157879058683348, + "grad_norm": 0.0201416015625, + "learning_rate": 0.027779016817704852, + "loss": 0.7863, + "num_input_tokens_seen": 20092848, + "step": 34630 + }, + { + "epoch": 5.158623771224307, + "grad_norm": 0.0107421875, + "learning_rate": 0.02777799577708446, + "loss": 0.7858, + "num_input_tokens_seen": 20095792, + "step": 34635 + }, + { + "epoch": 5.159368483765267, + "grad_norm": 0.01251220703125, + "learning_rate": 0.027776974520592815, + "loss": 0.8018, + "num_input_tokens_seen": 20099056, + "step": 34640 + }, + { + "epoch": 5.1601131963062254, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02777595304824717, + "loss": 0.7876, + "num_input_tokens_seen": 20101712, + "step": 34645 + }, + { + "epoch": 5.160857908847185, + "grad_norm": 0.01300048828125, + "learning_rate": 0.027774931360064783, + "loss": 0.8089, + "num_input_tokens_seen": 20104720, + "step": 34650 + }, + { + "epoch": 5.161602621388144, + "grad_norm": 0.019287109375, + "learning_rate": 0.027773909456062913, + "loss": 0.794, + "num_input_tokens_seen": 20107632, + "step": 34655 + }, + { + "epoch": 5.1623473339291035, + "grad_norm": 0.017578125, + "learning_rate": 0.02777288733625883, + "loss": 0.7871, + "num_input_tokens_seen": 20110768, + "step": 34660 + }, + { + "epoch": 5.163092046470062, + "grad_norm": 0.018310546875, + "learning_rate": 0.027771865000669794, + "loss": 0.8353, + "num_input_tokens_seen": 20113584, + "step": 34665 + }, + { + "epoch": 5.163836759011022, + "grad_norm": 0.01708984375, + "learning_rate": 0.02777084244931308, + "loss": 0.7958, + "num_input_tokens_seen": 20116656, + "step": 34670 + }, + { + "epoch": 5.164581471551981, + "grad_norm": 0.0272216796875, + "learning_rate": 0.027769819682205963, + "loss": 0.8249, + "num_input_tokens_seen": 20119696, + "step": 34675 + }, + { + "epoch": 5.16532618409294, + "grad_norm": 0.01806640625, + "learning_rate": 0.02776879669936572, + "loss": 0.7985, + "num_input_tokens_seen": 20122448, + "step": 34680 + }, + { + "epoch": 5.166070896633899, + "grad_norm": 0.0184326171875, + "learning_rate": 0.027767773500809636, + "loss": 0.8122, + "num_input_tokens_seen": 20125328, + "step": 34685 + }, + { + "epoch": 5.166815609174859, + "grad_norm": 0.03515625, + "learning_rate": 0.027766750086554992, + "loss": 0.8108, + "num_input_tokens_seen": 20128272, + "step": 34690 + }, + { + "epoch": 5.1675603217158175, + "grad_norm": 0.01092529296875, + "learning_rate": 0.027765726456619084, + "loss": 0.7984, + "num_input_tokens_seen": 20131216, + "step": 34695 + }, + { + "epoch": 5.168305034256777, + "grad_norm": 0.021484375, + "learning_rate": 0.027764702611019203, + "loss": 0.8163, + "num_input_tokens_seen": 20134384, + "step": 34700 + }, + { + "epoch": 5.169049746797736, + "grad_norm": 0.0179443359375, + "learning_rate": 0.027763678549772644, + "loss": 0.7871, + "num_input_tokens_seen": 20137360, + "step": 34705 + }, + { + "epoch": 5.1697944593386955, + "grad_norm": 0.01214599609375, + "learning_rate": 0.027762654272896706, + "loss": 0.8081, + "num_input_tokens_seen": 20140432, + "step": 34710 + }, + { + "epoch": 5.170539171879654, + "grad_norm": 0.020263671875, + "learning_rate": 0.0277616297804087, + "loss": 0.7963, + "num_input_tokens_seen": 20143344, + "step": 34715 + }, + { + "epoch": 5.171283884420614, + "grad_norm": 0.017333984375, + "learning_rate": 0.027760605072325927, + "loss": 0.807, + "num_input_tokens_seen": 20146384, + "step": 34720 + }, + { + "epoch": 5.172028596961573, + "grad_norm": 0.0201416015625, + "learning_rate": 0.0277595801486657, + "loss": 0.7934, + "num_input_tokens_seen": 20149136, + "step": 34725 + }, + { + "epoch": 5.172773309502532, + "grad_norm": 0.0174560546875, + "learning_rate": 0.027758555009445334, + "loss": 0.7933, + "num_input_tokens_seen": 20151824, + "step": 34730 + }, + { + "epoch": 5.173518022043491, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02775752965468215, + "loss": 0.7914, + "num_input_tokens_seen": 20154544, + "step": 34735 + }, + { + "epoch": 5.174262734584451, + "grad_norm": 0.017822265625, + "learning_rate": 0.027756504084393466, + "loss": 0.8016, + "num_input_tokens_seen": 20157488, + "step": 34740 + }, + { + "epoch": 5.1750074471254095, + "grad_norm": 0.0208740234375, + "learning_rate": 0.027755478298596616, + "loss": 0.8058, + "num_input_tokens_seen": 20159984, + "step": 34745 + }, + { + "epoch": 5.175752159666369, + "grad_norm": 0.0174560546875, + "learning_rate": 0.02775445229730892, + "loss": 0.7876, + "num_input_tokens_seen": 20162704, + "step": 34750 + }, + { + "epoch": 5.176496872207328, + "grad_norm": 0.0242919921875, + "learning_rate": 0.027753426080547716, + "loss": 0.8135, + "num_input_tokens_seen": 20165616, + "step": 34755 + }, + { + "epoch": 5.1772415847482876, + "grad_norm": 0.022705078125, + "learning_rate": 0.027752399648330338, + "loss": 0.7873, + "num_input_tokens_seen": 20168432, + "step": 34760 + }, + { + "epoch": 5.177986297289246, + "grad_norm": 0.011962890625, + "learning_rate": 0.02775137300067413, + "loss": 0.787, + "num_input_tokens_seen": 20171408, + "step": 34765 + }, + { + "epoch": 5.178731009830206, + "grad_norm": 0.018310546875, + "learning_rate": 0.02775034613759644, + "loss": 0.7927, + "num_input_tokens_seen": 20174224, + "step": 34770 + }, + { + "epoch": 5.179475722371165, + "grad_norm": 0.0186767578125, + "learning_rate": 0.027749319059114607, + "loss": 0.808, + "num_input_tokens_seen": 20177360, + "step": 34775 + }, + { + "epoch": 5.180220434912124, + "grad_norm": 0.0184326171875, + "learning_rate": 0.027748291765245982, + "loss": 0.7904, + "num_input_tokens_seen": 20180176, + "step": 34780 + }, + { + "epoch": 5.180965147453083, + "grad_norm": 0.0267333984375, + "learning_rate": 0.02774726425600793, + "loss": 0.7994, + "num_input_tokens_seen": 20183024, + "step": 34785 + }, + { + "epoch": 5.181709859994042, + "grad_norm": 0.0208740234375, + "learning_rate": 0.027746236531417806, + "loss": 0.8035, + "num_input_tokens_seen": 20185872, + "step": 34790 + }, + { + "epoch": 5.1824545725350015, + "grad_norm": 0.019287109375, + "learning_rate": 0.027745208591492966, + "loss": 0.8122, + "num_input_tokens_seen": 20188592, + "step": 34795 + }, + { + "epoch": 5.18319928507596, + "grad_norm": 0.0185546875, + "learning_rate": 0.027744180436250782, + "loss": 0.8001, + "num_input_tokens_seen": 20191504, + "step": 34800 + }, + { + "epoch": 5.18394399761692, + "grad_norm": 0.023681640625, + "learning_rate": 0.027743152065708623, + "loss": 0.8265, + "num_input_tokens_seen": 20194576, + "step": 34805 + }, + { + "epoch": 5.184688710157879, + "grad_norm": 0.0245361328125, + "learning_rate": 0.02774212347988386, + "loss": 0.8177, + "num_input_tokens_seen": 20197456, + "step": 34810 + }, + { + "epoch": 5.185433422698838, + "grad_norm": 0.0245361328125, + "learning_rate": 0.027741094678793873, + "loss": 0.8127, + "num_input_tokens_seen": 20200368, + "step": 34815 + }, + { + "epoch": 5.186178135239797, + "grad_norm": 0.028076171875, + "learning_rate": 0.02774006566245604, + "loss": 0.7817, + "num_input_tokens_seen": 20203152, + "step": 34820 + }, + { + "epoch": 5.186922847780757, + "grad_norm": 0.018310546875, + "learning_rate": 0.027739036430887747, + "loss": 0.7913, + "num_input_tokens_seen": 20205840, + "step": 34825 + }, + { + "epoch": 5.1876675603217155, + "grad_norm": 0.0272216796875, + "learning_rate": 0.02773800698410638, + "loss": 0.7899, + "num_input_tokens_seen": 20208656, + "step": 34830 + }, + { + "epoch": 5.188412272862675, + "grad_norm": 0.02587890625, + "learning_rate": 0.027736977322129326, + "loss": 0.7792, + "num_input_tokens_seen": 20211344, + "step": 34835 + }, + { + "epoch": 5.189156985403634, + "grad_norm": 0.019775390625, + "learning_rate": 0.027735947444974, + "loss": 0.8308, + "num_input_tokens_seen": 20214288, + "step": 34840 + }, + { + "epoch": 5.1899016979445936, + "grad_norm": 0.0296630859375, + "learning_rate": 0.027734917352657772, + "loss": 0.7925, + "num_input_tokens_seen": 20216912, + "step": 34845 + }, + { + "epoch": 5.190646410485552, + "grad_norm": 0.022216796875, + "learning_rate": 0.027733887045198063, + "loss": 0.7965, + "num_input_tokens_seen": 20219984, + "step": 34850 + }, + { + "epoch": 5.191391123026512, + "grad_norm": 0.0194091796875, + "learning_rate": 0.027732856522612277, + "loss": 0.8058, + "num_input_tokens_seen": 20222736, + "step": 34855 + }, + { + "epoch": 5.192135835567471, + "grad_norm": 0.0186767578125, + "learning_rate": 0.027731825784917816, + "loss": 0.8065, + "num_input_tokens_seen": 20225488, + "step": 34860 + }, + { + "epoch": 5.19288054810843, + "grad_norm": 0.0283203125, + "learning_rate": 0.027730794832132102, + "loss": 0.8214, + "num_input_tokens_seen": 20228336, + "step": 34865 + }, + { + "epoch": 5.193625260649389, + "grad_norm": 0.0196533203125, + "learning_rate": 0.027729763664272548, + "loss": 0.7902, + "num_input_tokens_seen": 20231568, + "step": 34870 + }, + { + "epoch": 5.194369973190349, + "grad_norm": 0.0225830078125, + "learning_rate": 0.027728732281356576, + "loss": 0.8249, + "num_input_tokens_seen": 20234512, + "step": 34875 + }, + { + "epoch": 5.1951146857313075, + "grad_norm": 0.01318359375, + "learning_rate": 0.027727700683401607, + "loss": 0.8002, + "num_input_tokens_seen": 20237456, + "step": 34880 + }, + { + "epoch": 5.195859398272267, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02772666887042507, + "loss": 0.8033, + "num_input_tokens_seen": 20240848, + "step": 34885 + }, + { + "epoch": 5.196604110813226, + "grad_norm": 0.01904296875, + "learning_rate": 0.027725636842444397, + "loss": 0.8123, + "num_input_tokens_seen": 20243696, + "step": 34890 + }, + { + "epoch": 5.197348823354186, + "grad_norm": 0.01318359375, + "learning_rate": 0.027724604599477027, + "loss": 0.8279, + "num_input_tokens_seen": 20246608, + "step": 34895 + }, + { + "epoch": 5.198093535895144, + "grad_norm": 0.013427734375, + "learning_rate": 0.027723572141540392, + "loss": 0.826, + "num_input_tokens_seen": 20249360, + "step": 34900 + }, + { + "epoch": 5.198838248436104, + "grad_norm": 0.01214599609375, + "learning_rate": 0.027722539468651938, + "loss": 0.7986, + "num_input_tokens_seen": 20252464, + "step": 34905 + }, + { + "epoch": 5.199582960977063, + "grad_norm": 0.01031494140625, + "learning_rate": 0.027721506580829107, + "loss": 0.7918, + "num_input_tokens_seen": 20255056, + "step": 34910 + }, + { + "epoch": 5.200327673518022, + "grad_norm": 0.02587890625, + "learning_rate": 0.02772047347808936, + "loss": 0.8068, + "num_input_tokens_seen": 20258160, + "step": 34915 + }, + { + "epoch": 5.201072386058981, + "grad_norm": 0.0111083984375, + "learning_rate": 0.027719440160450137, + "loss": 0.7967, + "num_input_tokens_seen": 20261136, + "step": 34920 + }, + { + "epoch": 5.201817098599941, + "grad_norm": 0.01409912109375, + "learning_rate": 0.027718406627928897, + "loss": 0.8118, + "num_input_tokens_seen": 20264176, + "step": 34925 + }, + { + "epoch": 5.2025618111408996, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02771737288054311, + "loss": 0.8223, + "num_input_tokens_seen": 20266960, + "step": 34930 + }, + { + "epoch": 5.203306523681859, + "grad_norm": 0.019775390625, + "learning_rate": 0.027716338918310227, + "loss": 0.7983, + "num_input_tokens_seen": 20269776, + "step": 34935 + }, + { + "epoch": 5.204051236222818, + "grad_norm": 0.01068115234375, + "learning_rate": 0.027715304741247726, + "loss": 0.8092, + "num_input_tokens_seen": 20272272, + "step": 34940 + }, + { + "epoch": 5.204795948763778, + "grad_norm": 0.021728515625, + "learning_rate": 0.027714270349373078, + "loss": 0.8036, + "num_input_tokens_seen": 20275088, + "step": 34945 + }, + { + "epoch": 5.205540661304736, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02771323574270375, + "loss": 0.8136, + "num_input_tokens_seen": 20277808, + "step": 34950 + }, + { + "epoch": 5.206285373845695, + "grad_norm": 0.0250244140625, + "learning_rate": 0.027712200921257226, + "loss": 0.8087, + "num_input_tokens_seen": 20280496, + "step": 34955 + }, + { + "epoch": 5.207030086386655, + "grad_norm": 0.0235595703125, + "learning_rate": 0.027711165885050987, + "loss": 0.8099, + "num_input_tokens_seen": 20283824, + "step": 34960 + }, + { + "epoch": 5.2077747989276135, + "grad_norm": 0.021240234375, + "learning_rate": 0.02771013063410252, + "loss": 0.8071, + "num_input_tokens_seen": 20286864, + "step": 34965 + }, + { + "epoch": 5.208519511468573, + "grad_norm": 0.027587890625, + "learning_rate": 0.027709095168429315, + "loss": 0.7813, + "num_input_tokens_seen": 20289648, + "step": 34970 + }, + { + "epoch": 5.209264224009532, + "grad_norm": 0.02880859375, + "learning_rate": 0.027708059488048865, + "loss": 0.808, + "num_input_tokens_seen": 20293072, + "step": 34975 + }, + { + "epoch": 5.210008936550492, + "grad_norm": 0.018798828125, + "learning_rate": 0.027707023592978663, + "loss": 0.79, + "num_input_tokens_seen": 20295920, + "step": 34980 + }, + { + "epoch": 5.21075364909145, + "grad_norm": 0.01373291015625, + "learning_rate": 0.027705987483236213, + "loss": 0.7941, + "num_input_tokens_seen": 20298768, + "step": 34985 + }, + { + "epoch": 5.21149836163241, + "grad_norm": 0.020751953125, + "learning_rate": 0.027704951158839017, + "loss": 0.8171, + "num_input_tokens_seen": 20301584, + "step": 34990 + }, + { + "epoch": 5.212243074173369, + "grad_norm": 0.0283203125, + "learning_rate": 0.027703914619804586, + "loss": 0.8062, + "num_input_tokens_seen": 20304624, + "step": 34995 + }, + { + "epoch": 5.212987786714328, + "grad_norm": 0.021728515625, + "learning_rate": 0.027702877866150433, + "loss": 0.7884, + "num_input_tokens_seen": 20307664, + "step": 35000 + }, + { + "epoch": 5.213732499255287, + "grad_norm": 0.02294921875, + "learning_rate": 0.027701840897894063, + "loss": 0.8095, + "num_input_tokens_seen": 20310640, + "step": 35005 + }, + { + "epoch": 5.214477211796247, + "grad_norm": 0.01336669921875, + "learning_rate": 0.027700803715053002, + "loss": 0.8157, + "num_input_tokens_seen": 20313328, + "step": 35010 + }, + { + "epoch": 5.2152219243372056, + "grad_norm": 0.02783203125, + "learning_rate": 0.02769976631764477, + "loss": 0.8069, + "num_input_tokens_seen": 20316080, + "step": 35015 + }, + { + "epoch": 5.215966636878165, + "grad_norm": 0.02783203125, + "learning_rate": 0.027698728705686893, + "loss": 0.8092, + "num_input_tokens_seen": 20318960, + "step": 35020 + }, + { + "epoch": 5.216711349419124, + "grad_norm": 0.01226806640625, + "learning_rate": 0.027697690879196903, + "loss": 0.8072, + "num_input_tokens_seen": 20321936, + "step": 35025 + }, + { + "epoch": 5.217456061960084, + "grad_norm": 0.01507568359375, + "learning_rate": 0.02769665283819233, + "loss": 0.8033, + "num_input_tokens_seen": 20325072, + "step": 35030 + }, + { + "epoch": 5.218200774501042, + "grad_norm": 0.03271484375, + "learning_rate": 0.027695614582690712, + "loss": 0.8066, + "num_input_tokens_seen": 20328272, + "step": 35035 + }, + { + "epoch": 5.218945487042002, + "grad_norm": 0.030517578125, + "learning_rate": 0.02769457611270959, + "loss": 0.8119, + "num_input_tokens_seen": 20331152, + "step": 35040 + }, + { + "epoch": 5.219690199582961, + "grad_norm": 0.0269775390625, + "learning_rate": 0.027693537428266506, + "loss": 0.7936, + "num_input_tokens_seen": 20334000, + "step": 35045 + }, + { + "epoch": 5.22043491212392, + "grad_norm": 0.0245361328125, + "learning_rate": 0.02769249852937901, + "loss": 0.8138, + "num_input_tokens_seen": 20336976, + "step": 35050 + }, + { + "epoch": 5.221179624664879, + "grad_norm": 0.029541015625, + "learning_rate": 0.027691459416064646, + "loss": 0.8019, + "num_input_tokens_seen": 20339952, + "step": 35055 + }, + { + "epoch": 5.221924337205839, + "grad_norm": 0.025634765625, + "learning_rate": 0.02769042008834098, + "loss": 0.802, + "num_input_tokens_seen": 20342832, + "step": 35060 + }, + { + "epoch": 5.222669049746798, + "grad_norm": 0.02392578125, + "learning_rate": 0.02768938054622556, + "loss": 0.7866, + "num_input_tokens_seen": 20345584, + "step": 35065 + }, + { + "epoch": 5.223413762287757, + "grad_norm": 0.02734375, + "learning_rate": 0.02768834078973596, + "loss": 0.809, + "num_input_tokens_seen": 20348432, + "step": 35070 + }, + { + "epoch": 5.224158474828716, + "grad_norm": 0.0281982421875, + "learning_rate": 0.027687300818889732, + "loss": 0.801, + "num_input_tokens_seen": 20351632, + "step": 35075 + }, + { + "epoch": 5.224903187369676, + "grad_norm": 0.0291748046875, + "learning_rate": 0.02768626063370445, + "loss": 0.8058, + "num_input_tokens_seen": 20354608, + "step": 35080 + }, + { + "epoch": 5.225647899910634, + "grad_norm": 0.0228271484375, + "learning_rate": 0.027685220234197692, + "loss": 0.8178, + "num_input_tokens_seen": 20357456, + "step": 35085 + }, + { + "epoch": 5.226392612451594, + "grad_norm": 0.021484375, + "learning_rate": 0.02768417962038703, + "loss": 0.8037, + "num_input_tokens_seen": 20360784, + "step": 35090 + }, + { + "epoch": 5.227137324992553, + "grad_norm": 0.0242919921875, + "learning_rate": 0.027683138792290042, + "loss": 0.7969, + "num_input_tokens_seen": 20363600, + "step": 35095 + }, + { + "epoch": 5.227882037533512, + "grad_norm": 0.023193359375, + "learning_rate": 0.027682097749924316, + "loss": 0.8092, + "num_input_tokens_seen": 20366608, + "step": 35100 + }, + { + "epoch": 5.228626750074471, + "grad_norm": 0.031005859375, + "learning_rate": 0.02768105649330744, + "loss": 0.8005, + "num_input_tokens_seen": 20369712, + "step": 35105 + }, + { + "epoch": 5.229371462615431, + "grad_norm": 0.02490234375, + "learning_rate": 0.027680015022457007, + "loss": 0.8035, + "num_input_tokens_seen": 20372880, + "step": 35110 + }, + { + "epoch": 5.23011617515639, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0276789733373906, + "loss": 0.8095, + "num_input_tokens_seen": 20375760, + "step": 35115 + }, + { + "epoch": 5.230860887697349, + "grad_norm": 0.0262451171875, + "learning_rate": 0.027677931438125827, + "loss": 0.7974, + "num_input_tokens_seen": 20378704, + "step": 35120 + }, + { + "epoch": 5.231605600238308, + "grad_norm": 0.01495361328125, + "learning_rate": 0.02767688932468029, + "loss": 0.7947, + "num_input_tokens_seen": 20381360, + "step": 35125 + }, + { + "epoch": 5.232350312779268, + "grad_norm": 0.024658203125, + "learning_rate": 0.02767584699707159, + "loss": 0.7916, + "num_input_tokens_seen": 20384240, + "step": 35130 + }, + { + "epoch": 5.233095025320226, + "grad_norm": 0.031494140625, + "learning_rate": 0.027674804455317337, + "loss": 0.8184, + "num_input_tokens_seen": 20387024, + "step": 35135 + }, + { + "epoch": 5.233839737861185, + "grad_norm": 0.0255126953125, + "learning_rate": 0.027673761699435147, + "loss": 0.7906, + "num_input_tokens_seen": 20390192, + "step": 35140 + }, + { + "epoch": 5.234584450402145, + "grad_norm": 0.0264892578125, + "learning_rate": 0.027672718729442634, + "loss": 0.7877, + "num_input_tokens_seen": 20393232, + "step": 35145 + }, + { + "epoch": 5.235329162943104, + "grad_norm": 0.028564453125, + "learning_rate": 0.027671675545357415, + "loss": 0.8172, + "num_input_tokens_seen": 20396368, + "step": 35150 + }, + { + "epoch": 5.236073875484063, + "grad_norm": 0.0252685546875, + "learning_rate": 0.02767063214719712, + "loss": 0.8171, + "num_input_tokens_seen": 20399280, + "step": 35155 + }, + { + "epoch": 5.236818588025022, + "grad_norm": 0.0245361328125, + "learning_rate": 0.027669588534979375, + "loss": 0.7927, + "num_input_tokens_seen": 20402256, + "step": 35160 + }, + { + "epoch": 5.237563300565982, + "grad_norm": 0.01422119140625, + "learning_rate": 0.027668544708721803, + "loss": 0.8278, + "num_input_tokens_seen": 20404880, + "step": 35165 + }, + { + "epoch": 5.23830801310694, + "grad_norm": 0.01251220703125, + "learning_rate": 0.02766750066844205, + "loss": 0.7992, + "num_input_tokens_seen": 20408016, + "step": 35170 + }, + { + "epoch": 5.2390527256479, + "grad_norm": 0.01422119140625, + "learning_rate": 0.027666456414157742, + "loss": 0.8021, + "num_input_tokens_seen": 20410832, + "step": 35175 + }, + { + "epoch": 5.239797438188859, + "grad_norm": 0.0179443359375, + "learning_rate": 0.027665411945886526, + "loss": 0.7914, + "num_input_tokens_seen": 20413520, + "step": 35180 + }, + { + "epoch": 5.240542150729818, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02766436726364605, + "loss": 0.8042, + "num_input_tokens_seen": 20416528, + "step": 35185 + }, + { + "epoch": 5.241286863270777, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02766332236745396, + "loss": 0.8001, + "num_input_tokens_seen": 20419504, + "step": 35190 + }, + { + "epoch": 5.242031575811737, + "grad_norm": 0.011474609375, + "learning_rate": 0.02766227725732791, + "loss": 0.8096, + "num_input_tokens_seen": 20422352, + "step": 35195 + }, + { + "epoch": 5.242776288352696, + "grad_norm": 0.023681640625, + "learning_rate": 0.027661231933285554, + "loss": 0.7905, + "num_input_tokens_seen": 20425328, + "step": 35200 + }, + { + "epoch": 5.243521000893655, + "grad_norm": 0.01324462890625, + "learning_rate": 0.027660186395344556, + "loss": 0.7958, + "num_input_tokens_seen": 20428272, + "step": 35205 + }, + { + "epoch": 5.244265713434614, + "grad_norm": 0.0311279296875, + "learning_rate": 0.027659140643522574, + "loss": 0.7801, + "num_input_tokens_seen": 20431280, + "step": 35210 + }, + { + "epoch": 5.245010425975574, + "grad_norm": 0.022216796875, + "learning_rate": 0.027658094677837277, + "loss": 0.7914, + "num_input_tokens_seen": 20433936, + "step": 35215 + }, + { + "epoch": 5.245755138516532, + "grad_norm": 0.01544189453125, + "learning_rate": 0.027657048498306334, + "loss": 0.7739, + "num_input_tokens_seen": 20437008, + "step": 35220 + }, + { + "epoch": 5.246499851057492, + "grad_norm": 0.0242919921875, + "learning_rate": 0.027656002104947425, + "loss": 0.8152, + "num_input_tokens_seen": 20439824, + "step": 35225 + }, + { + "epoch": 5.247244563598451, + "grad_norm": 0.022216796875, + "learning_rate": 0.02765495549777822, + "loss": 0.8087, + "num_input_tokens_seen": 20442640, + "step": 35230 + }, + { + "epoch": 5.2479892761394105, + "grad_norm": 0.0220947265625, + "learning_rate": 0.027653908676816403, + "loss": 0.7873, + "num_input_tokens_seen": 20445488, + "step": 35235 + }, + { + "epoch": 5.248733988680369, + "grad_norm": 0.0223388671875, + "learning_rate": 0.027652861642079665, + "loss": 0.7886, + "num_input_tokens_seen": 20448304, + "step": 35240 + }, + { + "epoch": 5.249478701221329, + "grad_norm": 0.025634765625, + "learning_rate": 0.027651814393585678, + "loss": 0.8002, + "num_input_tokens_seen": 20451440, + "step": 35245 + }, + { + "epoch": 5.250223413762288, + "grad_norm": 0.0242919921875, + "learning_rate": 0.027650766931352155, + "loss": 0.8342, + "num_input_tokens_seen": 20454480, + "step": 35250 + }, + { + "epoch": 5.250968126303247, + "grad_norm": 0.03955078125, + "learning_rate": 0.027649719255396783, + "loss": 0.8277, + "num_input_tokens_seen": 20457648, + "step": 35255 + }, + { + "epoch": 5.251712838844206, + "grad_norm": 0.029296875, + "learning_rate": 0.027648671365737256, + "loss": 0.7961, + "num_input_tokens_seen": 20460496, + "step": 35260 + }, + { + "epoch": 5.252457551385166, + "grad_norm": 0.01251220703125, + "learning_rate": 0.027647623262391285, + "loss": 0.7956, + "num_input_tokens_seen": 20463408, + "step": 35265 + }, + { + "epoch": 5.253202263926124, + "grad_norm": 0.0262451171875, + "learning_rate": 0.027646574945376572, + "loss": 0.8054, + "num_input_tokens_seen": 20466384, + "step": 35270 + }, + { + "epoch": 5.253946976467084, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02764552641471083, + "loss": 0.7912, + "num_input_tokens_seen": 20470064, + "step": 35275 + }, + { + "epoch": 5.254691689008043, + "grad_norm": 0.0196533203125, + "learning_rate": 0.027644477670411772, + "loss": 0.82, + "num_input_tokens_seen": 20473072, + "step": 35280 + }, + { + "epoch": 5.2554364015490025, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02764342871249711, + "loss": 0.7996, + "num_input_tokens_seen": 20476176, + "step": 35285 + }, + { + "epoch": 5.256181114089961, + "grad_norm": 0.01373291015625, + "learning_rate": 0.027642379540984575, + "loss": 0.7955, + "num_input_tokens_seen": 20479056, + "step": 35290 + }, + { + "epoch": 5.256925826630921, + "grad_norm": 0.0137939453125, + "learning_rate": 0.027641330155891885, + "loss": 0.7786, + "num_input_tokens_seen": 20481904, + "step": 35295 + }, + { + "epoch": 5.25767053917188, + "grad_norm": 0.0284423828125, + "learning_rate": 0.02764028055723677, + "loss": 0.8008, + "num_input_tokens_seen": 20484656, + "step": 35300 + }, + { + "epoch": 5.258415251712838, + "grad_norm": 0.0150146484375, + "learning_rate": 0.027639230745036963, + "loss": 0.7944, + "num_input_tokens_seen": 20487632, + "step": 35305 + }, + { + "epoch": 5.259159964253798, + "grad_norm": 0.02197265625, + "learning_rate": 0.0276381807193102, + "loss": 0.7829, + "num_input_tokens_seen": 20490448, + "step": 35310 + }, + { + "epoch": 5.259904676794757, + "grad_norm": 0.0257568359375, + "learning_rate": 0.02763713048007422, + "loss": 0.8146, + "num_input_tokens_seen": 20493168, + "step": 35315 + }, + { + "epoch": 5.2606493893357165, + "grad_norm": 0.0213623046875, + "learning_rate": 0.027636080027346757, + "loss": 0.7957, + "num_input_tokens_seen": 20496016, + "step": 35320 + }, + { + "epoch": 5.261394101876675, + "grad_norm": 0.0274658203125, + "learning_rate": 0.027635029361145567, + "loss": 0.7971, + "num_input_tokens_seen": 20498992, + "step": 35325 + }, + { + "epoch": 5.262138814417635, + "grad_norm": 0.015869140625, + "learning_rate": 0.0276339784814884, + "loss": 0.7858, + "num_input_tokens_seen": 20502256, + "step": 35330 + }, + { + "epoch": 5.262883526958594, + "grad_norm": 0.0213623046875, + "learning_rate": 0.027632927388393007, + "loss": 0.7988, + "num_input_tokens_seen": 20505328, + "step": 35335 + }, + { + "epoch": 5.263628239499553, + "grad_norm": 0.0201416015625, + "learning_rate": 0.027631876081877144, + "loss": 0.8046, + "num_input_tokens_seen": 20508272, + "step": 35340 + }, + { + "epoch": 5.264372952040512, + "grad_norm": 0.038818359375, + "learning_rate": 0.027630824561958575, + "loss": 0.8044, + "num_input_tokens_seen": 20511280, + "step": 35345 + }, + { + "epoch": 5.265117664581472, + "grad_norm": 0.01190185546875, + "learning_rate": 0.02762977282865506, + "loss": 0.7988, + "num_input_tokens_seen": 20514256, + "step": 35350 + }, + { + "epoch": 5.26586237712243, + "grad_norm": 0.0208740234375, + "learning_rate": 0.027628720881984373, + "loss": 0.7942, + "num_input_tokens_seen": 20517296, + "step": 35355 + }, + { + "epoch": 5.26660708966339, + "grad_norm": 0.0322265625, + "learning_rate": 0.027627668721964278, + "loss": 0.8107, + "num_input_tokens_seen": 20520304, + "step": 35360 + }, + { + "epoch": 5.267351802204349, + "grad_norm": 0.0196533203125, + "learning_rate": 0.027626616348612555, + "loss": 0.8043, + "num_input_tokens_seen": 20523248, + "step": 35365 + }, + { + "epoch": 5.2680965147453085, + "grad_norm": 0.0252685546875, + "learning_rate": 0.027625563761946987, + "loss": 0.7961, + "num_input_tokens_seen": 20526000, + "step": 35370 + }, + { + "epoch": 5.268841227286267, + "grad_norm": 0.01483154296875, + "learning_rate": 0.027624510961985344, + "loss": 0.8264, + "num_input_tokens_seen": 20528816, + "step": 35375 + }, + { + "epoch": 5.269585939827227, + "grad_norm": 0.01953125, + "learning_rate": 0.027623457948745423, + "loss": 0.7976, + "num_input_tokens_seen": 20531440, + "step": 35380 + }, + { + "epoch": 5.270330652368186, + "grad_norm": 0.0289306640625, + "learning_rate": 0.02762240472224501, + "loss": 0.8096, + "num_input_tokens_seen": 20534576, + "step": 35385 + }, + { + "epoch": 5.271075364909145, + "grad_norm": 0.0244140625, + "learning_rate": 0.027621351282501896, + "loss": 0.8029, + "num_input_tokens_seen": 20537328, + "step": 35390 + }, + { + "epoch": 5.271820077450104, + "grad_norm": 0.0286865234375, + "learning_rate": 0.02762029762953388, + "loss": 0.8015, + "num_input_tokens_seen": 20540272, + "step": 35395 + }, + { + "epoch": 5.272564789991064, + "grad_norm": 0.025390625, + "learning_rate": 0.027619243763358767, + "loss": 0.8137, + "num_input_tokens_seen": 20543088, + "step": 35400 + }, + { + "epoch": 5.2733095025320225, + "grad_norm": 0.02880859375, + "learning_rate": 0.027618189683994352, + "loss": 0.7914, + "num_input_tokens_seen": 20546032, + "step": 35405 + }, + { + "epoch": 5.274054215072982, + "grad_norm": 0.020751953125, + "learning_rate": 0.027617135391458447, + "loss": 0.812, + "num_input_tokens_seen": 20548784, + "step": 35410 + }, + { + "epoch": 5.274798927613941, + "grad_norm": 0.0272216796875, + "learning_rate": 0.027616080885768862, + "loss": 0.7975, + "num_input_tokens_seen": 20551856, + "step": 35415 + }, + { + "epoch": 5.2755436401549005, + "grad_norm": 0.020751953125, + "learning_rate": 0.027615026166943415, + "loss": 0.7968, + "num_input_tokens_seen": 20554384, + "step": 35420 + }, + { + "epoch": 5.276288352695859, + "grad_norm": 0.0115966796875, + "learning_rate": 0.02761397123499992, + "loss": 0.7827, + "num_input_tokens_seen": 20557264, + "step": 35425 + }, + { + "epoch": 5.277033065236819, + "grad_norm": 0.0284423828125, + "learning_rate": 0.027612916089956203, + "loss": 0.8072, + "num_input_tokens_seen": 20560016, + "step": 35430 + }, + { + "epoch": 5.277777777777778, + "grad_norm": 0.0269775390625, + "learning_rate": 0.027611860731830088, + "loss": 0.8045, + "num_input_tokens_seen": 20563216, + "step": 35435 + }, + { + "epoch": 5.278522490318737, + "grad_norm": 0.027099609375, + "learning_rate": 0.027610805160639403, + "loss": 0.8002, + "num_input_tokens_seen": 20566000, + "step": 35440 + }, + { + "epoch": 5.279267202859696, + "grad_norm": 0.016845703125, + "learning_rate": 0.027609749376401985, + "loss": 0.7845, + "num_input_tokens_seen": 20568752, + "step": 35445 + }, + { + "epoch": 5.280011915400656, + "grad_norm": 0.01275634765625, + "learning_rate": 0.027608693379135664, + "loss": 0.8039, + "num_input_tokens_seen": 20571664, + "step": 35450 + }, + { + "epoch": 5.2807566279416145, + "grad_norm": 0.0205078125, + "learning_rate": 0.027607637168858286, + "loss": 0.8119, + "num_input_tokens_seen": 20574448, + "step": 35455 + }, + { + "epoch": 5.281501340482574, + "grad_norm": 0.024658203125, + "learning_rate": 0.027606580745587687, + "loss": 0.8328, + "num_input_tokens_seen": 20577616, + "step": 35460 + }, + { + "epoch": 5.282246053023533, + "grad_norm": 0.0120849609375, + "learning_rate": 0.027605524109341723, + "loss": 0.818, + "num_input_tokens_seen": 20580848, + "step": 35465 + }, + { + "epoch": 5.282990765564492, + "grad_norm": 0.018310546875, + "learning_rate": 0.027604467260138238, + "loss": 0.795, + "num_input_tokens_seen": 20583760, + "step": 35470 + }, + { + "epoch": 5.283735478105451, + "grad_norm": 0.028564453125, + "learning_rate": 0.027603410197995094, + "loss": 0.8034, + "num_input_tokens_seen": 20586768, + "step": 35475 + }, + { + "epoch": 5.284480190646411, + "grad_norm": 0.0257568359375, + "learning_rate": 0.027602352922930135, + "loss": 0.802, + "num_input_tokens_seen": 20589712, + "step": 35480 + }, + { + "epoch": 5.28522490318737, + "grad_norm": 0.035888671875, + "learning_rate": 0.027601295434961245, + "loss": 0.7972, + "num_input_tokens_seen": 20592368, + "step": 35485 + }, + { + "epoch": 5.2859696157283285, + "grad_norm": 0.047607421875, + "learning_rate": 0.02760023773410627, + "loss": 0.8105, + "num_input_tokens_seen": 20595344, + "step": 35490 + }, + { + "epoch": 5.286714328269288, + "grad_norm": 0.0203857421875, + "learning_rate": 0.027599179820383082, + "loss": 0.7914, + "num_input_tokens_seen": 20598384, + "step": 35495 + }, + { + "epoch": 5.287459040810247, + "grad_norm": 0.0172119140625, + "learning_rate": 0.027598121693809558, + "loss": 0.8035, + "num_input_tokens_seen": 20601136, + "step": 35500 + }, + { + "epoch": 5.2882037533512065, + "grad_norm": 0.01226806640625, + "learning_rate": 0.027597063354403574, + "loss": 0.8213, + "num_input_tokens_seen": 20604176, + "step": 35505 + }, + { + "epoch": 5.288948465892165, + "grad_norm": 0.0196533203125, + "learning_rate": 0.027596004802183, + "loss": 0.8166, + "num_input_tokens_seen": 20607312, + "step": 35510 + }, + { + "epoch": 5.289693178433125, + "grad_norm": 0.033203125, + "learning_rate": 0.027594946037165738, + "loss": 0.7933, + "num_input_tokens_seen": 20610000, + "step": 35515 + }, + { + "epoch": 5.290437890974084, + "grad_norm": 0.03076171875, + "learning_rate": 0.027593887059369656, + "loss": 0.7901, + "num_input_tokens_seen": 20612880, + "step": 35520 + }, + { + "epoch": 5.291182603515043, + "grad_norm": 0.0211181640625, + "learning_rate": 0.027592827868812654, + "loss": 0.7897, + "num_input_tokens_seen": 20616048, + "step": 35525 + }, + { + "epoch": 5.291927316056002, + "grad_norm": 0.0198974609375, + "learning_rate": 0.027591768465512624, + "loss": 0.7916, + "num_input_tokens_seen": 20618832, + "step": 35530 + }, + { + "epoch": 5.292672028596962, + "grad_norm": 0.0189208984375, + "learning_rate": 0.027590708849487465, + "loss": 0.7951, + "num_input_tokens_seen": 20621552, + "step": 35535 + }, + { + "epoch": 5.2934167411379205, + "grad_norm": 0.01806640625, + "learning_rate": 0.027589649020755076, + "loss": 0.7961, + "num_input_tokens_seen": 20624624, + "step": 35540 + }, + { + "epoch": 5.29416145367888, + "grad_norm": 0.01904296875, + "learning_rate": 0.02758858897933336, + "loss": 0.7829, + "num_input_tokens_seen": 20627696, + "step": 35545 + }, + { + "epoch": 5.294906166219839, + "grad_norm": 0.01251220703125, + "learning_rate": 0.027587528725240233, + "loss": 0.808, + "num_input_tokens_seen": 20630704, + "step": 35550 + }, + { + "epoch": 5.2956508787607985, + "grad_norm": 0.020751953125, + "learning_rate": 0.027586468258493597, + "loss": 0.7798, + "num_input_tokens_seen": 20633520, + "step": 35555 + }, + { + "epoch": 5.296395591301757, + "grad_norm": 0.0206298828125, + "learning_rate": 0.027585407579111373, + "loss": 0.808, + "num_input_tokens_seen": 20636336, + "step": 35560 + }, + { + "epoch": 5.297140303842717, + "grad_norm": 0.01336669921875, + "learning_rate": 0.027584346687111478, + "loss": 0.8142, + "num_input_tokens_seen": 20639120, + "step": 35565 + }, + { + "epoch": 5.297885016383676, + "grad_norm": 0.024658203125, + "learning_rate": 0.027583285582511836, + "loss": 0.8016, + "num_input_tokens_seen": 20641904, + "step": 35570 + }, + { + "epoch": 5.298629728924635, + "grad_norm": 0.012939453125, + "learning_rate": 0.027582224265330378, + "loss": 0.7945, + "num_input_tokens_seen": 20645008, + "step": 35575 + }, + { + "epoch": 5.299374441465594, + "grad_norm": 0.01300048828125, + "learning_rate": 0.027581162735585023, + "loss": 0.8019, + "num_input_tokens_seen": 20647984, + "step": 35580 + }, + { + "epoch": 5.300119154006554, + "grad_norm": 0.02783203125, + "learning_rate": 0.027580100993293714, + "loss": 0.7742, + "num_input_tokens_seen": 20650928, + "step": 35585 + }, + { + "epoch": 5.3008638665475125, + "grad_norm": 0.0191650390625, + "learning_rate": 0.027579039038474387, + "loss": 0.8067, + "num_input_tokens_seen": 20653712, + "step": 35590 + }, + { + "epoch": 5.301608579088472, + "grad_norm": 0.0228271484375, + "learning_rate": 0.027577976871144976, + "loss": 0.7782, + "num_input_tokens_seen": 20656560, + "step": 35595 + }, + { + "epoch": 5.302353291629431, + "grad_norm": 0.013427734375, + "learning_rate": 0.027576914491323427, + "loss": 0.8178, + "num_input_tokens_seen": 20659344, + "step": 35600 + }, + { + "epoch": 5.303098004170391, + "grad_norm": 0.0186767578125, + "learning_rate": 0.027575851899027694, + "loss": 0.8241, + "num_input_tokens_seen": 20662416, + "step": 35605 + }, + { + "epoch": 5.303842716711349, + "grad_norm": 0.0263671875, + "learning_rate": 0.027574789094275725, + "loss": 0.8115, + "num_input_tokens_seen": 20665296, + "step": 35610 + }, + { + "epoch": 5.304587429252309, + "grad_norm": 0.0185546875, + "learning_rate": 0.027573726077085473, + "loss": 0.804, + "num_input_tokens_seen": 20667920, + "step": 35615 + }, + { + "epoch": 5.305332141793268, + "grad_norm": 0.01275634765625, + "learning_rate": 0.0275726628474749, + "loss": 0.8079, + "num_input_tokens_seen": 20671120, + "step": 35620 + }, + { + "epoch": 5.306076854334227, + "grad_norm": 0.0230712890625, + "learning_rate": 0.027571599405461965, + "loss": 0.8074, + "num_input_tokens_seen": 20674096, + "step": 35625 + }, + { + "epoch": 5.306821566875186, + "grad_norm": 0.018310546875, + "learning_rate": 0.027570535751064634, + "loss": 0.7812, + "num_input_tokens_seen": 20676784, + "step": 35630 + }, + { + "epoch": 5.307566279416146, + "grad_norm": 0.0208740234375, + "learning_rate": 0.02756947188430088, + "loss": 0.8056, + "num_input_tokens_seen": 20679824, + "step": 35635 + }, + { + "epoch": 5.3083109919571045, + "grad_norm": 0.03369140625, + "learning_rate": 0.027568407805188672, + "loss": 0.7733, + "num_input_tokens_seen": 20682544, + "step": 35640 + }, + { + "epoch": 5.309055704498064, + "grad_norm": 0.0184326171875, + "learning_rate": 0.027567343513745988, + "loss": 0.7909, + "num_input_tokens_seen": 20685488, + "step": 35645 + }, + { + "epoch": 5.309800417039023, + "grad_norm": 0.0263671875, + "learning_rate": 0.027566279009990813, + "loss": 0.7938, + "num_input_tokens_seen": 20688304, + "step": 35650 + }, + { + "epoch": 5.310545129579982, + "grad_norm": 0.031982421875, + "learning_rate": 0.027565214293941123, + "loss": 0.8038, + "num_input_tokens_seen": 20691024, + "step": 35655 + }, + { + "epoch": 5.311289842120941, + "grad_norm": 0.00982666015625, + "learning_rate": 0.027564149365614903, + "loss": 0.7897, + "num_input_tokens_seen": 20693840, + "step": 35660 + }, + { + "epoch": 5.3120345546619, + "grad_norm": 0.01708984375, + "learning_rate": 0.02756308422503016, + "loss": 0.7993, + "num_input_tokens_seen": 20696816, + "step": 35665 + }, + { + "epoch": 5.31277926720286, + "grad_norm": 0.0240478515625, + "learning_rate": 0.027562018872204868, + "loss": 0.756, + "num_input_tokens_seen": 20699856, + "step": 35670 + }, + { + "epoch": 5.3135239797438185, + "grad_norm": 0.0252685546875, + "learning_rate": 0.027560953307157035, + "loss": 0.7988, + "num_input_tokens_seen": 20702544, + "step": 35675 + }, + { + "epoch": 5.314268692284778, + "grad_norm": 0.0225830078125, + "learning_rate": 0.027559887529904667, + "loss": 0.8201, + "num_input_tokens_seen": 20705264, + "step": 35680 + }, + { + "epoch": 5.315013404825737, + "grad_norm": 0.031982421875, + "learning_rate": 0.027558821540465763, + "loss": 0.7985, + "num_input_tokens_seen": 20708080, + "step": 35685 + }, + { + "epoch": 5.315758117366697, + "grad_norm": 0.021240234375, + "learning_rate": 0.027557755338858337, + "loss": 0.7916, + "num_input_tokens_seen": 20710864, + "step": 35690 + }, + { + "epoch": 5.316502829907655, + "grad_norm": 0.0308837890625, + "learning_rate": 0.027556688925100388, + "loss": 0.8157, + "num_input_tokens_seen": 20713840, + "step": 35695 + }, + { + "epoch": 5.317247542448615, + "grad_norm": 0.03369140625, + "learning_rate": 0.02755562229920995, + "loss": 0.8263, + "num_input_tokens_seen": 20716528, + "step": 35700 + }, + { + "epoch": 5.317992254989574, + "grad_norm": 0.0269775390625, + "learning_rate": 0.02755455546120503, + "loss": 0.8125, + "num_input_tokens_seen": 20719216, + "step": 35705 + }, + { + "epoch": 5.318736967530533, + "grad_norm": 0.0225830078125, + "learning_rate": 0.027553488411103656, + "loss": 0.7879, + "num_input_tokens_seen": 20721904, + "step": 35710 + }, + { + "epoch": 5.319481680071492, + "grad_norm": 0.0228271484375, + "learning_rate": 0.027552421148923856, + "loss": 0.8313, + "num_input_tokens_seen": 20725072, + "step": 35715 + }, + { + "epoch": 5.320226392612452, + "grad_norm": 0.022216796875, + "learning_rate": 0.027551353674683657, + "loss": 0.8112, + "num_input_tokens_seen": 20727760, + "step": 35720 + }, + { + "epoch": 5.3209711051534105, + "grad_norm": 0.0274658203125, + "learning_rate": 0.027550285988401094, + "loss": 0.7944, + "num_input_tokens_seen": 20730992, + "step": 35725 + }, + { + "epoch": 5.32171581769437, + "grad_norm": 0.01190185546875, + "learning_rate": 0.027549218090094207, + "loss": 0.8135, + "num_input_tokens_seen": 20733584, + "step": 35730 + }, + { + "epoch": 5.322460530235329, + "grad_norm": 0.020751953125, + "learning_rate": 0.02754814997978103, + "loss": 0.8119, + "num_input_tokens_seen": 20736784, + "step": 35735 + }, + { + "epoch": 5.323205242776289, + "grad_norm": 0.018798828125, + "learning_rate": 0.027547081657479617, + "loss": 0.7783, + "num_input_tokens_seen": 20739856, + "step": 35740 + }, + { + "epoch": 5.323949955317247, + "grad_norm": 0.0228271484375, + "learning_rate": 0.02754601312320801, + "loss": 0.8063, + "num_input_tokens_seen": 20742928, + "step": 35745 + }, + { + "epoch": 5.324694667858207, + "grad_norm": 0.01263427734375, + "learning_rate": 0.02754494437698426, + "loss": 0.7837, + "num_input_tokens_seen": 20746096, + "step": 35750 + }, + { + "epoch": 5.325439380399166, + "grad_norm": 0.0283203125, + "learning_rate": 0.02754387541882643, + "loss": 0.7864, + "num_input_tokens_seen": 20749008, + "step": 35755 + }, + { + "epoch": 5.326184092940125, + "grad_norm": 0.024169921875, + "learning_rate": 0.02754280624875257, + "loss": 0.7951, + "num_input_tokens_seen": 20752016, + "step": 35760 + }, + { + "epoch": 5.326928805481084, + "grad_norm": 0.021240234375, + "learning_rate": 0.02754173686678075, + "loss": 0.8162, + "num_input_tokens_seen": 20755248, + "step": 35765 + }, + { + "epoch": 5.327673518022044, + "grad_norm": 0.0211181640625, + "learning_rate": 0.027540667272929028, + "loss": 0.826, + "num_input_tokens_seen": 20758224, + "step": 35770 + }, + { + "epoch": 5.328418230563003, + "grad_norm": 0.019287109375, + "learning_rate": 0.02753959746721548, + "loss": 0.8009, + "num_input_tokens_seen": 20761232, + "step": 35775 + }, + { + "epoch": 5.329162943103962, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02753852744965818, + "loss": 0.8101, + "num_input_tokens_seen": 20764272, + "step": 35780 + }, + { + "epoch": 5.329907655644921, + "grad_norm": 0.0201416015625, + "learning_rate": 0.0275374572202752, + "loss": 0.8004, + "num_input_tokens_seen": 20767024, + "step": 35785 + }, + { + "epoch": 5.330652368185881, + "grad_norm": 0.031982421875, + "learning_rate": 0.027536386779084623, + "loss": 0.8086, + "num_input_tokens_seen": 20769968, + "step": 35790 + }, + { + "epoch": 5.331397080726839, + "grad_norm": 0.0133056640625, + "learning_rate": 0.02753531612610453, + "loss": 0.7994, + "num_input_tokens_seen": 20772816, + "step": 35795 + }, + { + "epoch": 5.332141793267799, + "grad_norm": 0.02001953125, + "learning_rate": 0.027534245261353018, + "loss": 0.7982, + "num_input_tokens_seen": 20775568, + "step": 35800 + }, + { + "epoch": 5.332886505808758, + "grad_norm": 0.01953125, + "learning_rate": 0.027533174184848165, + "loss": 0.8039, + "num_input_tokens_seen": 20778384, + "step": 35805 + }, + { + "epoch": 5.333631218349717, + "grad_norm": 0.032958984375, + "learning_rate": 0.02753210289660808, + "loss": 0.7986, + "num_input_tokens_seen": 20780944, + "step": 35810 + }, + { + "epoch": 5.334375930890676, + "grad_norm": 0.0308837890625, + "learning_rate": 0.027531031396650845, + "loss": 0.785, + "num_input_tokens_seen": 20783792, + "step": 35815 + }, + { + "epoch": 5.335120643431635, + "grad_norm": 0.033935546875, + "learning_rate": 0.027529959684994578, + "loss": 0.8152, + "num_input_tokens_seen": 20786704, + "step": 35820 + }, + { + "epoch": 5.335865355972595, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02752888776165737, + "loss": 0.8023, + "num_input_tokens_seen": 20789904, + "step": 35825 + }, + { + "epoch": 5.336610068513554, + "grad_norm": 0.0308837890625, + "learning_rate": 0.027527815626657338, + "loss": 0.7955, + "num_input_tokens_seen": 20792912, + "step": 35830 + }, + { + "epoch": 5.337354781054513, + "grad_norm": 0.0286865234375, + "learning_rate": 0.027526743280012596, + "loss": 0.8002, + "num_input_tokens_seen": 20795728, + "step": 35835 + }, + { + "epoch": 5.338099493595472, + "grad_norm": 0.0228271484375, + "learning_rate": 0.02752567072174126, + "loss": 0.7873, + "num_input_tokens_seen": 20798480, + "step": 35840 + }, + { + "epoch": 5.338844206136431, + "grad_norm": 0.01904296875, + "learning_rate": 0.027524597951861445, + "loss": 0.8023, + "num_input_tokens_seen": 20801552, + "step": 35845 + }, + { + "epoch": 5.33958891867739, + "grad_norm": 0.0130615234375, + "learning_rate": 0.02752352497039128, + "loss": 0.7978, + "num_input_tokens_seen": 20804496, + "step": 35850 + }, + { + "epoch": 5.34033363121835, + "grad_norm": 0.0208740234375, + "learning_rate": 0.027522451777348883, + "loss": 0.8413, + "num_input_tokens_seen": 20807472, + "step": 35855 + }, + { + "epoch": 5.341078343759309, + "grad_norm": 0.01806640625, + "learning_rate": 0.027521378372752394, + "loss": 0.7997, + "num_input_tokens_seen": 20810448, + "step": 35860 + }, + { + "epoch": 5.341823056300268, + "grad_norm": 0.0135498046875, + "learning_rate": 0.027520304756619946, + "loss": 0.8135, + "num_input_tokens_seen": 20813488, + "step": 35865 + }, + { + "epoch": 5.342567768841227, + "grad_norm": 0.01177978515625, + "learning_rate": 0.027519230928969674, + "loss": 0.8294, + "num_input_tokens_seen": 20816208, + "step": 35870 + }, + { + "epoch": 5.343312481382187, + "grad_norm": 0.0118408203125, + "learning_rate": 0.027518156889819716, + "loss": 0.7882, + "num_input_tokens_seen": 20819120, + "step": 35875 + }, + { + "epoch": 5.344057193923145, + "grad_norm": 0.0203857421875, + "learning_rate": 0.027517082639188218, + "loss": 0.8223, + "num_input_tokens_seen": 20822128, + "step": 35880 + }, + { + "epoch": 5.344801906464105, + "grad_norm": 0.019775390625, + "learning_rate": 0.027516008177093335, + "loss": 0.7983, + "num_input_tokens_seen": 20825168, + "step": 35885 + }, + { + "epoch": 5.345546619005064, + "grad_norm": 0.0216064453125, + "learning_rate": 0.027514933503553215, + "loss": 0.8238, + "num_input_tokens_seen": 20827824, + "step": 35890 + }, + { + "epoch": 5.346291331546023, + "grad_norm": 0.02001953125, + "learning_rate": 0.02751385861858601, + "loss": 0.8079, + "num_input_tokens_seen": 20830576, + "step": 35895 + }, + { + "epoch": 5.347036044086982, + "grad_norm": 0.0128173828125, + "learning_rate": 0.027512783522209885, + "loss": 0.7932, + "num_input_tokens_seen": 20833328, + "step": 35900 + }, + { + "epoch": 5.347780756627942, + "grad_norm": 0.0145263671875, + "learning_rate": 0.027511708214442997, + "loss": 0.8097, + "num_input_tokens_seen": 20836208, + "step": 35905 + }, + { + "epoch": 5.348525469168901, + "grad_norm": 0.0196533203125, + "learning_rate": 0.027510632695303516, + "loss": 0.8046, + "num_input_tokens_seen": 20838896, + "step": 35910 + }, + { + "epoch": 5.34927018170986, + "grad_norm": 0.01544189453125, + "learning_rate": 0.02750955696480961, + "loss": 0.7995, + "num_input_tokens_seen": 20842128, + "step": 35915 + }, + { + "epoch": 5.350014894250819, + "grad_norm": 0.0130615234375, + "learning_rate": 0.027508481022979456, + "loss": 0.8063, + "num_input_tokens_seen": 20845104, + "step": 35920 + }, + { + "epoch": 5.350759606791779, + "grad_norm": 0.0123291015625, + "learning_rate": 0.027507404869831227, + "loss": 0.8113, + "num_input_tokens_seen": 20848144, + "step": 35925 + }, + { + "epoch": 5.351504319332737, + "grad_norm": 0.0191650390625, + "learning_rate": 0.027506328505383103, + "loss": 0.7926, + "num_input_tokens_seen": 20851088, + "step": 35930 + }, + { + "epoch": 5.352249031873697, + "grad_norm": 0.028076171875, + "learning_rate": 0.027505251929653272, + "loss": 0.7909, + "num_input_tokens_seen": 20853968, + "step": 35935 + }, + { + "epoch": 5.352993744414656, + "grad_norm": 0.020263671875, + "learning_rate": 0.02750417514265992, + "loss": 0.802, + "num_input_tokens_seen": 20856944, + "step": 35940 + }, + { + "epoch": 5.3537384569556155, + "grad_norm": 0.0260009765625, + "learning_rate": 0.02750309814442124, + "loss": 0.7984, + "num_input_tokens_seen": 20859856, + "step": 35945 + }, + { + "epoch": 5.354483169496574, + "grad_norm": 0.01287841796875, + "learning_rate": 0.027502020934955417, + "loss": 0.7989, + "num_input_tokens_seen": 20862608, + "step": 35950 + }, + { + "epoch": 5.355227882037534, + "grad_norm": 0.0186767578125, + "learning_rate": 0.02750094351428066, + "loss": 0.7986, + "num_input_tokens_seen": 20865616, + "step": 35955 + }, + { + "epoch": 5.355972594578493, + "grad_norm": 0.0211181640625, + "learning_rate": 0.02749986588241517, + "loss": 0.8019, + "num_input_tokens_seen": 20868368, + "step": 35960 + }, + { + "epoch": 5.356717307119452, + "grad_norm": 0.022216796875, + "learning_rate": 0.027498788039377147, + "loss": 0.7947, + "num_input_tokens_seen": 20871568, + "step": 35965 + }, + { + "epoch": 5.357462019660411, + "grad_norm": 0.0269775390625, + "learning_rate": 0.027497709985184804, + "loss": 0.7733, + "num_input_tokens_seen": 20874320, + "step": 35970 + }, + { + "epoch": 5.358206732201371, + "grad_norm": 0.0257568359375, + "learning_rate": 0.027496631719856352, + "loss": 0.83, + "num_input_tokens_seen": 20877296, + "step": 35975 + }, + { + "epoch": 5.358951444742329, + "grad_norm": 0.019775390625, + "learning_rate": 0.027495553243410013, + "loss": 0.8156, + "num_input_tokens_seen": 20879984, + "step": 35980 + }, + { + "epoch": 5.359696157283288, + "grad_norm": 0.013427734375, + "learning_rate": 0.027494474555864002, + "loss": 0.8026, + "num_input_tokens_seen": 20882896, + "step": 35985 + }, + { + "epoch": 5.360440869824248, + "grad_norm": 0.02978515625, + "learning_rate": 0.027493395657236536, + "loss": 0.8004, + "num_input_tokens_seen": 20885968, + "step": 35990 + }, + { + "epoch": 5.3611855823652075, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02749231654754585, + "loss": 0.7979, + "num_input_tokens_seen": 20888784, + "step": 35995 + }, + { + "epoch": 5.361930294906166, + "grad_norm": 0.01953125, + "learning_rate": 0.027491237226810173, + "loss": 0.8004, + "num_input_tokens_seen": 20891888, + "step": 36000 + }, + { + "epoch": 5.362675007447125, + "grad_norm": 0.0125732421875, + "learning_rate": 0.027490157695047743, + "loss": 0.8089, + "num_input_tokens_seen": 20894672, + "step": 36005 + }, + { + "epoch": 5.363419719988085, + "grad_norm": 0.021240234375, + "learning_rate": 0.02748907795227679, + "loss": 0.8141, + "num_input_tokens_seen": 20897712, + "step": 36010 + }, + { + "epoch": 5.364164432529043, + "grad_norm": 0.0213623046875, + "learning_rate": 0.027487997998515555, + "loss": 0.7844, + "num_input_tokens_seen": 20900688, + "step": 36015 + }, + { + "epoch": 5.364909145070003, + "grad_norm": 0.01123046875, + "learning_rate": 0.027486917833782286, + "loss": 0.7876, + "num_input_tokens_seen": 20903632, + "step": 36020 + }, + { + "epoch": 5.365653857610962, + "grad_norm": 0.01904296875, + "learning_rate": 0.027485837458095234, + "loss": 0.8159, + "num_input_tokens_seen": 20906448, + "step": 36025 + }, + { + "epoch": 5.3663985701519215, + "grad_norm": 0.01165771484375, + "learning_rate": 0.027484756871472654, + "loss": 0.8023, + "num_input_tokens_seen": 20909648, + "step": 36030 + }, + { + "epoch": 5.36714328269288, + "grad_norm": 0.0185546875, + "learning_rate": 0.027483676073932786, + "loss": 0.8054, + "num_input_tokens_seen": 20912688, + "step": 36035 + }, + { + "epoch": 5.36788799523384, + "grad_norm": 0.0213623046875, + "learning_rate": 0.027482595065493904, + "loss": 0.7901, + "num_input_tokens_seen": 20915536, + "step": 36040 + }, + { + "epoch": 5.368632707774799, + "grad_norm": 0.024658203125, + "learning_rate": 0.02748151384617426, + "loss": 0.8387, + "num_input_tokens_seen": 20918416, + "step": 36045 + }, + { + "epoch": 5.369377420315758, + "grad_norm": 0.0177001953125, + "learning_rate": 0.02748043241599213, + "loss": 0.8174, + "num_input_tokens_seen": 20921104, + "step": 36050 + }, + { + "epoch": 5.370122132856717, + "grad_norm": 0.018798828125, + "learning_rate": 0.02747935077496578, + "loss": 0.8222, + "num_input_tokens_seen": 20924400, + "step": 36055 + }, + { + "epoch": 5.370866845397677, + "grad_norm": 0.019287109375, + "learning_rate": 0.027478268923113475, + "loss": 0.804, + "num_input_tokens_seen": 20927440, + "step": 36060 + }, + { + "epoch": 5.371611557938635, + "grad_norm": 0.0130615234375, + "learning_rate": 0.027477186860453508, + "loss": 0.7977, + "num_input_tokens_seen": 20930192, + "step": 36065 + }, + { + "epoch": 5.372356270479595, + "grad_norm": 0.01153564453125, + "learning_rate": 0.027476104587004144, + "loss": 0.8328, + "num_input_tokens_seen": 20933136, + "step": 36070 + }, + { + "epoch": 5.373100983020554, + "grad_norm": 0.0184326171875, + "learning_rate": 0.027475022102783676, + "loss": 0.7978, + "num_input_tokens_seen": 20936048, + "step": 36075 + }, + { + "epoch": 5.3738456955615135, + "grad_norm": 0.0172119140625, + "learning_rate": 0.027473939407810385, + "loss": 0.7902, + "num_input_tokens_seen": 20939120, + "step": 36080 + }, + { + "epoch": 5.374590408102472, + "grad_norm": 0.0115966796875, + "learning_rate": 0.027472856502102568, + "loss": 0.7824, + "num_input_tokens_seen": 20941808, + "step": 36085 + }, + { + "epoch": 5.375335120643432, + "grad_norm": 0.018310546875, + "learning_rate": 0.02747177338567852, + "loss": 0.7999, + "num_input_tokens_seen": 20944624, + "step": 36090 + }, + { + "epoch": 5.376079833184391, + "grad_norm": 0.018798828125, + "learning_rate": 0.027470690058556534, + "loss": 0.8008, + "num_input_tokens_seen": 20947568, + "step": 36095 + }, + { + "epoch": 5.37682454572535, + "grad_norm": 0.01092529296875, + "learning_rate": 0.027469606520754912, + "loss": 0.801, + "num_input_tokens_seen": 20950416, + "step": 36100 + }, + { + "epoch": 5.377569258266309, + "grad_norm": 0.0206298828125, + "learning_rate": 0.027468522772291965, + "loss": 0.8096, + "num_input_tokens_seen": 20953360, + "step": 36105 + }, + { + "epoch": 5.378313970807269, + "grad_norm": 0.0260009765625, + "learning_rate": 0.027467438813185995, + "loss": 0.7822, + "num_input_tokens_seen": 20956144, + "step": 36110 + }, + { + "epoch": 5.3790586833482275, + "grad_norm": 0.013916015625, + "learning_rate": 0.027466354643455322, + "loss": 0.8018, + "num_input_tokens_seen": 20958928, + "step": 36115 + }, + { + "epoch": 5.379803395889187, + "grad_norm": 0.024658203125, + "learning_rate": 0.02746527026311825, + "loss": 0.7901, + "num_input_tokens_seen": 20961904, + "step": 36120 + }, + { + "epoch": 5.380548108430146, + "grad_norm": 0.0189208984375, + "learning_rate": 0.027464185672193112, + "loss": 0.8087, + "num_input_tokens_seen": 20964592, + "step": 36125 + }, + { + "epoch": 5.3812928209711055, + "grad_norm": 0.019775390625, + "learning_rate": 0.027463100870698226, + "loss": 0.7937, + "num_input_tokens_seen": 20967376, + "step": 36130 + }, + { + "epoch": 5.382037533512064, + "grad_norm": 0.013916015625, + "learning_rate": 0.027462015858651913, + "loss": 0.7957, + "num_input_tokens_seen": 20970352, + "step": 36135 + }, + { + "epoch": 5.382782246053024, + "grad_norm": 0.024658203125, + "learning_rate": 0.027460930636072512, + "loss": 0.8044, + "num_input_tokens_seen": 20973168, + "step": 36140 + }, + { + "epoch": 5.383526958593983, + "grad_norm": 0.01300048828125, + "learning_rate": 0.027459845202978353, + "loss": 0.7801, + "num_input_tokens_seen": 20976304, + "step": 36145 + }, + { + "epoch": 5.384271671134942, + "grad_norm": 0.033203125, + "learning_rate": 0.027458759559387768, + "loss": 0.8004, + "num_input_tokens_seen": 20979376, + "step": 36150 + }, + { + "epoch": 5.385016383675901, + "grad_norm": 0.0201416015625, + "learning_rate": 0.027457673705319104, + "loss": 0.8046, + "num_input_tokens_seen": 20982160, + "step": 36155 + }, + { + "epoch": 5.385761096216861, + "grad_norm": 0.017822265625, + "learning_rate": 0.027456587640790706, + "loss": 0.7977, + "num_input_tokens_seen": 20984880, + "step": 36160 + }, + { + "epoch": 5.3865058087578195, + "grad_norm": 0.02099609375, + "learning_rate": 0.027455501365820922, + "loss": 0.7933, + "num_input_tokens_seen": 20987696, + "step": 36165 + }, + { + "epoch": 5.387250521298778, + "grad_norm": 0.0283203125, + "learning_rate": 0.0274544148804281, + "loss": 0.7756, + "num_input_tokens_seen": 20990704, + "step": 36170 + }, + { + "epoch": 5.387995233839738, + "grad_norm": 0.01953125, + "learning_rate": 0.027453328184630598, + "loss": 0.8094, + "num_input_tokens_seen": 20993680, + "step": 36175 + }, + { + "epoch": 5.388739946380697, + "grad_norm": 0.038818359375, + "learning_rate": 0.027452241278446768, + "loss": 0.809, + "num_input_tokens_seen": 20996816, + "step": 36180 + }, + { + "epoch": 5.389484658921656, + "grad_norm": 0.010498046875, + "learning_rate": 0.027451154161894982, + "loss": 0.7828, + "num_input_tokens_seen": 20999568, + "step": 36185 + }, + { + "epoch": 5.390229371462615, + "grad_norm": 0.01446533203125, + "learning_rate": 0.0274500668349936, + "loss": 0.792, + "num_input_tokens_seen": 21002256, + "step": 36190 + }, + { + "epoch": 5.390974084003575, + "grad_norm": 0.02734375, + "learning_rate": 0.027448979297760995, + "loss": 0.781, + "num_input_tokens_seen": 21005040, + "step": 36195 + }, + { + "epoch": 5.3917187965445335, + "grad_norm": 0.0201416015625, + "learning_rate": 0.027447891550215535, + "loss": 0.8235, + "num_input_tokens_seen": 21008208, + "step": 36200 + }, + { + "epoch": 5.392463509085493, + "grad_norm": 0.0181884765625, + "learning_rate": 0.027446803592375595, + "loss": 0.8344, + "num_input_tokens_seen": 21011312, + "step": 36205 + }, + { + "epoch": 5.393208221626452, + "grad_norm": 0.0120849609375, + "learning_rate": 0.027445715424259564, + "loss": 0.8026, + "num_input_tokens_seen": 21014192, + "step": 36210 + }, + { + "epoch": 5.3939529341674115, + "grad_norm": 0.0240478515625, + "learning_rate": 0.027444627045885822, + "loss": 0.8084, + "num_input_tokens_seen": 21017200, + "step": 36215 + }, + { + "epoch": 5.39469764670837, + "grad_norm": 0.0235595703125, + "learning_rate": 0.02744353845727275, + "loss": 0.7957, + "num_input_tokens_seen": 21019856, + "step": 36220 + }, + { + "epoch": 5.39544235924933, + "grad_norm": 0.013671875, + "learning_rate": 0.027442449658438745, + "loss": 0.8156, + "num_input_tokens_seen": 21022800, + "step": 36225 + }, + { + "epoch": 5.396187071790289, + "grad_norm": 0.0179443359375, + "learning_rate": 0.027441360649402197, + "loss": 0.8113, + "num_input_tokens_seen": 21025840, + "step": 36230 + }, + { + "epoch": 5.396931784331248, + "grad_norm": 0.01104736328125, + "learning_rate": 0.027440271430181507, + "loss": 0.7809, + "num_input_tokens_seen": 21028752, + "step": 36235 + }, + { + "epoch": 5.397676496872207, + "grad_norm": 0.03173828125, + "learning_rate": 0.02743918200079507, + "loss": 0.7978, + "num_input_tokens_seen": 21031760, + "step": 36240 + }, + { + "epoch": 5.398421209413167, + "grad_norm": 0.0234375, + "learning_rate": 0.0274380923612613, + "loss": 0.8114, + "num_input_tokens_seen": 21034800, + "step": 36245 + }, + { + "epoch": 5.3991659219541255, + "grad_norm": 0.025634765625, + "learning_rate": 0.027437002511598602, + "loss": 0.8076, + "num_input_tokens_seen": 21037360, + "step": 36250 + }, + { + "epoch": 5.399910634495085, + "grad_norm": 0.0137939453125, + "learning_rate": 0.027435912451825386, + "loss": 0.8101, + "num_input_tokens_seen": 21040336, + "step": 36255 + }, + { + "epoch": 5.400655347036044, + "grad_norm": 0.017578125, + "learning_rate": 0.02743482218196007, + "loss": 0.7969, + "num_input_tokens_seen": 21043184, + "step": 36260 + }, + { + "epoch": 5.4014000595770035, + "grad_norm": 0.0142822265625, + "learning_rate": 0.02743373170202107, + "loss": 0.7912, + "num_input_tokens_seen": 21046096, + "step": 36265 + }, + { + "epoch": 5.402144772117962, + "grad_norm": 0.0234375, + "learning_rate": 0.027432641012026807, + "loss": 0.831, + "num_input_tokens_seen": 21048784, + "step": 36270 + }, + { + "epoch": 5.402889484658922, + "grad_norm": 0.027099609375, + "learning_rate": 0.027431550111995714, + "loss": 0.8051, + "num_input_tokens_seen": 21051376, + "step": 36275 + }, + { + "epoch": 5.403634197199881, + "grad_norm": 0.012451171875, + "learning_rate": 0.02743045900194622, + "loss": 0.7932, + "num_input_tokens_seen": 21054128, + "step": 36280 + }, + { + "epoch": 5.40437890974084, + "grad_norm": 0.0294189453125, + "learning_rate": 0.027429367681896747, + "loss": 0.8182, + "num_input_tokens_seen": 21056944, + "step": 36285 + }, + { + "epoch": 5.405123622281799, + "grad_norm": 0.014404296875, + "learning_rate": 0.027428276151865744, + "loss": 0.8041, + "num_input_tokens_seen": 21059696, + "step": 36290 + }, + { + "epoch": 5.405868334822759, + "grad_norm": 0.0189208984375, + "learning_rate": 0.027427184411871645, + "loss": 0.7953, + "num_input_tokens_seen": 21062736, + "step": 36295 + }, + { + "epoch": 5.4066130473637175, + "grad_norm": 0.02001953125, + "learning_rate": 0.0274260924619329, + "loss": 0.7992, + "num_input_tokens_seen": 21066160, + "step": 36300 + }, + { + "epoch": 5.407357759904677, + "grad_norm": 0.0225830078125, + "learning_rate": 0.02742500030206795, + "loss": 0.7837, + "num_input_tokens_seen": 21068944, + "step": 36305 + }, + { + "epoch": 5.408102472445636, + "grad_norm": 0.0118408203125, + "learning_rate": 0.027423907932295242, + "loss": 0.823, + "num_input_tokens_seen": 21071856, + "step": 36310 + }, + { + "epoch": 5.408847184986596, + "grad_norm": 0.033447265625, + "learning_rate": 0.027422815352633245, + "loss": 0.782, + "num_input_tokens_seen": 21074992, + "step": 36315 + }, + { + "epoch": 5.409591897527554, + "grad_norm": 0.0181884765625, + "learning_rate": 0.027421722563100402, + "loss": 0.7902, + "num_input_tokens_seen": 21077744, + "step": 36320 + }, + { + "epoch": 5.410336610068514, + "grad_norm": 0.012939453125, + "learning_rate": 0.027420629563715188, + "loss": 0.7943, + "num_input_tokens_seen": 21080272, + "step": 36325 + }, + { + "epoch": 5.411081322609473, + "grad_norm": 0.0191650390625, + "learning_rate": 0.027419536354496055, + "loss": 0.8095, + "num_input_tokens_seen": 21083376, + "step": 36330 + }, + { + "epoch": 5.4118260351504315, + "grad_norm": 0.020751953125, + "learning_rate": 0.027418442935461487, + "loss": 0.7943, + "num_input_tokens_seen": 21086096, + "step": 36335 + }, + { + "epoch": 5.412570747691391, + "grad_norm": 0.01019287109375, + "learning_rate": 0.027417349306629937, + "loss": 0.7785, + "num_input_tokens_seen": 21089072, + "step": 36340 + }, + { + "epoch": 5.413315460232351, + "grad_norm": 0.01190185546875, + "learning_rate": 0.027416255468019894, + "loss": 0.8222, + "num_input_tokens_seen": 21092112, + "step": 36345 + }, + { + "epoch": 5.4140601727733095, + "grad_norm": 0.02978515625, + "learning_rate": 0.027415161419649836, + "loss": 0.7984, + "num_input_tokens_seen": 21094768, + "step": 36350 + }, + { + "epoch": 5.414804885314268, + "grad_norm": 0.010986328125, + "learning_rate": 0.02741406716153824, + "loss": 0.8374, + "num_input_tokens_seen": 21097744, + "step": 36355 + }, + { + "epoch": 5.415549597855228, + "grad_norm": 0.02001953125, + "learning_rate": 0.0274129726937036, + "loss": 0.8241, + "num_input_tokens_seen": 21100848, + "step": 36360 + }, + { + "epoch": 5.416294310396187, + "grad_norm": 0.0181884765625, + "learning_rate": 0.027411878016164404, + "loss": 0.7861, + "num_input_tokens_seen": 21103952, + "step": 36365 + }, + { + "epoch": 5.417039022937146, + "grad_norm": 0.01806640625, + "learning_rate": 0.02741078312893914, + "loss": 0.7923, + "num_input_tokens_seen": 21107088, + "step": 36370 + }, + { + "epoch": 5.417783735478105, + "grad_norm": 0.0137939453125, + "learning_rate": 0.027409688032046315, + "loss": 0.7821, + "num_input_tokens_seen": 21109680, + "step": 36375 + }, + { + "epoch": 5.418528448019065, + "grad_norm": 0.01153564453125, + "learning_rate": 0.027408592725504416, + "loss": 0.7822, + "num_input_tokens_seen": 21112432, + "step": 36380 + }, + { + "epoch": 5.4192731605600235, + "grad_norm": 0.01220703125, + "learning_rate": 0.027407497209331957, + "loss": 0.8202, + "num_input_tokens_seen": 21115408, + "step": 36385 + }, + { + "epoch": 5.420017873100983, + "grad_norm": 0.02294921875, + "learning_rate": 0.027406401483547446, + "loss": 0.8363, + "num_input_tokens_seen": 21118416, + "step": 36390 + }, + { + "epoch": 5.420762585641942, + "grad_norm": 0.0174560546875, + "learning_rate": 0.027405305548169383, + "loss": 0.7899, + "num_input_tokens_seen": 21121328, + "step": 36395 + }, + { + "epoch": 5.421507298182902, + "grad_norm": 0.01904296875, + "learning_rate": 0.0274042094032163, + "loss": 0.7963, + "num_input_tokens_seen": 21124144, + "step": 36400 + }, + { + "epoch": 5.42225201072386, + "grad_norm": 0.0113525390625, + "learning_rate": 0.0274031130487067, + "loss": 0.791, + "num_input_tokens_seen": 21126928, + "step": 36405 + }, + { + "epoch": 5.42299672326482, + "grad_norm": 0.0115966796875, + "learning_rate": 0.027402016484659116, + "loss": 0.8134, + "num_input_tokens_seen": 21129360, + "step": 36410 + }, + { + "epoch": 5.423741435805779, + "grad_norm": 0.0164794921875, + "learning_rate": 0.027400919711092066, + "loss": 0.7955, + "num_input_tokens_seen": 21132144, + "step": 36415 + }, + { + "epoch": 5.424486148346738, + "grad_norm": 0.017822265625, + "learning_rate": 0.02739982272802408, + "loss": 0.7894, + "num_input_tokens_seen": 21135088, + "step": 36420 + }, + { + "epoch": 5.425230860887697, + "grad_norm": 0.024658203125, + "learning_rate": 0.027398725535473692, + "loss": 0.8251, + "num_input_tokens_seen": 21137840, + "step": 36425 + }, + { + "epoch": 5.425975573428657, + "grad_norm": 0.018798828125, + "learning_rate": 0.027397628133459438, + "loss": 0.7925, + "num_input_tokens_seen": 21140528, + "step": 36430 + }, + { + "epoch": 5.4267202859696155, + "grad_norm": 0.0269775390625, + "learning_rate": 0.02739653052199986, + "loss": 0.8001, + "num_input_tokens_seen": 21143248, + "step": 36435 + }, + { + "epoch": 5.427464998510575, + "grad_norm": 0.0286865234375, + "learning_rate": 0.027395432701113494, + "loss": 0.7994, + "num_input_tokens_seen": 21146320, + "step": 36440 + }, + { + "epoch": 5.428209711051534, + "grad_norm": 0.0133056640625, + "learning_rate": 0.027394334670818893, + "loss": 0.8048, + "num_input_tokens_seen": 21148880, + "step": 36445 + }, + { + "epoch": 5.428954423592494, + "grad_norm": 0.01422119140625, + "learning_rate": 0.027393236431134605, + "loss": 0.81, + "num_input_tokens_seen": 21151792, + "step": 36450 + }, + { + "epoch": 5.429699136133452, + "grad_norm": 0.0185546875, + "learning_rate": 0.027392137982079182, + "loss": 0.8029, + "num_input_tokens_seen": 21154672, + "step": 36455 + }, + { + "epoch": 5.430443848674412, + "grad_norm": 0.0213623046875, + "learning_rate": 0.027391039323671183, + "loss": 0.8208, + "num_input_tokens_seen": 21157552, + "step": 36460 + }, + { + "epoch": 5.431188561215371, + "grad_norm": 0.01708984375, + "learning_rate": 0.02738994045592917, + "loss": 0.8067, + "num_input_tokens_seen": 21160400, + "step": 36465 + }, + { + "epoch": 5.43193327375633, + "grad_norm": 0.0262451171875, + "learning_rate": 0.027388841378871702, + "loss": 0.8056, + "num_input_tokens_seen": 21163312, + "step": 36470 + }, + { + "epoch": 5.432677986297289, + "grad_norm": 0.0145263671875, + "learning_rate": 0.027387742092517355, + "loss": 0.8021, + "num_input_tokens_seen": 21166064, + "step": 36475 + }, + { + "epoch": 5.433422698838249, + "grad_norm": 0.023681640625, + "learning_rate": 0.02738664259688469, + "loss": 0.7906, + "num_input_tokens_seen": 21168784, + "step": 36480 + }, + { + "epoch": 5.434167411379208, + "grad_norm": 0.0252685546875, + "learning_rate": 0.02738554289199229, + "loss": 0.803, + "num_input_tokens_seen": 21171824, + "step": 36485 + }, + { + "epoch": 5.434912123920167, + "grad_norm": 0.0289306640625, + "learning_rate": 0.027384442977858733, + "loss": 0.8191, + "num_input_tokens_seen": 21175440, + "step": 36490 + }, + { + "epoch": 5.435656836461126, + "grad_norm": 0.018798828125, + "learning_rate": 0.027383342854502595, + "loss": 0.805, + "num_input_tokens_seen": 21178320, + "step": 36495 + }, + { + "epoch": 5.436401549002086, + "grad_norm": 0.032958984375, + "learning_rate": 0.02738224252194247, + "loss": 0.8045, + "num_input_tokens_seen": 21181136, + "step": 36500 + }, + { + "epoch": 5.437146261543044, + "grad_norm": 0.0184326171875, + "learning_rate": 0.02738114198019694, + "loss": 0.8038, + "num_input_tokens_seen": 21183920, + "step": 36505 + }, + { + "epoch": 5.437890974084004, + "grad_norm": 0.018310546875, + "learning_rate": 0.027380041229284603, + "loss": 0.8004, + "num_input_tokens_seen": 21186704, + "step": 36510 + }, + { + "epoch": 5.438635686624963, + "grad_norm": 0.0213623046875, + "learning_rate": 0.027378940269224047, + "loss": 0.7922, + "num_input_tokens_seen": 21189552, + "step": 36515 + }, + { + "epoch": 5.4393803991659215, + "grad_norm": 0.04443359375, + "learning_rate": 0.027377839100033878, + "loss": 0.797, + "num_input_tokens_seen": 21192336, + "step": 36520 + }, + { + "epoch": 5.440125111706881, + "grad_norm": 0.0198974609375, + "learning_rate": 0.027376737721732696, + "loss": 0.7996, + "num_input_tokens_seen": 21195376, + "step": 36525 + }, + { + "epoch": 5.44086982424784, + "grad_norm": 0.01904296875, + "learning_rate": 0.027375636134339115, + "loss": 0.7731, + "num_input_tokens_seen": 21198192, + "step": 36530 + }, + { + "epoch": 5.4416145367888, + "grad_norm": 0.027099609375, + "learning_rate": 0.027374534337871733, + "loss": 0.8034, + "num_input_tokens_seen": 21201168, + "step": 36535 + }, + { + "epoch": 5.442359249329758, + "grad_norm": 0.017822265625, + "learning_rate": 0.027373432332349176, + "loss": 0.7873, + "num_input_tokens_seen": 21203984, + "step": 36540 + }, + { + "epoch": 5.443103961870718, + "grad_norm": 0.022705078125, + "learning_rate": 0.027372330117790052, + "loss": 0.7972, + "num_input_tokens_seen": 21206608, + "step": 36545 + }, + { + "epoch": 5.443848674411677, + "grad_norm": 0.0272216796875, + "learning_rate": 0.027371227694212987, + "loss": 0.795, + "num_input_tokens_seen": 21209680, + "step": 36550 + }, + { + "epoch": 5.444593386952636, + "grad_norm": 0.0184326171875, + "learning_rate": 0.0273701250616366, + "loss": 0.8087, + "num_input_tokens_seen": 21212752, + "step": 36555 + }, + { + "epoch": 5.445338099493595, + "grad_norm": 0.0208740234375, + "learning_rate": 0.027369022220079524, + "loss": 0.8481, + "num_input_tokens_seen": 21215536, + "step": 36560 + }, + { + "epoch": 5.446082812034555, + "grad_norm": 0.024169921875, + "learning_rate": 0.027367919169560394, + "loss": 0.8001, + "num_input_tokens_seen": 21218224, + "step": 36565 + }, + { + "epoch": 5.446827524575514, + "grad_norm": 0.0201416015625, + "learning_rate": 0.027366815910097833, + "loss": 0.8045, + "num_input_tokens_seen": 21221168, + "step": 36570 + }, + { + "epoch": 5.447572237116473, + "grad_norm": 0.017578125, + "learning_rate": 0.027365712441710488, + "loss": 0.7862, + "num_input_tokens_seen": 21223984, + "step": 36575 + }, + { + "epoch": 5.448316949657432, + "grad_norm": 0.0201416015625, + "learning_rate": 0.027364608764417004, + "loss": 0.8001, + "num_input_tokens_seen": 21227088, + "step": 36580 + }, + { + "epoch": 5.449061662198392, + "grad_norm": 0.039306640625, + "learning_rate": 0.027363504878236015, + "loss": 0.7711, + "num_input_tokens_seen": 21230032, + "step": 36585 + }, + { + "epoch": 5.44980637473935, + "grad_norm": 0.0234375, + "learning_rate": 0.02736240078318618, + "loss": 0.8055, + "num_input_tokens_seen": 21233072, + "step": 36590 + }, + { + "epoch": 5.45055108728031, + "grad_norm": 0.0252685546875, + "learning_rate": 0.027361296479286153, + "loss": 0.8268, + "num_input_tokens_seen": 21236080, + "step": 36595 + }, + { + "epoch": 5.451295799821269, + "grad_norm": 0.019775390625, + "learning_rate": 0.02736019196655458, + "loss": 0.7913, + "num_input_tokens_seen": 21239056, + "step": 36600 + }, + { + "epoch": 5.452040512362228, + "grad_norm": 0.01397705078125, + "learning_rate": 0.027359087245010127, + "loss": 0.8051, + "num_input_tokens_seen": 21241872, + "step": 36605 + }, + { + "epoch": 5.452785224903187, + "grad_norm": 0.0220947265625, + "learning_rate": 0.027357982314671455, + "loss": 0.812, + "num_input_tokens_seen": 21244752, + "step": 36610 + }, + { + "epoch": 5.453529937444147, + "grad_norm": 0.0267333984375, + "learning_rate": 0.027356877175557234, + "loss": 0.8132, + "num_input_tokens_seen": 21247408, + "step": 36615 + }, + { + "epoch": 5.454274649985106, + "grad_norm": 0.026611328125, + "learning_rate": 0.02735577182768613, + "loss": 0.8109, + "num_input_tokens_seen": 21250032, + "step": 36620 + }, + { + "epoch": 5.455019362526065, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02735466627107682, + "loss": 0.8009, + "num_input_tokens_seen": 21253136, + "step": 36625 + }, + { + "epoch": 5.455764075067024, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02735356050574798, + "loss": 0.8025, + "num_input_tokens_seen": 21256016, + "step": 36630 + }, + { + "epoch": 5.456508787607984, + "grad_norm": 0.0198974609375, + "learning_rate": 0.027352454531718293, + "loss": 0.8116, + "num_input_tokens_seen": 21259248, + "step": 36635 + }, + { + "epoch": 5.457253500148942, + "grad_norm": 0.0185546875, + "learning_rate": 0.02735134834900644, + "loss": 0.7941, + "num_input_tokens_seen": 21262192, + "step": 36640 + }, + { + "epoch": 5.457998212689902, + "grad_norm": 0.0206298828125, + "learning_rate": 0.027350241957631104, + "loss": 0.8149, + "num_input_tokens_seen": 21264912, + "step": 36645 + }, + { + "epoch": 5.458742925230861, + "grad_norm": 0.01287841796875, + "learning_rate": 0.027349135357610986, + "loss": 0.7904, + "num_input_tokens_seen": 21267824, + "step": 36650 + }, + { + "epoch": 5.4594876377718204, + "grad_norm": 0.019775390625, + "learning_rate": 0.027348028548964783, + "loss": 0.8027, + "num_input_tokens_seen": 21270672, + "step": 36655 + }, + { + "epoch": 5.460232350312779, + "grad_norm": 0.0205078125, + "learning_rate": 0.027346921531711182, + "loss": 0.7849, + "num_input_tokens_seen": 21273488, + "step": 36660 + }, + { + "epoch": 5.460977062853739, + "grad_norm": 0.0205078125, + "learning_rate": 0.027345814305868892, + "loss": 0.8132, + "num_input_tokens_seen": 21276688, + "step": 36665 + }, + { + "epoch": 5.461721775394698, + "grad_norm": 0.02490234375, + "learning_rate": 0.027344706871456615, + "loss": 0.8118, + "num_input_tokens_seen": 21279696, + "step": 36670 + }, + { + "epoch": 5.462466487935657, + "grad_norm": 0.01300048828125, + "learning_rate": 0.027343599228493064, + "loss": 0.8083, + "num_input_tokens_seen": 21282704, + "step": 36675 + }, + { + "epoch": 5.463211200476616, + "grad_norm": 0.025146484375, + "learning_rate": 0.02734249137699695, + "loss": 0.8003, + "num_input_tokens_seen": 21285648, + "step": 36680 + }, + { + "epoch": 5.463955913017575, + "grad_norm": 0.018798828125, + "learning_rate": 0.027341383316986986, + "loss": 0.8311, + "num_input_tokens_seen": 21288880, + "step": 36685 + }, + { + "epoch": 5.464700625558534, + "grad_norm": 0.0255126953125, + "learning_rate": 0.027340275048481896, + "loss": 0.819, + "num_input_tokens_seen": 21291888, + "step": 36690 + }, + { + "epoch": 5.465445338099494, + "grad_norm": 0.0203857421875, + "learning_rate": 0.027339166571500404, + "loss": 0.8137, + "num_input_tokens_seen": 21294832, + "step": 36695 + }, + { + "epoch": 5.466190050640453, + "grad_norm": 0.02490234375, + "learning_rate": 0.02733805788606123, + "loss": 0.7976, + "num_input_tokens_seen": 21297392, + "step": 36700 + }, + { + "epoch": 5.466934763181412, + "grad_norm": 0.029296875, + "learning_rate": 0.02733694899218311, + "loss": 0.7955, + "num_input_tokens_seen": 21300496, + "step": 36705 + }, + { + "epoch": 5.467679475722371, + "grad_norm": 0.01312255859375, + "learning_rate": 0.027335839889884774, + "loss": 0.7977, + "num_input_tokens_seen": 21303376, + "step": 36710 + }, + { + "epoch": 5.46842418826333, + "grad_norm": 0.024169921875, + "learning_rate": 0.027334730579184962, + "loss": 0.8184, + "num_input_tokens_seen": 21306288, + "step": 36715 + }, + { + "epoch": 5.46916890080429, + "grad_norm": 0.030517578125, + "learning_rate": 0.027333621060102408, + "loss": 0.8087, + "num_input_tokens_seen": 21309232, + "step": 36720 + }, + { + "epoch": 5.469913613345248, + "grad_norm": 0.01434326171875, + "learning_rate": 0.02733251133265587, + "loss": 0.8014, + "num_input_tokens_seen": 21312176, + "step": 36725 + }, + { + "epoch": 5.470658325886208, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02733140139686408, + "loss": 0.7915, + "num_input_tokens_seen": 21315184, + "step": 36730 + }, + { + "epoch": 5.471403038427167, + "grad_norm": 0.041015625, + "learning_rate": 0.0273302912527458, + "loss": 0.7885, + "num_input_tokens_seen": 21318352, + "step": 36735 + }, + { + "epoch": 5.4721477509681264, + "grad_norm": 0.01214599609375, + "learning_rate": 0.027329180900319782, + "loss": 0.8084, + "num_input_tokens_seen": 21321584, + "step": 36740 + }, + { + "epoch": 5.472892463509085, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02732807033960478, + "loss": 0.7889, + "num_input_tokens_seen": 21324400, + "step": 36745 + }, + { + "epoch": 5.473637176050045, + "grad_norm": 0.032958984375, + "learning_rate": 0.027326959570619563, + "loss": 0.8181, + "num_input_tokens_seen": 21327312, + "step": 36750 + }, + { + "epoch": 5.474381888591004, + "grad_norm": 0.0234375, + "learning_rate": 0.02732584859338289, + "loss": 0.7881, + "num_input_tokens_seen": 21330160, + "step": 36755 + }, + { + "epoch": 5.475126601131963, + "grad_norm": 0.019287109375, + "learning_rate": 0.02732473740791353, + "loss": 0.7917, + "num_input_tokens_seen": 21332944, + "step": 36760 + }, + { + "epoch": 5.475871313672922, + "grad_norm": 0.01153564453125, + "learning_rate": 0.027323626014230263, + "loss": 0.7898, + "num_input_tokens_seen": 21335920, + "step": 36765 + }, + { + "epoch": 5.476616026213882, + "grad_norm": 0.02099609375, + "learning_rate": 0.02732251441235186, + "loss": 0.8043, + "num_input_tokens_seen": 21338864, + "step": 36770 + }, + { + "epoch": 5.47736073875484, + "grad_norm": 0.02001953125, + "learning_rate": 0.0273214026022971, + "loss": 0.7903, + "num_input_tokens_seen": 21341872, + "step": 36775 + }, + { + "epoch": 5.4781054512958, + "grad_norm": 0.019775390625, + "learning_rate": 0.027320290584084764, + "loss": 0.7893, + "num_input_tokens_seen": 21344432, + "step": 36780 + }, + { + "epoch": 5.478850163836759, + "grad_norm": 0.0194091796875, + "learning_rate": 0.027319178357733637, + "loss": 0.7816, + "num_input_tokens_seen": 21347248, + "step": 36785 + }, + { + "epoch": 5.4795948763777185, + "grad_norm": 0.0257568359375, + "learning_rate": 0.027318065923262518, + "loss": 0.8245, + "num_input_tokens_seen": 21349968, + "step": 36790 + }, + { + "epoch": 5.480339588918677, + "grad_norm": 0.0247802734375, + "learning_rate": 0.027316953280690195, + "loss": 0.7865, + "num_input_tokens_seen": 21352784, + "step": 36795 + }, + { + "epoch": 5.481084301459637, + "grad_norm": 0.016845703125, + "learning_rate": 0.02731584043003546, + "loss": 0.786, + "num_input_tokens_seen": 21355632, + "step": 36800 + }, + { + "epoch": 5.481829014000596, + "grad_norm": 0.01519775390625, + "learning_rate": 0.02731472737131712, + "loss": 0.8014, + "num_input_tokens_seen": 21358288, + "step": 36805 + }, + { + "epoch": 5.482573726541555, + "grad_norm": 0.0194091796875, + "learning_rate": 0.027313614104553977, + "loss": 0.7978, + "num_input_tokens_seen": 21361232, + "step": 36810 + }, + { + "epoch": 5.483318439082514, + "grad_norm": 0.022216796875, + "learning_rate": 0.027312500629764837, + "loss": 0.7901, + "num_input_tokens_seen": 21364080, + "step": 36815 + }, + { + "epoch": 5.484063151623474, + "grad_norm": 0.01177978515625, + "learning_rate": 0.027311386946968514, + "loss": 0.8136, + "num_input_tokens_seen": 21366928, + "step": 36820 + }, + { + "epoch": 5.4848078641644324, + "grad_norm": 0.0177001953125, + "learning_rate": 0.027310273056183824, + "loss": 0.7917, + "num_input_tokens_seen": 21369840, + "step": 36825 + }, + { + "epoch": 5.485552576705392, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02730915895742958, + "loss": 0.8118, + "num_input_tokens_seen": 21372624, + "step": 36830 + }, + { + "epoch": 5.486297289246351, + "grad_norm": 0.0250244140625, + "learning_rate": 0.027308044650724606, + "loss": 0.7967, + "num_input_tokens_seen": 21375536, + "step": 36835 + }, + { + "epoch": 5.4870420017873105, + "grad_norm": 0.01953125, + "learning_rate": 0.027306930136087728, + "loss": 0.7961, + "num_input_tokens_seen": 21378544, + "step": 36840 + }, + { + "epoch": 5.487786714328269, + "grad_norm": 0.034423828125, + "learning_rate": 0.027305815413537774, + "loss": 0.8264, + "num_input_tokens_seen": 21381200, + "step": 36845 + }, + { + "epoch": 5.488531426869228, + "grad_norm": 0.0205078125, + "learning_rate": 0.027304700483093573, + "loss": 0.8063, + "num_input_tokens_seen": 21384656, + "step": 36850 + }, + { + "epoch": 5.489276139410188, + "grad_norm": 0.03076171875, + "learning_rate": 0.027303585344773968, + "loss": 0.7639, + "num_input_tokens_seen": 21387440, + "step": 36855 + }, + { + "epoch": 5.490020851951147, + "grad_norm": 0.017333984375, + "learning_rate": 0.02730246999859779, + "loss": 0.7706, + "num_input_tokens_seen": 21390128, + "step": 36860 + }, + { + "epoch": 5.490765564492106, + "grad_norm": 0.0255126953125, + "learning_rate": 0.027301354444583883, + "loss": 0.8071, + "num_input_tokens_seen": 21392912, + "step": 36865 + }, + { + "epoch": 5.491510277033065, + "grad_norm": 0.0274658203125, + "learning_rate": 0.027300238682751096, + "loss": 0.8288, + "num_input_tokens_seen": 21396048, + "step": 36870 + }, + { + "epoch": 5.4922549895740245, + "grad_norm": 0.0206298828125, + "learning_rate": 0.027299122713118285, + "loss": 0.7908, + "num_input_tokens_seen": 21398832, + "step": 36875 + }, + { + "epoch": 5.492999702114983, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02729800653570429, + "loss": 0.8128, + "num_input_tokens_seen": 21401616, + "step": 36880 + }, + { + "epoch": 5.493744414655943, + "grad_norm": 0.0205078125, + "learning_rate": 0.027296890150527973, + "loss": 0.817, + "num_input_tokens_seen": 21404464, + "step": 36885 + }, + { + "epoch": 5.494489127196902, + "grad_norm": 0.0255126953125, + "learning_rate": 0.027295773557608195, + "loss": 0.7882, + "num_input_tokens_seen": 21407536, + "step": 36890 + }, + { + "epoch": 5.495233839737861, + "grad_norm": 0.0267333984375, + "learning_rate": 0.02729465675696382, + "loss": 0.8156, + "num_input_tokens_seen": 21410448, + "step": 36895 + }, + { + "epoch": 5.49597855227882, + "grad_norm": 0.025390625, + "learning_rate": 0.027293539748613715, + "loss": 0.8001, + "num_input_tokens_seen": 21413200, + "step": 36900 + }, + { + "epoch": 5.49672326481978, + "grad_norm": 0.0257568359375, + "learning_rate": 0.02729242253257675, + "loss": 0.8057, + "num_input_tokens_seen": 21415952, + "step": 36905 + }, + { + "epoch": 5.4974679773607384, + "grad_norm": 0.0260009765625, + "learning_rate": 0.027291305108871802, + "loss": 0.7785, + "num_input_tokens_seen": 21419312, + "step": 36910 + }, + { + "epoch": 5.498212689901698, + "grad_norm": 0.02880859375, + "learning_rate": 0.027290187477517745, + "loss": 0.7891, + "num_input_tokens_seen": 21422320, + "step": 36915 + }, + { + "epoch": 5.498957402442657, + "grad_norm": 0.01263427734375, + "learning_rate": 0.027289069638533465, + "loss": 0.8058, + "num_input_tokens_seen": 21425392, + "step": 36920 + }, + { + "epoch": 5.4997021149836165, + "grad_norm": 0.0181884765625, + "learning_rate": 0.02728795159193784, + "loss": 0.8052, + "num_input_tokens_seen": 21428176, + "step": 36925 + }, + { + "epoch": 5.500446827524575, + "grad_norm": 0.02490234375, + "learning_rate": 0.02728683333774976, + "loss": 0.7826, + "num_input_tokens_seen": 21431344, + "step": 36930 + }, + { + "epoch": 5.501191540065535, + "grad_norm": 0.019775390625, + "learning_rate": 0.027285714875988118, + "loss": 0.7795, + "num_input_tokens_seen": 21434288, + "step": 36935 + }, + { + "epoch": 5.501936252606494, + "grad_norm": 0.0185546875, + "learning_rate": 0.027284596206671816, + "loss": 0.7827, + "num_input_tokens_seen": 21437104, + "step": 36940 + }, + { + "epoch": 5.502680965147453, + "grad_norm": 0.0311279296875, + "learning_rate": 0.02728347732981974, + "loss": 0.8112, + "num_input_tokens_seen": 21440272, + "step": 36945 + }, + { + "epoch": 5.503425677688412, + "grad_norm": 0.016845703125, + "learning_rate": 0.0272823582454508, + "loss": 0.8063, + "num_input_tokens_seen": 21442896, + "step": 36950 + }, + { + "epoch": 5.504170390229372, + "grad_norm": 0.02783203125, + "learning_rate": 0.027281238953583906, + "loss": 0.8011, + "num_input_tokens_seen": 21445584, + "step": 36955 + }, + { + "epoch": 5.5049151027703305, + "grad_norm": 0.0172119140625, + "learning_rate": 0.02728011945423796, + "loss": 0.8104, + "num_input_tokens_seen": 21448400, + "step": 36960 + }, + { + "epoch": 5.50565981531129, + "grad_norm": 0.01141357421875, + "learning_rate": 0.027278999747431875, + "loss": 0.7772, + "num_input_tokens_seen": 21451088, + "step": 36965 + }, + { + "epoch": 5.506404527852249, + "grad_norm": 0.012939453125, + "learning_rate": 0.027277879833184568, + "loss": 0.8105, + "num_input_tokens_seen": 21454160, + "step": 36970 + }, + { + "epoch": 5.5071492403932085, + "grad_norm": 0.0155029296875, + "learning_rate": 0.027276759711514963, + "loss": 0.822, + "num_input_tokens_seen": 21456752, + "step": 36975 + }, + { + "epoch": 5.507893952934167, + "grad_norm": 0.035400390625, + "learning_rate": 0.027275639382441978, + "loss": 0.8018, + "num_input_tokens_seen": 21459920, + "step": 36980 + }, + { + "epoch": 5.508638665475127, + "grad_norm": 0.0260009765625, + "learning_rate": 0.027274518845984544, + "loss": 0.8111, + "num_input_tokens_seen": 21462864, + "step": 36985 + }, + { + "epoch": 5.509383378016086, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02727339810216159, + "loss": 0.7922, + "num_input_tokens_seen": 21465616, + "step": 36990 + }, + { + "epoch": 5.510128090557045, + "grad_norm": 0.0152587890625, + "learning_rate": 0.027272277150992046, + "loss": 0.8005, + "num_input_tokens_seen": 21468784, + "step": 36995 + }, + { + "epoch": 5.510872803098004, + "grad_norm": 0.0198974609375, + "learning_rate": 0.027271155992494855, + "loss": 0.7755, + "num_input_tokens_seen": 21471600, + "step": 37000 + }, + { + "epoch": 5.511617515638964, + "grad_norm": 0.020751953125, + "learning_rate": 0.027270034626688953, + "loss": 0.7848, + "num_input_tokens_seen": 21474192, + "step": 37005 + }, + { + "epoch": 5.5123622281799225, + "grad_norm": 0.032958984375, + "learning_rate": 0.027268913053593285, + "loss": 0.8138, + "num_input_tokens_seen": 21476752, + "step": 37010 + }, + { + "epoch": 5.513106940720881, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02726779127322681, + "loss": 0.8282, + "num_input_tokens_seen": 21479856, + "step": 37015 + }, + { + "epoch": 5.513851653261841, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02726666928560846, + "loss": 0.7815, + "num_input_tokens_seen": 21482640, + "step": 37020 + }, + { + "epoch": 5.5145963658028005, + "grad_norm": 0.0186767578125, + "learning_rate": 0.027265547090757205, + "loss": 0.7897, + "num_input_tokens_seen": 21485424, + "step": 37025 + }, + { + "epoch": 5.515341078343759, + "grad_norm": 0.0233154296875, + "learning_rate": 0.027264424688691995, + "loss": 0.8037, + "num_input_tokens_seen": 21488368, + "step": 37030 + }, + { + "epoch": 5.516085790884718, + "grad_norm": 0.026611328125, + "learning_rate": 0.027263302079431798, + "loss": 0.8143, + "num_input_tokens_seen": 21491088, + "step": 37035 + }, + { + "epoch": 5.516830503425678, + "grad_norm": 0.03955078125, + "learning_rate": 0.027262179262995567, + "loss": 0.834, + "num_input_tokens_seen": 21494288, + "step": 37040 + }, + { + "epoch": 5.517575215966637, + "grad_norm": 0.022216796875, + "learning_rate": 0.02726105623940229, + "loss": 0.8133, + "num_input_tokens_seen": 21497328, + "step": 37045 + }, + { + "epoch": 5.518319928507596, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02725993300867093, + "loss": 0.8135, + "num_input_tokens_seen": 21500240, + "step": 37050 + }, + { + "epoch": 5.519064641048555, + "grad_norm": 0.0135498046875, + "learning_rate": 0.027258809570820455, + "loss": 0.8047, + "num_input_tokens_seen": 21503632, + "step": 37055 + }, + { + "epoch": 5.5198093535895145, + "grad_norm": 0.0283203125, + "learning_rate": 0.027257685925869857, + "loss": 0.8111, + "num_input_tokens_seen": 21506640, + "step": 37060 + }, + { + "epoch": 5.520554066130473, + "grad_norm": 0.020751953125, + "learning_rate": 0.027256562073838114, + "loss": 0.7845, + "num_input_tokens_seen": 21509296, + "step": 37065 + }, + { + "epoch": 5.521298778671433, + "grad_norm": 0.0224609375, + "learning_rate": 0.027255438014744206, + "loss": 0.8005, + "num_input_tokens_seen": 21512080, + "step": 37070 + }, + { + "epoch": 5.522043491212392, + "grad_norm": 0.0220947265625, + "learning_rate": 0.027254313748607133, + "loss": 0.8237, + "num_input_tokens_seen": 21515376, + "step": 37075 + }, + { + "epoch": 5.522788203753351, + "grad_norm": 0.0179443359375, + "learning_rate": 0.027253189275445885, + "loss": 0.7949, + "num_input_tokens_seen": 21518096, + "step": 37080 + }, + { + "epoch": 5.52353291629431, + "grad_norm": 0.031005859375, + "learning_rate": 0.027252064595279454, + "loss": 0.8099, + "num_input_tokens_seen": 21521232, + "step": 37085 + }, + { + "epoch": 5.52427762883527, + "grad_norm": 0.0201416015625, + "learning_rate": 0.027250939708126847, + "loss": 0.7772, + "num_input_tokens_seen": 21524080, + "step": 37090 + }, + { + "epoch": 5.5250223413762285, + "grad_norm": 0.017578125, + "learning_rate": 0.027249814614007065, + "loss": 0.7948, + "num_input_tokens_seen": 21527248, + "step": 37095 + }, + { + "epoch": 5.525767053917188, + "grad_norm": 0.0216064453125, + "learning_rate": 0.027248689312939115, + "loss": 0.7811, + "num_input_tokens_seen": 21530128, + "step": 37100 + }, + { + "epoch": 5.526511766458147, + "grad_norm": 0.0252685546875, + "learning_rate": 0.027247563804942004, + "loss": 0.8042, + "num_input_tokens_seen": 21532720, + "step": 37105 + }, + { + "epoch": 5.5272564789991065, + "grad_norm": 0.0205078125, + "learning_rate": 0.02724643809003476, + "loss": 0.7757, + "num_input_tokens_seen": 21535984, + "step": 37110 + }, + { + "epoch": 5.528001191540065, + "grad_norm": 0.0296630859375, + "learning_rate": 0.02724531216823638, + "loss": 0.7839, + "num_input_tokens_seen": 21538672, + "step": 37115 + }, + { + "epoch": 5.528745904081025, + "grad_norm": 0.0164794921875, + "learning_rate": 0.027244186039565903, + "loss": 0.7914, + "num_input_tokens_seen": 21541328, + "step": 37120 + }, + { + "epoch": 5.529490616621984, + "grad_norm": 0.02880859375, + "learning_rate": 0.027243059704042345, + "loss": 0.7986, + "num_input_tokens_seen": 21544176, + "step": 37125 + }, + { + "epoch": 5.530235329162943, + "grad_norm": 0.0235595703125, + "learning_rate": 0.027241933161684738, + "loss": 0.8328, + "num_input_tokens_seen": 21546928, + "step": 37130 + }, + { + "epoch": 5.530980041703902, + "grad_norm": 0.01953125, + "learning_rate": 0.02724080641251211, + "loss": 0.8132, + "num_input_tokens_seen": 21549904, + "step": 37135 + }, + { + "epoch": 5.531724754244862, + "grad_norm": 0.0184326171875, + "learning_rate": 0.027239679456543504, + "loss": 0.8083, + "num_input_tokens_seen": 21553136, + "step": 37140 + }, + { + "epoch": 5.5324694667858205, + "grad_norm": 0.0224609375, + "learning_rate": 0.027238552293797947, + "loss": 0.8204, + "num_input_tokens_seen": 21556144, + "step": 37145 + }, + { + "epoch": 5.53321417932678, + "grad_norm": 0.022216796875, + "learning_rate": 0.02723742492429449, + "loss": 0.7684, + "num_input_tokens_seen": 21559024, + "step": 37150 + }, + { + "epoch": 5.533958891867739, + "grad_norm": 0.0286865234375, + "learning_rate": 0.027236297348052177, + "loss": 0.7954, + "num_input_tokens_seen": 21562224, + "step": 37155 + }, + { + "epoch": 5.534703604408699, + "grad_norm": 0.01483154296875, + "learning_rate": 0.027235169565090058, + "loss": 0.8345, + "num_input_tokens_seen": 21565264, + "step": 37160 + }, + { + "epoch": 5.535448316949657, + "grad_norm": 0.02294921875, + "learning_rate": 0.027234041575427186, + "loss": 0.808, + "num_input_tokens_seen": 21568400, + "step": 37165 + }, + { + "epoch": 5.536193029490617, + "grad_norm": 0.014404296875, + "learning_rate": 0.027232913379082613, + "loss": 0.8198, + "num_input_tokens_seen": 21571216, + "step": 37170 + }, + { + "epoch": 5.536937742031576, + "grad_norm": 0.0234375, + "learning_rate": 0.0272317849760754, + "loss": 0.8017, + "num_input_tokens_seen": 21574192, + "step": 37175 + }, + { + "epoch": 5.537682454572535, + "grad_norm": 0.01806640625, + "learning_rate": 0.02723065636642461, + "loss": 0.8163, + "num_input_tokens_seen": 21577040, + "step": 37180 + }, + { + "epoch": 5.538427167113494, + "grad_norm": 0.0220947265625, + "learning_rate": 0.027229527550149315, + "loss": 0.8103, + "num_input_tokens_seen": 21579920, + "step": 37185 + }, + { + "epoch": 5.539171879654454, + "grad_norm": 0.01953125, + "learning_rate": 0.027228398527268584, + "loss": 0.8029, + "num_input_tokens_seen": 21582608, + "step": 37190 + }, + { + "epoch": 5.5399165921954125, + "grad_norm": 0.01904296875, + "learning_rate": 0.027227269297801483, + "loss": 0.8073, + "num_input_tokens_seen": 21585520, + "step": 37195 + }, + { + "epoch": 5.540661304736371, + "grad_norm": 0.01904296875, + "learning_rate": 0.0272261398617671, + "loss": 0.7954, + "num_input_tokens_seen": 21588496, + "step": 37200 + }, + { + "epoch": 5.541406017277331, + "grad_norm": 0.0220947265625, + "learning_rate": 0.027225010219184505, + "loss": 0.8123, + "num_input_tokens_seen": 21591536, + "step": 37205 + }, + { + "epoch": 5.542150729818291, + "grad_norm": 0.02197265625, + "learning_rate": 0.027223880370072787, + "loss": 0.7905, + "num_input_tokens_seen": 21594192, + "step": 37210 + }, + { + "epoch": 5.542895442359249, + "grad_norm": 0.0194091796875, + "learning_rate": 0.027222750314451035, + "loss": 0.8044, + "num_input_tokens_seen": 21597136, + "step": 37215 + }, + { + "epoch": 5.543640154900208, + "grad_norm": 0.01409912109375, + "learning_rate": 0.027221620052338344, + "loss": 0.8105, + "num_input_tokens_seen": 21599984, + "step": 37220 + }, + { + "epoch": 5.544384867441168, + "grad_norm": 0.0242919921875, + "learning_rate": 0.027220489583753796, + "loss": 0.7953, + "num_input_tokens_seen": 21602736, + "step": 37225 + }, + { + "epoch": 5.5451295799821265, + "grad_norm": 0.021728515625, + "learning_rate": 0.027219358908716504, + "loss": 0.802, + "num_input_tokens_seen": 21605616, + "step": 37230 + }, + { + "epoch": 5.545874292523086, + "grad_norm": 0.018310546875, + "learning_rate": 0.027218228027245557, + "loss": 0.8201, + "num_input_tokens_seen": 21608912, + "step": 37235 + }, + { + "epoch": 5.546619005064045, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02721709693936007, + "loss": 0.7976, + "num_input_tokens_seen": 21611920, + "step": 37240 + }, + { + "epoch": 5.547363717605005, + "grad_norm": 0.0189208984375, + "learning_rate": 0.027215965645079145, + "loss": 0.7928, + "num_input_tokens_seen": 21614704, + "step": 37245 + }, + { + "epoch": 5.548108430145963, + "grad_norm": 0.0198974609375, + "learning_rate": 0.027214834144421895, + "loss": 0.8271, + "num_input_tokens_seen": 21617648, + "step": 37250 + }, + { + "epoch": 5.548853142686923, + "grad_norm": 0.0216064453125, + "learning_rate": 0.027213702437407433, + "loss": 0.7873, + "num_input_tokens_seen": 21620432, + "step": 37255 + }, + { + "epoch": 5.549597855227882, + "grad_norm": 0.0250244140625, + "learning_rate": 0.02721257052405489, + "loss": 0.8105, + "num_input_tokens_seen": 21623216, + "step": 37260 + }, + { + "epoch": 5.550342567768841, + "grad_norm": 0.0240478515625, + "learning_rate": 0.027211438404383375, + "loss": 0.7938, + "num_input_tokens_seen": 21626064, + "step": 37265 + }, + { + "epoch": 5.5510872803098, + "grad_norm": 0.028076171875, + "learning_rate": 0.027210306078412023, + "loss": 0.8094, + "num_input_tokens_seen": 21629328, + "step": 37270 + }, + { + "epoch": 5.55183199285076, + "grad_norm": 0.0203857421875, + "learning_rate": 0.027209173546159957, + "loss": 0.8078, + "num_input_tokens_seen": 21632240, + "step": 37275 + }, + { + "epoch": 5.5525767053917185, + "grad_norm": 0.01904296875, + "learning_rate": 0.027208040807646317, + "loss": 0.7845, + "num_input_tokens_seen": 21635024, + "step": 37280 + }, + { + "epoch": 5.553321417932678, + "grad_norm": 0.025390625, + "learning_rate": 0.027206907862890227, + "loss": 0.7913, + "num_input_tokens_seen": 21637872, + "step": 37285 + }, + { + "epoch": 5.554066130473637, + "grad_norm": 0.0274658203125, + "learning_rate": 0.02720577471191084, + "loss": 0.7957, + "num_input_tokens_seen": 21640944, + "step": 37290 + }, + { + "epoch": 5.554810843014597, + "grad_norm": 0.0218505859375, + "learning_rate": 0.027204641354727298, + "loss": 0.7838, + "num_input_tokens_seen": 21644048, + "step": 37295 + }, + { + "epoch": 5.555555555555555, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02720350779135874, + "loss": 0.7839, + "num_input_tokens_seen": 21646832, + "step": 37300 + }, + { + "epoch": 5.556300268096515, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02720237402182432, + "loss": 0.8184, + "num_input_tokens_seen": 21649680, + "step": 37305 + }, + { + "epoch": 5.557044980637474, + "grad_norm": 0.03369140625, + "learning_rate": 0.0272012400461432, + "loss": 0.7917, + "num_input_tokens_seen": 21652464, + "step": 37310 + }, + { + "epoch": 5.557789693178433, + "grad_norm": 0.0299072265625, + "learning_rate": 0.027200105864334523, + "loss": 0.8407, + "num_input_tokens_seen": 21655344, + "step": 37315 + }, + { + "epoch": 5.558534405719392, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02719897147641746, + "loss": 0.8127, + "num_input_tokens_seen": 21658480, + "step": 37320 + }, + { + "epoch": 5.559279118260352, + "grad_norm": 0.01446533203125, + "learning_rate": 0.02719783688241117, + "loss": 0.7741, + "num_input_tokens_seen": 21661232, + "step": 37325 + }, + { + "epoch": 5.560023830801311, + "grad_norm": 0.0294189453125, + "learning_rate": 0.027196702082334823, + "loss": 0.8365, + "num_input_tokens_seen": 21664112, + "step": 37330 + }, + { + "epoch": 5.56076854334227, + "grad_norm": 0.019775390625, + "learning_rate": 0.027195567076207593, + "loss": 0.7886, + "num_input_tokens_seen": 21666640, + "step": 37335 + }, + { + "epoch": 5.561513255883229, + "grad_norm": 0.0302734375, + "learning_rate": 0.027194431864048654, + "loss": 0.7904, + "num_input_tokens_seen": 21669968, + "step": 37340 + }, + { + "epoch": 5.562257968424189, + "grad_norm": 0.02685546875, + "learning_rate": 0.02719329644587718, + "loss": 0.798, + "num_input_tokens_seen": 21672912, + "step": 37345 + }, + { + "epoch": 5.563002680965147, + "grad_norm": 0.0184326171875, + "learning_rate": 0.027192160821712354, + "loss": 0.811, + "num_input_tokens_seen": 21675760, + "step": 37350 + }, + { + "epoch": 5.563747393506107, + "grad_norm": 0.01220703125, + "learning_rate": 0.027191024991573366, + "loss": 0.7917, + "num_input_tokens_seen": 21678480, + "step": 37355 + }, + { + "epoch": 5.564492106047066, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0271898889554794, + "loss": 0.837, + "num_input_tokens_seen": 21681360, + "step": 37360 + }, + { + "epoch": 5.5652368185880245, + "grad_norm": 0.0291748046875, + "learning_rate": 0.027188752713449647, + "loss": 0.8118, + "num_input_tokens_seen": 21684496, + "step": 37365 + }, + { + "epoch": 5.565981531128984, + "grad_norm": 0.0191650390625, + "learning_rate": 0.027187616265503308, + "loss": 0.8327, + "num_input_tokens_seen": 21687152, + "step": 37370 + }, + { + "epoch": 5.566726243669944, + "grad_norm": 0.01953125, + "learning_rate": 0.027186479611659577, + "loss": 0.7972, + "num_input_tokens_seen": 21689936, + "step": 37375 + }, + { + "epoch": 5.567470956210903, + "grad_norm": 0.0284423828125, + "learning_rate": 0.027185342751937663, + "loss": 0.8217, + "num_input_tokens_seen": 21692592, + "step": 37380 + }, + { + "epoch": 5.568215668751861, + "grad_norm": 0.0247802734375, + "learning_rate": 0.02718420568635676, + "loss": 0.8071, + "num_input_tokens_seen": 21695440, + "step": 37385 + }, + { + "epoch": 5.568960381292821, + "grad_norm": 0.01287841796875, + "learning_rate": 0.027183068414936094, + "loss": 0.7851, + "num_input_tokens_seen": 21698640, + "step": 37390 + }, + { + "epoch": 5.569705093833781, + "grad_norm": 0.0169677734375, + "learning_rate": 0.027181930937694863, + "loss": 0.7886, + "num_input_tokens_seen": 21701328, + "step": 37395 + }, + { + "epoch": 5.570449806374739, + "grad_norm": 0.0135498046875, + "learning_rate": 0.027180793254652287, + "loss": 0.8214, + "num_input_tokens_seen": 21704112, + "step": 37400 + }, + { + "epoch": 5.571194518915698, + "grad_norm": 0.0191650390625, + "learning_rate": 0.027179655365827598, + "loss": 0.8004, + "num_input_tokens_seen": 21706736, + "step": 37405 + }, + { + "epoch": 5.571939231456658, + "grad_norm": 0.029541015625, + "learning_rate": 0.027178517271240005, + "loss": 0.7882, + "num_input_tokens_seen": 21709360, + "step": 37410 + }, + { + "epoch": 5.572683943997617, + "grad_norm": 0.0245361328125, + "learning_rate": 0.02717737897090874, + "loss": 0.8204, + "num_input_tokens_seen": 21711920, + "step": 37415 + }, + { + "epoch": 5.573428656538576, + "grad_norm": 0.020751953125, + "learning_rate": 0.02717624046485304, + "loss": 0.7926, + "num_input_tokens_seen": 21714928, + "step": 37420 + }, + { + "epoch": 5.574173369079535, + "grad_norm": 0.025634765625, + "learning_rate": 0.02717510175309213, + "loss": 0.8038, + "num_input_tokens_seen": 21717872, + "step": 37425 + }, + { + "epoch": 5.574918081620495, + "grad_norm": 0.0203857421875, + "learning_rate": 0.027173962835645243, + "loss": 0.8018, + "num_input_tokens_seen": 21721072, + "step": 37430 + }, + { + "epoch": 5.575662794161453, + "grad_norm": 0.028564453125, + "learning_rate": 0.027172823712531633, + "loss": 0.8177, + "num_input_tokens_seen": 21724208, + "step": 37435 + }, + { + "epoch": 5.576407506702413, + "grad_norm": 0.025390625, + "learning_rate": 0.027171684383770538, + "loss": 0.8049, + "num_input_tokens_seen": 21727248, + "step": 37440 + }, + { + "epoch": 5.577152219243372, + "grad_norm": 0.02001953125, + "learning_rate": 0.027170544849381206, + "loss": 0.7916, + "num_input_tokens_seen": 21730448, + "step": 37445 + }, + { + "epoch": 5.577896931784331, + "grad_norm": 0.013671875, + "learning_rate": 0.02716940510938289, + "loss": 0.8149, + "num_input_tokens_seen": 21733488, + "step": 37450 + }, + { + "epoch": 5.57864164432529, + "grad_norm": 0.0245361328125, + "learning_rate": 0.02716826516379484, + "loss": 0.7991, + "num_input_tokens_seen": 21736432, + "step": 37455 + }, + { + "epoch": 5.57938635686625, + "grad_norm": 0.029052734375, + "learning_rate": 0.02716712501263632, + "loss": 0.8101, + "num_input_tokens_seen": 21739184, + "step": 37460 + }, + { + "epoch": 5.580131069407209, + "grad_norm": 0.019287109375, + "learning_rate": 0.027165984655926585, + "loss": 0.7882, + "num_input_tokens_seen": 21741776, + "step": 37465 + }, + { + "epoch": 5.580875781948168, + "grad_norm": 0.03076171875, + "learning_rate": 0.027164844093684908, + "loss": 0.7825, + "num_input_tokens_seen": 21744528, + "step": 37470 + }, + { + "epoch": 5.581620494489127, + "grad_norm": 0.03271484375, + "learning_rate": 0.027163703325930554, + "loss": 0.8138, + "num_input_tokens_seen": 21747312, + "step": 37475 + }, + { + "epoch": 5.582365207030087, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02716256235268279, + "loss": 0.8057, + "num_input_tokens_seen": 21750096, + "step": 37480 + }, + { + "epoch": 5.583109919571045, + "grad_norm": 0.022216796875, + "learning_rate": 0.027161421173960903, + "loss": 0.8083, + "num_input_tokens_seen": 21753104, + "step": 37485 + }, + { + "epoch": 5.583854632112005, + "grad_norm": 0.02001953125, + "learning_rate": 0.027160279789784164, + "loss": 0.8168, + "num_input_tokens_seen": 21755888, + "step": 37490 + }, + { + "epoch": 5.584599344652964, + "grad_norm": 0.0155029296875, + "learning_rate": 0.02715913820017185, + "loss": 0.7994, + "num_input_tokens_seen": 21758736, + "step": 37495 + }, + { + "epoch": 5.5853440571939235, + "grad_norm": 0.0244140625, + "learning_rate": 0.027157996405143263, + "loss": 0.7984, + "num_input_tokens_seen": 21761616, + "step": 37500 + }, + { + "epoch": 5.586088769734882, + "grad_norm": 0.0211181640625, + "learning_rate": 0.02715685440471768, + "loss": 0.8104, + "num_input_tokens_seen": 21764336, + "step": 37505 + }, + { + "epoch": 5.586833482275842, + "grad_norm": 0.0155029296875, + "learning_rate": 0.0271557121989144, + "loss": 0.8003, + "num_input_tokens_seen": 21767280, + "step": 37510 + }, + { + "epoch": 5.587578194816801, + "grad_norm": 0.0115966796875, + "learning_rate": 0.027154569787752714, + "loss": 0.8151, + "num_input_tokens_seen": 21770224, + "step": 37515 + }, + { + "epoch": 5.58832290735776, + "grad_norm": 0.01519775390625, + "learning_rate": 0.027153427171251923, + "loss": 0.8117, + "num_input_tokens_seen": 21773200, + "step": 37520 + }, + { + "epoch": 5.589067619898719, + "grad_norm": 0.0208740234375, + "learning_rate": 0.02715228434943134, + "loss": 0.7792, + "num_input_tokens_seen": 21776176, + "step": 37525 + }, + { + "epoch": 5.589812332439678, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02715114132231026, + "loss": 0.7798, + "num_input_tokens_seen": 21778928, + "step": 37530 + }, + { + "epoch": 5.590557044980637, + "grad_norm": 0.045654296875, + "learning_rate": 0.027149998089907992, + "loss": 0.8047, + "num_input_tokens_seen": 21782192, + "step": 37535 + }, + { + "epoch": 5.591301757521597, + "grad_norm": 0.021484375, + "learning_rate": 0.02714885465224386, + "loss": 0.798, + "num_input_tokens_seen": 21785040, + "step": 37540 + }, + { + "epoch": 5.592046470062556, + "grad_norm": 0.0206298828125, + "learning_rate": 0.027147711009337175, + "loss": 0.7923, + "num_input_tokens_seen": 21787792, + "step": 37545 + }, + { + "epoch": 5.592791182603515, + "grad_norm": 0.0157470703125, + "learning_rate": 0.02714656716120726, + "loss": 0.8058, + "num_input_tokens_seen": 21790576, + "step": 37550 + }, + { + "epoch": 5.593535895144474, + "grad_norm": 0.02294921875, + "learning_rate": 0.027145423107873438, + "loss": 0.8065, + "num_input_tokens_seen": 21793360, + "step": 37555 + }, + { + "epoch": 5.594280607685434, + "grad_norm": 0.0211181640625, + "learning_rate": 0.027144278849355035, + "loss": 0.793, + "num_input_tokens_seen": 21796304, + "step": 37560 + }, + { + "epoch": 5.595025320226393, + "grad_norm": 0.023193359375, + "learning_rate": 0.027143134385671387, + "loss": 0.8038, + "num_input_tokens_seen": 21799568, + "step": 37565 + }, + { + "epoch": 5.595770032767351, + "grad_norm": 0.019287109375, + "learning_rate": 0.02714198971684182, + "loss": 0.8035, + "num_input_tokens_seen": 21802384, + "step": 37570 + }, + { + "epoch": 5.596514745308311, + "grad_norm": 0.0150146484375, + "learning_rate": 0.02714084484288568, + "loss": 0.801, + "num_input_tokens_seen": 21805264, + "step": 37575 + }, + { + "epoch": 5.59725945784927, + "grad_norm": 0.01336669921875, + "learning_rate": 0.027139699763822304, + "loss": 0.7944, + "num_input_tokens_seen": 21808048, + "step": 37580 + }, + { + "epoch": 5.5980041703902295, + "grad_norm": 0.0147705078125, + "learning_rate": 0.027138554479671038, + "loss": 0.8161, + "num_input_tokens_seen": 21810992, + "step": 37585 + }, + { + "epoch": 5.598748882931188, + "grad_norm": 0.028564453125, + "learning_rate": 0.027137408990451237, + "loss": 0.7767, + "num_input_tokens_seen": 21813648, + "step": 37590 + }, + { + "epoch": 5.599493595472148, + "grad_norm": 0.022216796875, + "learning_rate": 0.027136263296182246, + "loss": 0.8009, + "num_input_tokens_seen": 21816624, + "step": 37595 + }, + { + "epoch": 5.600238308013107, + "grad_norm": 0.0189208984375, + "learning_rate": 0.027135117396883415, + "loss": 0.7897, + "num_input_tokens_seen": 21819600, + "step": 37600 + }, + { + "epoch": 5.600983020554066, + "grad_norm": 0.0240478515625, + "learning_rate": 0.027133971292574114, + "loss": 0.8459, + "num_input_tokens_seen": 21822480, + "step": 37605 + }, + { + "epoch": 5.601727733095025, + "grad_norm": 0.0279541015625, + "learning_rate": 0.027132824983273702, + "loss": 0.8148, + "num_input_tokens_seen": 21825328, + "step": 37610 + }, + { + "epoch": 5.602472445635985, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02713167846900154, + "loss": 0.7789, + "num_input_tokens_seen": 21827952, + "step": 37615 + }, + { + "epoch": 5.603217158176943, + "grad_norm": 0.01904296875, + "learning_rate": 0.027130531749777, + "loss": 0.799, + "num_input_tokens_seen": 21830640, + "step": 37620 + }, + { + "epoch": 5.603961870717903, + "grad_norm": 0.01953125, + "learning_rate": 0.027129384825619458, + "loss": 0.8033, + "num_input_tokens_seen": 21833584, + "step": 37625 + }, + { + "epoch": 5.604706583258862, + "grad_norm": 0.0274658203125, + "learning_rate": 0.027128237696548287, + "loss": 0.8223, + "num_input_tokens_seen": 21836336, + "step": 37630 + }, + { + "epoch": 5.6054512957998215, + "grad_norm": 0.017578125, + "learning_rate": 0.027127090362582863, + "loss": 0.8038, + "num_input_tokens_seen": 21839248, + "step": 37635 + }, + { + "epoch": 5.60619600834078, + "grad_norm": 0.013427734375, + "learning_rate": 0.027125942823742574, + "loss": 0.7945, + "num_input_tokens_seen": 21842096, + "step": 37640 + }, + { + "epoch": 5.60694072088174, + "grad_norm": 0.041748046875, + "learning_rate": 0.027124795080046806, + "loss": 0.8301, + "num_input_tokens_seen": 21845040, + "step": 37645 + }, + { + "epoch": 5.607685433422699, + "grad_norm": 0.02099609375, + "learning_rate": 0.02712364713151495, + "loss": 0.7913, + "num_input_tokens_seen": 21847760, + "step": 37650 + }, + { + "epoch": 5.608430145963658, + "grad_norm": 0.0301513671875, + "learning_rate": 0.027122498978166393, + "loss": 0.7924, + "num_input_tokens_seen": 21851088, + "step": 37655 + }, + { + "epoch": 5.609174858504617, + "grad_norm": 0.01409912109375, + "learning_rate": 0.02712135062002054, + "loss": 0.7976, + "num_input_tokens_seen": 21854096, + "step": 37660 + }, + { + "epoch": 5.609919571045577, + "grad_norm": 0.01409912109375, + "learning_rate": 0.02712020205709679, + "loss": 0.8052, + "num_input_tokens_seen": 21856944, + "step": 37665 + }, + { + "epoch": 5.6106642835865355, + "grad_norm": 0.0120849609375, + "learning_rate": 0.027119053289414538, + "loss": 0.7808, + "num_input_tokens_seen": 21859792, + "step": 37670 + }, + { + "epoch": 5.611408996127495, + "grad_norm": 0.0252685546875, + "learning_rate": 0.027117904316993204, + "loss": 0.8224, + "num_input_tokens_seen": 21862896, + "step": 37675 + }, + { + "epoch": 5.612153708668454, + "grad_norm": 0.01153564453125, + "learning_rate": 0.027116755139852188, + "loss": 0.7913, + "num_input_tokens_seen": 21865808, + "step": 37680 + }, + { + "epoch": 5.6128984212094135, + "grad_norm": 0.01324462890625, + "learning_rate": 0.027115605758010915, + "loss": 0.7965, + "num_input_tokens_seen": 21868720, + "step": 37685 + }, + { + "epoch": 5.613643133750372, + "grad_norm": 0.0194091796875, + "learning_rate": 0.027114456171488794, + "loss": 0.8107, + "num_input_tokens_seen": 21871472, + "step": 37690 + }, + { + "epoch": 5.614387846291332, + "grad_norm": 0.0211181640625, + "learning_rate": 0.027113306380305242, + "loss": 0.7938, + "num_input_tokens_seen": 21874320, + "step": 37695 + }, + { + "epoch": 5.615132558832291, + "grad_norm": 0.019287109375, + "learning_rate": 0.027112156384479696, + "loss": 0.8, + "num_input_tokens_seen": 21877168, + "step": 37700 + }, + { + "epoch": 5.61587727137325, + "grad_norm": 0.016357421875, + "learning_rate": 0.027111006184031578, + "loss": 0.8005, + "num_input_tokens_seen": 21880016, + "step": 37705 + }, + { + "epoch": 5.616621983914209, + "grad_norm": 0.0205078125, + "learning_rate": 0.02710985577898032, + "loss": 0.8005, + "num_input_tokens_seen": 21882704, + "step": 37710 + }, + { + "epoch": 5.617366696455168, + "grad_norm": 0.0216064453125, + "learning_rate": 0.027108705169345354, + "loss": 0.776, + "num_input_tokens_seen": 21885744, + "step": 37715 + }, + { + "epoch": 5.6181114089961275, + "grad_norm": 0.01904296875, + "learning_rate": 0.027107554355146127, + "loss": 0.7884, + "num_input_tokens_seen": 21888560, + "step": 37720 + }, + { + "epoch": 5.618856121537087, + "grad_norm": 0.0223388671875, + "learning_rate": 0.027106403336402067, + "loss": 0.7984, + "num_input_tokens_seen": 21891600, + "step": 37725 + }, + { + "epoch": 5.619600834078046, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02710525211313263, + "loss": 0.7841, + "num_input_tokens_seen": 21894416, + "step": 37730 + }, + { + "epoch": 5.620345546619005, + "grad_norm": 0.015380859375, + "learning_rate": 0.027104100685357263, + "loss": 0.7817, + "num_input_tokens_seen": 21897712, + "step": 37735 + }, + { + "epoch": 5.621090259159964, + "grad_norm": 0.0252685546875, + "learning_rate": 0.027102949053095418, + "loss": 0.807, + "num_input_tokens_seen": 21900752, + "step": 37740 + }, + { + "epoch": 5.621834971700923, + "grad_norm": 0.0145263671875, + "learning_rate": 0.027101797216366547, + "loss": 0.8034, + "num_input_tokens_seen": 21903728, + "step": 37745 + }, + { + "epoch": 5.622579684241883, + "grad_norm": 0.02001953125, + "learning_rate": 0.027100645175190112, + "loss": 0.7893, + "num_input_tokens_seen": 21906704, + "step": 37750 + }, + { + "epoch": 5.6233243967828415, + "grad_norm": 0.025146484375, + "learning_rate": 0.027099492929585575, + "loss": 0.7774, + "num_input_tokens_seen": 21909552, + "step": 37755 + }, + { + "epoch": 5.624069109323801, + "grad_norm": 0.0206298828125, + "learning_rate": 0.027098340479572405, + "loss": 0.8234, + "num_input_tokens_seen": 21912432, + "step": 37760 + }, + { + "epoch": 5.62481382186476, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02709718782517007, + "loss": 0.8051, + "num_input_tokens_seen": 21915408, + "step": 37765 + }, + { + "epoch": 5.6255585344057195, + "grad_norm": 0.0245361328125, + "learning_rate": 0.027096034966398036, + "loss": 0.8221, + "num_input_tokens_seen": 21918256, + "step": 37770 + }, + { + "epoch": 5.626303246946678, + "grad_norm": 0.0196533203125, + "learning_rate": 0.027094881903275785, + "loss": 0.8138, + "num_input_tokens_seen": 21921392, + "step": 37775 + }, + { + "epoch": 5.627047959487638, + "grad_norm": 0.01300048828125, + "learning_rate": 0.027093728635822805, + "loss": 0.8188, + "num_input_tokens_seen": 21924240, + "step": 37780 + }, + { + "epoch": 5.627792672028597, + "grad_norm": 0.017822265625, + "learning_rate": 0.027092575164058563, + "loss": 0.8369, + "num_input_tokens_seen": 21927184, + "step": 37785 + }, + { + "epoch": 5.628537384569556, + "grad_norm": 0.01177978515625, + "learning_rate": 0.02709142148800256, + "loss": 0.8015, + "num_input_tokens_seen": 21929936, + "step": 37790 + }, + { + "epoch": 5.629282097110515, + "grad_norm": 0.0228271484375, + "learning_rate": 0.027090267607674273, + "loss": 0.7924, + "num_input_tokens_seen": 21932912, + "step": 37795 + }, + { + "epoch": 5.630026809651475, + "grad_norm": 0.0123291015625, + "learning_rate": 0.027089113523093206, + "loss": 0.7842, + "num_input_tokens_seen": 21935888, + "step": 37800 + }, + { + "epoch": 5.6307715221924335, + "grad_norm": 0.02099609375, + "learning_rate": 0.027087959234278855, + "loss": 0.787, + "num_input_tokens_seen": 21938672, + "step": 37805 + }, + { + "epoch": 5.631516234733393, + "grad_norm": 0.024658203125, + "learning_rate": 0.027086804741250716, + "loss": 0.8063, + "num_input_tokens_seen": 21941552, + "step": 37810 + }, + { + "epoch": 5.632260947274352, + "grad_norm": 0.01373291015625, + "learning_rate": 0.027085650044028298, + "loss": 0.7657, + "num_input_tokens_seen": 21944336, + "step": 37815 + }, + { + "epoch": 5.6330056598153115, + "grad_norm": 0.0279541015625, + "learning_rate": 0.027084495142631105, + "loss": 0.7941, + "num_input_tokens_seen": 21947056, + "step": 37820 + }, + { + "epoch": 5.63375037235627, + "grad_norm": 0.036376953125, + "learning_rate": 0.027083340037078647, + "loss": 0.8151, + "num_input_tokens_seen": 21949872, + "step": 37825 + }, + { + "epoch": 5.63449508489723, + "grad_norm": 0.015869140625, + "learning_rate": 0.027082184727390442, + "loss": 0.7876, + "num_input_tokens_seen": 21952368, + "step": 37830 + }, + { + "epoch": 5.635239797438189, + "grad_norm": 0.0179443359375, + "learning_rate": 0.027081029213586006, + "loss": 0.8079, + "num_input_tokens_seen": 21955344, + "step": 37835 + }, + { + "epoch": 5.635984509979148, + "grad_norm": 0.0147705078125, + "learning_rate": 0.02707987349568486, + "loss": 0.8174, + "num_input_tokens_seen": 21958064, + "step": 37840 + }, + { + "epoch": 5.636729222520107, + "grad_norm": 0.0177001953125, + "learning_rate": 0.027078717573706525, + "loss": 0.8062, + "num_input_tokens_seen": 21961072, + "step": 37845 + }, + { + "epoch": 5.637473935061067, + "grad_norm": 0.0279541015625, + "learning_rate": 0.027077561447670536, + "loss": 0.8035, + "num_input_tokens_seen": 21964208, + "step": 37850 + }, + { + "epoch": 5.6382186476020255, + "grad_norm": 0.031494140625, + "learning_rate": 0.02707640511759642, + "loss": 0.8254, + "num_input_tokens_seen": 21967120, + "step": 37855 + }, + { + "epoch": 5.638963360142985, + "grad_norm": 0.025390625, + "learning_rate": 0.027075248583503717, + "loss": 0.8054, + "num_input_tokens_seen": 21970000, + "step": 37860 + }, + { + "epoch": 5.639708072683944, + "grad_norm": 0.0224609375, + "learning_rate": 0.027074091845411957, + "loss": 0.7953, + "num_input_tokens_seen": 21972944, + "step": 37865 + }, + { + "epoch": 5.640452785224904, + "grad_norm": 0.0303955078125, + "learning_rate": 0.02707293490334069, + "loss": 0.7857, + "num_input_tokens_seen": 21975792, + "step": 37870 + }, + { + "epoch": 5.641197497765862, + "grad_norm": 0.01483154296875, + "learning_rate": 0.027071777757309456, + "loss": 0.8052, + "num_input_tokens_seen": 21978448, + "step": 37875 + }, + { + "epoch": 5.641942210306821, + "grad_norm": 0.021484375, + "learning_rate": 0.027070620407337806, + "loss": 0.7787, + "num_input_tokens_seen": 21981552, + "step": 37880 + }, + { + "epoch": 5.642686922847781, + "grad_norm": 0.020263671875, + "learning_rate": 0.027069462853445293, + "loss": 0.8049, + "num_input_tokens_seen": 21984304, + "step": 37885 + }, + { + "epoch": 5.64343163538874, + "grad_norm": 0.0184326171875, + "learning_rate": 0.027068305095651474, + "loss": 0.7892, + "num_input_tokens_seen": 21986928, + "step": 37890 + }, + { + "epoch": 5.644176347929699, + "grad_norm": 0.0291748046875, + "learning_rate": 0.027067147133975906, + "loss": 0.7978, + "num_input_tokens_seen": 21989680, + "step": 37895 + }, + { + "epoch": 5.644921060470658, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02706598896843815, + "loss": 0.8105, + "num_input_tokens_seen": 21992464, + "step": 37900 + }, + { + "epoch": 5.6456657730116175, + "grad_norm": 0.0205078125, + "learning_rate": 0.027064830599057777, + "loss": 0.7823, + "num_input_tokens_seen": 21995216, + "step": 37905 + }, + { + "epoch": 5.646410485552577, + "grad_norm": 0.0264892578125, + "learning_rate": 0.027063672025854347, + "loss": 0.8063, + "num_input_tokens_seen": 21998032, + "step": 37910 + }, + { + "epoch": 5.647155198093536, + "grad_norm": 0.018798828125, + "learning_rate": 0.02706251324884744, + "loss": 0.8248, + "num_input_tokens_seen": 22001008, + "step": 37915 + }, + { + "epoch": 5.647899910634495, + "grad_norm": 0.01513671875, + "learning_rate": 0.02706135426805663, + "loss": 0.8044, + "num_input_tokens_seen": 22004080, + "step": 37920 + }, + { + "epoch": 5.648644623175454, + "grad_norm": 0.0177001953125, + "learning_rate": 0.02706019508350151, + "loss": 0.8241, + "num_input_tokens_seen": 22007248, + "step": 37925 + }, + { + "epoch": 5.649389335716413, + "grad_norm": 0.02587890625, + "learning_rate": 0.027059035695201638, + "loss": 0.7925, + "num_input_tokens_seen": 22010128, + "step": 37930 + }, + { + "epoch": 5.650134048257373, + "grad_norm": 0.0260009765625, + "learning_rate": 0.02705787610317662, + "loss": 0.7696, + "num_input_tokens_seen": 22012848, + "step": 37935 + }, + { + "epoch": 5.6508787607983315, + "grad_norm": 0.0277099609375, + "learning_rate": 0.027056716307446042, + "loss": 0.7527, + "num_input_tokens_seen": 22015888, + "step": 37940 + }, + { + "epoch": 5.651623473339291, + "grad_norm": 0.02734375, + "learning_rate": 0.027055556308029492, + "loss": 0.8019, + "num_input_tokens_seen": 22018704, + "step": 37945 + }, + { + "epoch": 5.65236818588025, + "grad_norm": 0.028076171875, + "learning_rate": 0.027054396104946574, + "loss": 0.8082, + "num_input_tokens_seen": 22021264, + "step": 37950 + }, + { + "epoch": 5.65311289842121, + "grad_norm": 0.01239013671875, + "learning_rate": 0.027053235698216885, + "loss": 0.8192, + "num_input_tokens_seen": 22023984, + "step": 37955 + }, + { + "epoch": 5.653857610962168, + "grad_norm": 0.020263671875, + "learning_rate": 0.027052075087860027, + "loss": 0.7908, + "num_input_tokens_seen": 22026960, + "step": 37960 + }, + { + "epoch": 5.654602323503128, + "grad_norm": 0.017822265625, + "learning_rate": 0.02705091427389561, + "loss": 0.7949, + "num_input_tokens_seen": 22029616, + "step": 37965 + }, + { + "epoch": 5.655347036044087, + "grad_norm": 0.031005859375, + "learning_rate": 0.027049753256343245, + "loss": 0.7996, + "num_input_tokens_seen": 22032720, + "step": 37970 + }, + { + "epoch": 5.656091748585046, + "grad_norm": 0.025146484375, + "learning_rate": 0.027048592035222547, + "loss": 0.8141, + "num_input_tokens_seen": 22035888, + "step": 37975 + }, + { + "epoch": 5.656836461126005, + "grad_norm": 0.03564453125, + "learning_rate": 0.02704743061055313, + "loss": 0.8035, + "num_input_tokens_seen": 22038640, + "step": 37980 + }, + { + "epoch": 5.657581173666965, + "grad_norm": 0.023681640625, + "learning_rate": 0.02704626898235462, + "loss": 0.8301, + "num_input_tokens_seen": 22041264, + "step": 37985 + }, + { + "epoch": 5.6583258862079235, + "grad_norm": 0.0390625, + "learning_rate": 0.027045107150646636, + "loss": 0.8494, + "num_input_tokens_seen": 22044048, + "step": 37990 + }, + { + "epoch": 5.659070598748883, + "grad_norm": 0.0220947265625, + "learning_rate": 0.027043945115448807, + "loss": 0.8247, + "num_input_tokens_seen": 22046512, + "step": 37995 + }, + { + "epoch": 5.659815311289842, + "grad_norm": 0.0247802734375, + "learning_rate": 0.02704278287678077, + "loss": 0.8085, + "num_input_tokens_seen": 22049424, + "step": 38000 + }, + { + "epoch": 5.660560023830802, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02704162043466216, + "loss": 0.7797, + "num_input_tokens_seen": 22052464, + "step": 38005 + }, + { + "epoch": 5.66130473637176, + "grad_norm": 0.0125732421875, + "learning_rate": 0.027040457789112602, + "loss": 0.8001, + "num_input_tokens_seen": 22055216, + "step": 38010 + }, + { + "epoch": 5.66204944891272, + "grad_norm": 0.0224609375, + "learning_rate": 0.027039294940151753, + "loss": 0.7852, + "num_input_tokens_seen": 22058224, + "step": 38015 + }, + { + "epoch": 5.662794161453679, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02703813188779925, + "loss": 0.8132, + "num_input_tokens_seen": 22061392, + "step": 38020 + }, + { + "epoch": 5.663538873994638, + "grad_norm": 0.0255126953125, + "learning_rate": 0.027036968632074745, + "loss": 0.7883, + "num_input_tokens_seen": 22064336, + "step": 38025 + }, + { + "epoch": 5.664283586535597, + "grad_norm": 0.0224609375, + "learning_rate": 0.027035805172997886, + "loss": 0.8283, + "num_input_tokens_seen": 22067536, + "step": 38030 + }, + { + "epoch": 5.665028299076557, + "grad_norm": 0.020263671875, + "learning_rate": 0.027034641510588337, + "loss": 0.8127, + "num_input_tokens_seen": 22070352, + "step": 38035 + }, + { + "epoch": 5.665773011617516, + "grad_norm": 0.01470947265625, + "learning_rate": 0.027033477644865744, + "loss": 0.8173, + "num_input_tokens_seen": 22073360, + "step": 38040 + }, + { + "epoch": 5.666517724158475, + "grad_norm": 0.019287109375, + "learning_rate": 0.02703231357584978, + "loss": 0.805, + "num_input_tokens_seen": 22076048, + "step": 38045 + }, + { + "epoch": 5.667262436699434, + "grad_norm": 0.0252685546875, + "learning_rate": 0.02703114930356011, + "loss": 0.811, + "num_input_tokens_seen": 22078992, + "step": 38050 + }, + { + "epoch": 5.668007149240394, + "grad_norm": 0.0213623046875, + "learning_rate": 0.0270299848280164, + "loss": 0.7976, + "num_input_tokens_seen": 22081488, + "step": 38055 + }, + { + "epoch": 5.668751861781352, + "grad_norm": 0.0206298828125, + "learning_rate": 0.027028820149238315, + "loss": 0.8047, + "num_input_tokens_seen": 22084368, + "step": 38060 + }, + { + "epoch": 5.669496574322311, + "grad_norm": 0.027587890625, + "learning_rate": 0.027027655267245543, + "loss": 0.7762, + "num_input_tokens_seen": 22086992, + "step": 38065 + }, + { + "epoch": 5.670241286863271, + "grad_norm": 0.01397705078125, + "learning_rate": 0.027026490182057765, + "loss": 0.8124, + "num_input_tokens_seen": 22089840, + "step": 38070 + }, + { + "epoch": 5.67098599940423, + "grad_norm": 0.0181884765625, + "learning_rate": 0.027025324893694653, + "loss": 0.8084, + "num_input_tokens_seen": 22092752, + "step": 38075 + }, + { + "epoch": 5.671730711945189, + "grad_norm": 0.02587890625, + "learning_rate": 0.027024159402175903, + "loss": 0.8022, + "num_input_tokens_seen": 22095408, + "step": 38080 + }, + { + "epoch": 5.672475424486148, + "grad_norm": 0.0191650390625, + "learning_rate": 0.0270229937075212, + "loss": 0.788, + "num_input_tokens_seen": 22098032, + "step": 38085 + }, + { + "epoch": 5.673220137027108, + "grad_norm": 0.022705078125, + "learning_rate": 0.027021827809750234, + "loss": 0.8192, + "num_input_tokens_seen": 22100880, + "step": 38090 + }, + { + "epoch": 5.673964849568066, + "grad_norm": 0.0179443359375, + "learning_rate": 0.027020661708882707, + "loss": 0.8005, + "num_input_tokens_seen": 22103952, + "step": 38095 + }, + { + "epoch": 5.674709562109026, + "grad_norm": 0.016845703125, + "learning_rate": 0.02701949540493832, + "loss": 0.791, + "num_input_tokens_seen": 22106864, + "step": 38100 + }, + { + "epoch": 5.675454274649985, + "grad_norm": 0.0179443359375, + "learning_rate": 0.027018328897936775, + "loss": 0.8006, + "num_input_tokens_seen": 22109840, + "step": 38105 + }, + { + "epoch": 5.676198987190944, + "grad_norm": 0.0286865234375, + "learning_rate": 0.027017162187897778, + "loss": 0.8049, + "num_input_tokens_seen": 22113008, + "step": 38110 + }, + { + "epoch": 5.676943699731903, + "grad_norm": 0.0240478515625, + "learning_rate": 0.027015995274841036, + "loss": 0.7962, + "num_input_tokens_seen": 22115824, + "step": 38115 + }, + { + "epoch": 5.677688412272863, + "grad_norm": 0.012939453125, + "learning_rate": 0.02701482815878627, + "loss": 0.7975, + "num_input_tokens_seen": 22118928, + "step": 38120 + }, + { + "epoch": 5.678433124813822, + "grad_norm": 0.0242919921875, + "learning_rate": 0.02701366083975319, + "loss": 0.8126, + "num_input_tokens_seen": 22122160, + "step": 38125 + }, + { + "epoch": 5.679177837354781, + "grad_norm": 0.0103759765625, + "learning_rate": 0.027012493317761524, + "loss": 0.8057, + "num_input_tokens_seen": 22125072, + "step": 38130 + }, + { + "epoch": 5.67992254989574, + "grad_norm": 0.018798828125, + "learning_rate": 0.02701132559283099, + "loss": 0.8122, + "num_input_tokens_seen": 22127984, + "step": 38135 + }, + { + "epoch": 5.6806672624367, + "grad_norm": 0.02001953125, + "learning_rate": 0.027010157664981316, + "loss": 0.7892, + "num_input_tokens_seen": 22130896, + "step": 38140 + }, + { + "epoch": 5.681411974977658, + "grad_norm": 0.01416015625, + "learning_rate": 0.027008989534232235, + "loss": 0.8056, + "num_input_tokens_seen": 22133808, + "step": 38145 + }, + { + "epoch": 5.682156687518618, + "grad_norm": 0.01513671875, + "learning_rate": 0.027007821200603487, + "loss": 0.8136, + "num_input_tokens_seen": 22136784, + "step": 38150 + }, + { + "epoch": 5.682901400059577, + "grad_norm": 0.02783203125, + "learning_rate": 0.0270066526641148, + "loss": 0.8036, + "num_input_tokens_seen": 22139824, + "step": 38155 + }, + { + "epoch": 5.683646112600536, + "grad_norm": 0.0213623046875, + "learning_rate": 0.027005483924785918, + "loss": 0.8115, + "num_input_tokens_seen": 22142832, + "step": 38160 + }, + { + "epoch": 5.684390825141495, + "grad_norm": 0.018798828125, + "learning_rate": 0.027004314982636588, + "loss": 0.8085, + "num_input_tokens_seen": 22145520, + "step": 38165 + }, + { + "epoch": 5.685135537682455, + "grad_norm": 0.0286865234375, + "learning_rate": 0.027003145837686555, + "loss": 0.7717, + "num_input_tokens_seen": 22148560, + "step": 38170 + }, + { + "epoch": 5.685880250223414, + "grad_norm": 0.01318359375, + "learning_rate": 0.027001976489955577, + "loss": 0.8133, + "num_input_tokens_seen": 22151600, + "step": 38175 + }, + { + "epoch": 5.686624962764373, + "grad_norm": 0.0111083984375, + "learning_rate": 0.0270008069394634, + "loss": 0.7852, + "num_input_tokens_seen": 22154864, + "step": 38180 + }, + { + "epoch": 5.687369675305332, + "grad_norm": 0.020751953125, + "learning_rate": 0.026999637186229792, + "loss": 0.7891, + "num_input_tokens_seen": 22157648, + "step": 38185 + }, + { + "epoch": 5.688114387846292, + "grad_norm": 0.018798828125, + "learning_rate": 0.026998467230274506, + "loss": 0.8026, + "num_input_tokens_seen": 22160560, + "step": 38190 + }, + { + "epoch": 5.68885910038725, + "grad_norm": 0.025146484375, + "learning_rate": 0.026997297071617313, + "loss": 0.7908, + "num_input_tokens_seen": 22163696, + "step": 38195 + }, + { + "epoch": 5.68960381292821, + "grad_norm": 0.01263427734375, + "learning_rate": 0.026996126710277978, + "loss": 0.8, + "num_input_tokens_seen": 22166448, + "step": 38200 + }, + { + "epoch": 5.690348525469169, + "grad_norm": 0.016845703125, + "learning_rate": 0.026994956146276273, + "loss": 0.8268, + "num_input_tokens_seen": 22169232, + "step": 38205 + }, + { + "epoch": 5.6910932380101285, + "grad_norm": 0.0186767578125, + "learning_rate": 0.026993785379631978, + "loss": 0.8057, + "num_input_tokens_seen": 22172368, + "step": 38210 + }, + { + "epoch": 5.691837950551087, + "grad_norm": 0.013671875, + "learning_rate": 0.02699261441036487, + "loss": 0.8282, + "num_input_tokens_seen": 22175152, + "step": 38215 + }, + { + "epoch": 5.692582663092047, + "grad_norm": 0.01953125, + "learning_rate": 0.026991443238494724, + "loss": 0.8017, + "num_input_tokens_seen": 22177648, + "step": 38220 + }, + { + "epoch": 5.693327375633006, + "grad_norm": 0.0301513671875, + "learning_rate": 0.026990271864041338, + "loss": 0.8044, + "num_input_tokens_seen": 22180528, + "step": 38225 + }, + { + "epoch": 5.694072088173964, + "grad_norm": 0.030517578125, + "learning_rate": 0.026989100287024493, + "loss": 0.7848, + "num_input_tokens_seen": 22183600, + "step": 38230 + }, + { + "epoch": 5.694816800714924, + "grad_norm": 0.0137939453125, + "learning_rate": 0.026987928507463985, + "loss": 0.7885, + "num_input_tokens_seen": 22186384, + "step": 38235 + }, + { + "epoch": 5.695561513255884, + "grad_norm": 0.0189208984375, + "learning_rate": 0.026986756525379605, + "loss": 0.8291, + "num_input_tokens_seen": 22189488, + "step": 38240 + }, + { + "epoch": 5.696306225796842, + "grad_norm": 0.037109375, + "learning_rate": 0.02698558434079116, + "loss": 0.8011, + "num_input_tokens_seen": 22192208, + "step": 38245 + }, + { + "epoch": 5.697050938337801, + "grad_norm": 0.0269775390625, + "learning_rate": 0.026984411953718448, + "loss": 0.7845, + "num_input_tokens_seen": 22195248, + "step": 38250 + }, + { + "epoch": 5.697795650878761, + "grad_norm": 0.0257568359375, + "learning_rate": 0.02698323936418128, + "loss": 0.8051, + "num_input_tokens_seen": 22198384, + "step": 38255 + }, + { + "epoch": 5.6985403634197205, + "grad_norm": 0.0133056640625, + "learning_rate": 0.026982066572199457, + "loss": 0.8207, + "num_input_tokens_seen": 22201104, + "step": 38260 + }, + { + "epoch": 5.699285075960679, + "grad_norm": 0.01177978515625, + "learning_rate": 0.0269808935777928, + "loss": 0.8125, + "num_input_tokens_seen": 22203888, + "step": 38265 + }, + { + "epoch": 5.700029788501638, + "grad_norm": 0.0208740234375, + "learning_rate": 0.026979720380981124, + "loss": 0.8118, + "num_input_tokens_seen": 22207120, + "step": 38270 + }, + { + "epoch": 5.700774501042598, + "grad_norm": 0.019775390625, + "learning_rate": 0.026978546981784247, + "loss": 0.7984, + "num_input_tokens_seen": 22210256, + "step": 38275 + }, + { + "epoch": 5.701519213583556, + "grad_norm": 0.0196533203125, + "learning_rate": 0.026977373380221985, + "loss": 0.8031, + "num_input_tokens_seen": 22213136, + "step": 38280 + }, + { + "epoch": 5.702263926124516, + "grad_norm": 0.020263671875, + "learning_rate": 0.026976199576314183, + "loss": 0.8178, + "num_input_tokens_seen": 22216144, + "step": 38285 + }, + { + "epoch": 5.703008638665475, + "grad_norm": 0.0291748046875, + "learning_rate": 0.02697502557008066, + "loss": 0.7949, + "num_input_tokens_seen": 22218704, + "step": 38290 + }, + { + "epoch": 5.7037533512064345, + "grad_norm": 0.0272216796875, + "learning_rate": 0.026973851361541252, + "loss": 0.8109, + "num_input_tokens_seen": 22221680, + "step": 38295 + }, + { + "epoch": 5.704498063747393, + "grad_norm": 0.033447265625, + "learning_rate": 0.02697267695071579, + "loss": 0.8076, + "num_input_tokens_seen": 22224368, + "step": 38300 + }, + { + "epoch": 5.705242776288353, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02697150233762412, + "loss": 0.7889, + "num_input_tokens_seen": 22227280, + "step": 38305 + }, + { + "epoch": 5.705987488829312, + "grad_norm": 0.0198974609375, + "learning_rate": 0.026970327522286088, + "loss": 0.7899, + "num_input_tokens_seen": 22230064, + "step": 38310 + }, + { + "epoch": 5.706732201370271, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02696915250472154, + "loss": 0.7918, + "num_input_tokens_seen": 22233008, + "step": 38315 + }, + { + "epoch": 5.70747691391123, + "grad_norm": 0.03173828125, + "learning_rate": 0.026967977284950317, + "loss": 0.8006, + "num_input_tokens_seen": 22236144, + "step": 38320 + }, + { + "epoch": 5.70822162645219, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02696680186299229, + "loss": 0.7975, + "num_input_tokens_seen": 22239216, + "step": 38325 + }, + { + "epoch": 5.708966338993148, + "grad_norm": 0.018798828125, + "learning_rate": 0.026965626238867304, + "loss": 0.7954, + "num_input_tokens_seen": 22241808, + "step": 38330 + }, + { + "epoch": 5.709711051534108, + "grad_norm": 0.01092529296875, + "learning_rate": 0.026964450412595226, + "loss": 0.7788, + "num_input_tokens_seen": 22244752, + "step": 38335 + }, + { + "epoch": 5.710455764075067, + "grad_norm": 0.02490234375, + "learning_rate": 0.026963274384195917, + "loss": 0.8169, + "num_input_tokens_seen": 22247920, + "step": 38340 + }, + { + "epoch": 5.7112004766160265, + "grad_norm": 0.0264892578125, + "learning_rate": 0.02696209815368925, + "loss": 0.7948, + "num_input_tokens_seen": 22250768, + "step": 38345 + }, + { + "epoch": 5.711945189156985, + "grad_norm": 0.01202392578125, + "learning_rate": 0.02696092172109509, + "loss": 0.8096, + "num_input_tokens_seen": 22253616, + "step": 38350 + }, + { + "epoch": 5.712689901697945, + "grad_norm": 0.029541015625, + "learning_rate": 0.026959745086433312, + "loss": 0.7985, + "num_input_tokens_seen": 22256208, + "step": 38355 + }, + { + "epoch": 5.713434614238904, + "grad_norm": 0.0184326171875, + "learning_rate": 0.026958568249723796, + "loss": 0.807, + "num_input_tokens_seen": 22259024, + "step": 38360 + }, + { + "epoch": 5.714179326779863, + "grad_norm": 0.01904296875, + "learning_rate": 0.026957391210986427, + "loss": 0.7908, + "num_input_tokens_seen": 22262192, + "step": 38365 + }, + { + "epoch": 5.714924039320822, + "grad_norm": 0.018798828125, + "learning_rate": 0.026956213970241085, + "loss": 0.782, + "num_input_tokens_seen": 22264912, + "step": 38370 + }, + { + "epoch": 5.715668751861782, + "grad_norm": 0.0264892578125, + "learning_rate": 0.02695503652750766, + "loss": 0.8431, + "num_input_tokens_seen": 22267824, + "step": 38375 + }, + { + "epoch": 5.7164134644027405, + "grad_norm": 0.0218505859375, + "learning_rate": 0.026953858882806037, + "loss": 0.8131, + "num_input_tokens_seen": 22270960, + "step": 38380 + }, + { + "epoch": 5.7171581769437, + "grad_norm": 0.0218505859375, + "learning_rate": 0.026952681036156126, + "loss": 0.8302, + "num_input_tokens_seen": 22274032, + "step": 38385 + }, + { + "epoch": 5.717902889484659, + "grad_norm": 0.02001953125, + "learning_rate": 0.026951502987577817, + "loss": 0.8079, + "num_input_tokens_seen": 22276848, + "step": 38390 + }, + { + "epoch": 5.718647602025618, + "grad_norm": 0.0213623046875, + "learning_rate": 0.026950324737091007, + "loss": 0.7985, + "num_input_tokens_seen": 22279760, + "step": 38395 + }, + { + "epoch": 5.719392314566577, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02694914628471561, + "loss": 0.7999, + "num_input_tokens_seen": 22282608, + "step": 38400 + }, + { + "epoch": 5.720137027107537, + "grad_norm": 0.013916015625, + "learning_rate": 0.02694796763047153, + "loss": 0.8086, + "num_input_tokens_seen": 22285584, + "step": 38405 + }, + { + "epoch": 5.720881739648496, + "grad_norm": 0.019775390625, + "learning_rate": 0.02694678877437868, + "loss": 0.804, + "num_input_tokens_seen": 22288144, + "step": 38410 + }, + { + "epoch": 5.721626452189454, + "grad_norm": 0.0205078125, + "learning_rate": 0.026945609716456972, + "loss": 0.7899, + "num_input_tokens_seen": 22291056, + "step": 38415 + }, + { + "epoch": 5.722371164730414, + "grad_norm": 0.01953125, + "learning_rate": 0.026944430456726332, + "loss": 0.8088, + "num_input_tokens_seen": 22293808, + "step": 38420 + }, + { + "epoch": 5.723115877271374, + "grad_norm": 0.0203857421875, + "learning_rate": 0.026943250995206683, + "loss": 0.8044, + "num_input_tokens_seen": 22296688, + "step": 38425 + }, + { + "epoch": 5.7238605898123325, + "grad_norm": 0.0125732421875, + "learning_rate": 0.026942071331917943, + "loss": 0.8192, + "num_input_tokens_seen": 22299184, + "step": 38430 + }, + { + "epoch": 5.724605302353291, + "grad_norm": 0.02099609375, + "learning_rate": 0.026940891466880047, + "loss": 0.8064, + "num_input_tokens_seen": 22302160, + "step": 38435 + }, + { + "epoch": 5.725350014894251, + "grad_norm": 0.01806640625, + "learning_rate": 0.026939711400112924, + "loss": 0.8054, + "num_input_tokens_seen": 22305520, + "step": 38440 + }, + { + "epoch": 5.72609472743521, + "grad_norm": 0.0142822265625, + "learning_rate": 0.026938531131636516, + "loss": 0.8062, + "num_input_tokens_seen": 22308656, + "step": 38445 + }, + { + "epoch": 5.726839439976169, + "grad_norm": 0.01953125, + "learning_rate": 0.026937350661470752, + "loss": 0.7839, + "num_input_tokens_seen": 22311472, + "step": 38450 + }, + { + "epoch": 5.727584152517128, + "grad_norm": 0.02001953125, + "learning_rate": 0.026936169989635585, + "loss": 0.8006, + "num_input_tokens_seen": 22314448, + "step": 38455 + }, + { + "epoch": 5.728328865058088, + "grad_norm": 0.0264892578125, + "learning_rate": 0.026934989116150956, + "loss": 0.8123, + "num_input_tokens_seen": 22317392, + "step": 38460 + }, + { + "epoch": 5.7290735775990465, + "grad_norm": 0.0233154296875, + "learning_rate": 0.02693380804103682, + "loss": 0.8033, + "num_input_tokens_seen": 22320400, + "step": 38465 + }, + { + "epoch": 5.729818290140006, + "grad_norm": 0.0208740234375, + "learning_rate": 0.02693262676431312, + "loss": 0.7899, + "num_input_tokens_seen": 22323376, + "step": 38470 + }, + { + "epoch": 5.730563002680965, + "grad_norm": 0.0130615234375, + "learning_rate": 0.026931445285999823, + "loss": 0.8021, + "num_input_tokens_seen": 22326320, + "step": 38475 + }, + { + "epoch": 5.7313077152219245, + "grad_norm": 0.0128173828125, + "learning_rate": 0.02693026360611688, + "loss": 0.7972, + "num_input_tokens_seen": 22328976, + "step": 38480 + }, + { + "epoch": 5.732052427762883, + "grad_norm": 0.0186767578125, + "learning_rate": 0.026929081724684265, + "loss": 0.7955, + "num_input_tokens_seen": 22332080, + "step": 38485 + }, + { + "epoch": 5.732797140303843, + "grad_norm": 0.0281982421875, + "learning_rate": 0.026927899641721934, + "loss": 0.8289, + "num_input_tokens_seen": 22335024, + "step": 38490 + }, + { + "epoch": 5.733541852844802, + "grad_norm": 0.0208740234375, + "learning_rate": 0.02692671735724986, + "loss": 0.8076, + "num_input_tokens_seen": 22337712, + "step": 38495 + }, + { + "epoch": 5.734286565385761, + "grad_norm": 0.0235595703125, + "learning_rate": 0.026925534871288025, + "loss": 0.8109, + "num_input_tokens_seen": 22340752, + "step": 38500 + }, + { + "epoch": 5.73503127792672, + "grad_norm": 0.0279541015625, + "learning_rate": 0.026924352183856394, + "loss": 0.7949, + "num_input_tokens_seen": 22343312, + "step": 38505 + }, + { + "epoch": 5.73577599046768, + "grad_norm": 0.0242919921875, + "learning_rate": 0.02692316929497495, + "loss": 0.7793, + "num_input_tokens_seen": 22346544, + "step": 38510 + }, + { + "epoch": 5.7365207030086385, + "grad_norm": 0.0263671875, + "learning_rate": 0.02692198620466368, + "loss": 0.8171, + "num_input_tokens_seen": 22349392, + "step": 38515 + }, + { + "epoch": 5.737265415549598, + "grad_norm": 0.0220947265625, + "learning_rate": 0.026920802912942573, + "loss": 0.7861, + "num_input_tokens_seen": 22352432, + "step": 38520 + }, + { + "epoch": 5.738010128090557, + "grad_norm": 0.018310546875, + "learning_rate": 0.02691961941983162, + "loss": 0.8013, + "num_input_tokens_seen": 22355536, + "step": 38525 + }, + { + "epoch": 5.7387548406315165, + "grad_norm": 0.0208740234375, + "learning_rate": 0.026918435725350805, + "loss": 0.8092, + "num_input_tokens_seen": 22358384, + "step": 38530 + }, + { + "epoch": 5.739499553172475, + "grad_norm": 0.029541015625, + "learning_rate": 0.026917251829520133, + "loss": 0.7799, + "num_input_tokens_seen": 22361360, + "step": 38535 + }, + { + "epoch": 5.740244265713435, + "grad_norm": 0.021484375, + "learning_rate": 0.026916067732359602, + "loss": 0.8152, + "num_input_tokens_seen": 22364048, + "step": 38540 + }, + { + "epoch": 5.740988978254394, + "grad_norm": 0.0322265625, + "learning_rate": 0.02691488343388922, + "loss": 0.7996, + "num_input_tokens_seen": 22366800, + "step": 38545 + }, + { + "epoch": 5.741733690795353, + "grad_norm": 0.0189208984375, + "learning_rate": 0.026913698934128993, + "loss": 0.7962, + "num_input_tokens_seen": 22369552, + "step": 38550 + }, + { + "epoch": 5.742478403336312, + "grad_norm": 0.0201416015625, + "learning_rate": 0.026912514233098928, + "loss": 0.8041, + "num_input_tokens_seen": 22372496, + "step": 38555 + }, + { + "epoch": 5.743223115877272, + "grad_norm": 0.013671875, + "learning_rate": 0.026911329330819043, + "loss": 0.8063, + "num_input_tokens_seen": 22375376, + "step": 38560 + }, + { + "epoch": 5.7439678284182305, + "grad_norm": 0.024658203125, + "learning_rate": 0.026910144227309363, + "loss": 0.8047, + "num_input_tokens_seen": 22378384, + "step": 38565 + }, + { + "epoch": 5.74471254095919, + "grad_norm": 0.0206298828125, + "learning_rate": 0.026908958922589894, + "loss": 0.8211, + "num_input_tokens_seen": 22380912, + "step": 38570 + }, + { + "epoch": 5.745457253500149, + "grad_norm": 0.01239013671875, + "learning_rate": 0.026907773416680665, + "loss": 0.8079, + "num_input_tokens_seen": 22383888, + "step": 38575 + }, + { + "epoch": 5.746201966041108, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02690658770960171, + "loss": 0.8167, + "num_input_tokens_seen": 22386640, + "step": 38580 + }, + { + "epoch": 5.746946678582067, + "grad_norm": 0.027587890625, + "learning_rate": 0.026905401801373057, + "loss": 0.8028, + "num_input_tokens_seen": 22389552, + "step": 38585 + }, + { + "epoch": 5.747691391123027, + "grad_norm": 0.0189208984375, + "learning_rate": 0.026904215692014743, + "loss": 0.7834, + "num_input_tokens_seen": 22392464, + "step": 38590 + }, + { + "epoch": 5.748436103663986, + "grad_norm": 0.02490234375, + "learning_rate": 0.026903029381546796, + "loss": 0.8296, + "num_input_tokens_seen": 22395248, + "step": 38595 + }, + { + "epoch": 5.7491808162049445, + "grad_norm": 0.015625, + "learning_rate": 0.02690184286998927, + "loss": 0.8117, + "num_input_tokens_seen": 22398384, + "step": 38600 + }, + { + "epoch": 5.749925528745904, + "grad_norm": 0.0145263671875, + "learning_rate": 0.02690065615736221, + "loss": 0.8046, + "num_input_tokens_seen": 22401072, + "step": 38605 + }, + { + "epoch": 5.750670241286863, + "grad_norm": 0.0208740234375, + "learning_rate": 0.026899469243685654, + "loss": 0.8132, + "num_input_tokens_seen": 22403888, + "step": 38610 + }, + { + "epoch": 5.7514149538278225, + "grad_norm": 0.019287109375, + "learning_rate": 0.02689828212897966, + "loss": 0.8005, + "num_input_tokens_seen": 22406832, + "step": 38615 + }, + { + "epoch": 5.752159666368781, + "grad_norm": 0.020263671875, + "learning_rate": 0.026897094813264286, + "loss": 0.8088, + "num_input_tokens_seen": 22409616, + "step": 38620 + }, + { + "epoch": 5.752904378909741, + "grad_norm": 0.0267333984375, + "learning_rate": 0.02689590729655958, + "loss": 0.7857, + "num_input_tokens_seen": 22412656, + "step": 38625 + }, + { + "epoch": 5.7536490914507, + "grad_norm": 0.0245361328125, + "learning_rate": 0.026894719578885614, + "loss": 0.8004, + "num_input_tokens_seen": 22415280, + "step": 38630 + }, + { + "epoch": 5.754393803991659, + "grad_norm": 0.0286865234375, + "learning_rate": 0.026893531660262454, + "loss": 0.7874, + "num_input_tokens_seen": 22417936, + "step": 38635 + }, + { + "epoch": 5.755138516532618, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02689234354071016, + "loss": 0.7963, + "num_input_tokens_seen": 22420880, + "step": 38640 + }, + { + "epoch": 5.755883229073578, + "grad_norm": 0.024658203125, + "learning_rate": 0.02689115522024881, + "loss": 0.8055, + "num_input_tokens_seen": 22423632, + "step": 38645 + }, + { + "epoch": 5.7566279416145365, + "grad_norm": 0.0255126953125, + "learning_rate": 0.026889966698898475, + "loss": 0.8057, + "num_input_tokens_seen": 22426992, + "step": 38650 + }, + { + "epoch": 5.757372654155496, + "grad_norm": 0.02880859375, + "learning_rate": 0.02688877797667924, + "loss": 0.8163, + "num_input_tokens_seen": 22430288, + "step": 38655 + }, + { + "epoch": 5.758117366696455, + "grad_norm": 0.01165771484375, + "learning_rate": 0.026887589053611185, + "loss": 0.8094, + "num_input_tokens_seen": 22433392, + "step": 38660 + }, + { + "epoch": 5.7588620792374146, + "grad_norm": 0.029296875, + "learning_rate": 0.026886399929714394, + "loss": 0.7952, + "num_input_tokens_seen": 22436368, + "step": 38665 + }, + { + "epoch": 5.759606791778373, + "grad_norm": 0.014404296875, + "learning_rate": 0.02688521060500896, + "loss": 0.8201, + "num_input_tokens_seen": 22439376, + "step": 38670 + }, + { + "epoch": 5.760351504319333, + "grad_norm": 0.01171875, + "learning_rate": 0.02688402107951497, + "loss": 0.8171, + "num_input_tokens_seen": 22442544, + "step": 38675 + }, + { + "epoch": 5.761096216860292, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02688283135325252, + "loss": 0.7996, + "num_input_tokens_seen": 22445744, + "step": 38680 + }, + { + "epoch": 5.761840929401251, + "grad_norm": 0.0137939453125, + "learning_rate": 0.02688164142624171, + "loss": 0.8131, + "num_input_tokens_seen": 22448592, + "step": 38685 + }, + { + "epoch": 5.76258564194221, + "grad_norm": 0.01312255859375, + "learning_rate": 0.026880451298502653, + "loss": 0.8019, + "num_input_tokens_seen": 22451536, + "step": 38690 + }, + { + "epoch": 5.76333035448317, + "grad_norm": 0.025634765625, + "learning_rate": 0.026879260970055434, + "loss": 0.817, + "num_input_tokens_seen": 22454736, + "step": 38695 + }, + { + "epoch": 5.7640750670241285, + "grad_norm": 0.024658203125, + "learning_rate": 0.026878070440920183, + "loss": 0.7934, + "num_input_tokens_seen": 22457616, + "step": 38700 + }, + { + "epoch": 5.764819779565088, + "grad_norm": 0.0224609375, + "learning_rate": 0.026876879711117, + "loss": 0.7934, + "num_input_tokens_seen": 22460464, + "step": 38705 + }, + { + "epoch": 5.765564492106047, + "grad_norm": 0.0269775390625, + "learning_rate": 0.02687568878066601, + "loss": 0.7999, + "num_input_tokens_seen": 22463376, + "step": 38710 + }, + { + "epoch": 5.766309204647007, + "grad_norm": 0.020263671875, + "learning_rate": 0.026874497649587326, + "loss": 0.8065, + "num_input_tokens_seen": 22466192, + "step": 38715 + }, + { + "epoch": 5.767053917187965, + "grad_norm": 0.02392578125, + "learning_rate": 0.02687330631790107, + "loss": 0.8036, + "num_input_tokens_seen": 22469520, + "step": 38720 + }, + { + "epoch": 5.767798629728925, + "grad_norm": 0.0252685546875, + "learning_rate": 0.026872114785627375, + "loss": 0.8099, + "num_input_tokens_seen": 22472272, + "step": 38725 + }, + { + "epoch": 5.768543342269884, + "grad_norm": 0.01385498046875, + "learning_rate": 0.026870923052786367, + "loss": 0.7958, + "num_input_tokens_seen": 22475216, + "step": 38730 + }, + { + "epoch": 5.769288054810843, + "grad_norm": 0.032470703125, + "learning_rate": 0.026869731119398176, + "loss": 0.7872, + "num_input_tokens_seen": 22478000, + "step": 38735 + }, + { + "epoch": 5.770032767351802, + "grad_norm": 0.032470703125, + "learning_rate": 0.026868538985482943, + "loss": 0.7957, + "num_input_tokens_seen": 22480912, + "step": 38740 + }, + { + "epoch": 5.770777479892761, + "grad_norm": 0.030029296875, + "learning_rate": 0.026867346651060806, + "loss": 0.795, + "num_input_tokens_seen": 22483888, + "step": 38745 + }, + { + "epoch": 5.7715221924337206, + "grad_norm": 0.026611328125, + "learning_rate": 0.02686615411615191, + "loss": 0.7951, + "num_input_tokens_seen": 22486640, + "step": 38750 + }, + { + "epoch": 5.77226690497468, + "grad_norm": 0.0174560546875, + "learning_rate": 0.0268649613807764, + "loss": 0.7854, + "num_input_tokens_seen": 22489168, + "step": 38755 + }, + { + "epoch": 5.773011617515639, + "grad_norm": 0.0240478515625, + "learning_rate": 0.026863768444954427, + "loss": 0.7997, + "num_input_tokens_seen": 22491888, + "step": 38760 + }, + { + "epoch": 5.773756330056598, + "grad_norm": 0.0267333984375, + "learning_rate": 0.026862575308706147, + "loss": 0.8128, + "num_input_tokens_seen": 22494704, + "step": 38765 + }, + { + "epoch": 5.774501042597557, + "grad_norm": 0.014892578125, + "learning_rate": 0.026861381972051707, + "loss": 0.801, + "num_input_tokens_seen": 22497744, + "step": 38770 + }, + { + "epoch": 5.775245755138517, + "grad_norm": 0.032958984375, + "learning_rate": 0.026860188435011277, + "loss": 0.8217, + "num_input_tokens_seen": 22500816, + "step": 38775 + }, + { + "epoch": 5.775990467679476, + "grad_norm": 0.0281982421875, + "learning_rate": 0.026858994697605016, + "loss": 0.8188, + "num_input_tokens_seen": 22503568, + "step": 38780 + }, + { + "epoch": 5.7767351802204345, + "grad_norm": 0.01953125, + "learning_rate": 0.026857800759853095, + "loss": 0.8118, + "num_input_tokens_seen": 22506608, + "step": 38785 + }, + { + "epoch": 5.777479892761394, + "grad_norm": 0.021240234375, + "learning_rate": 0.02685660662177568, + "loss": 0.7745, + "num_input_tokens_seen": 22509360, + "step": 38790 + }, + { + "epoch": 5.778224605302353, + "grad_norm": 0.0289306640625, + "learning_rate": 0.02685541228339295, + "loss": 0.8117, + "num_input_tokens_seen": 22512368, + "step": 38795 + }, + { + "epoch": 5.778969317843313, + "grad_norm": 0.01953125, + "learning_rate": 0.026854217744725076, + "loss": 0.789, + "num_input_tokens_seen": 22515376, + "step": 38800 + }, + { + "epoch": 5.779714030384271, + "grad_norm": 0.0257568359375, + "learning_rate": 0.026853023005792242, + "loss": 0.806, + "num_input_tokens_seen": 22518608, + "step": 38805 + }, + { + "epoch": 5.780458742925231, + "grad_norm": 0.021728515625, + "learning_rate": 0.02685182806661463, + "loss": 0.806, + "num_input_tokens_seen": 22521552, + "step": 38810 + }, + { + "epoch": 5.78120345546619, + "grad_norm": 0.01806640625, + "learning_rate": 0.02685063292721243, + "loss": 0.7894, + "num_input_tokens_seen": 22524464, + "step": 38815 + }, + { + "epoch": 5.781948168007149, + "grad_norm": 0.02587890625, + "learning_rate": 0.02684943758760583, + "loss": 0.8003, + "num_input_tokens_seen": 22527248, + "step": 38820 + }, + { + "epoch": 5.782692880548108, + "grad_norm": 0.0247802734375, + "learning_rate": 0.026848242047815024, + "loss": 0.7978, + "num_input_tokens_seen": 22530192, + "step": 38825 + }, + { + "epoch": 5.783437593089068, + "grad_norm": 0.020751953125, + "learning_rate": 0.026847046307860212, + "loss": 0.7888, + "num_input_tokens_seen": 22533072, + "step": 38830 + }, + { + "epoch": 5.7841823056300266, + "grad_norm": 0.0186767578125, + "learning_rate": 0.026845850367761593, + "loss": 0.7891, + "num_input_tokens_seen": 22535984, + "step": 38835 + }, + { + "epoch": 5.784927018170986, + "grad_norm": 0.0201416015625, + "learning_rate": 0.026844654227539375, + "loss": 0.7891, + "num_input_tokens_seen": 22538768, + "step": 38840 + }, + { + "epoch": 5.785671730711945, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02684345788721376, + "loss": 0.7886, + "num_input_tokens_seen": 22541968, + "step": 38845 + }, + { + "epoch": 5.786416443252905, + "grad_norm": 0.0120849609375, + "learning_rate": 0.02684226134680496, + "loss": 0.7931, + "num_input_tokens_seen": 22544944, + "step": 38850 + }, + { + "epoch": 5.787161155793863, + "grad_norm": 0.020751953125, + "learning_rate": 0.02684106460633319, + "loss": 0.8022, + "num_input_tokens_seen": 22547984, + "step": 38855 + }, + { + "epoch": 5.787905868334823, + "grad_norm": 0.02197265625, + "learning_rate": 0.026839867665818667, + "loss": 0.8001, + "num_input_tokens_seen": 22550992, + "step": 38860 + }, + { + "epoch": 5.788650580875782, + "grad_norm": 0.02734375, + "learning_rate": 0.02683867052528162, + "loss": 0.821, + "num_input_tokens_seen": 22553936, + "step": 38865 + }, + { + "epoch": 5.789395293416741, + "grad_norm": 0.0260009765625, + "learning_rate": 0.026837473184742264, + "loss": 0.7958, + "num_input_tokens_seen": 22556624, + "step": 38870 + }, + { + "epoch": 5.7901400059577, + "grad_norm": 0.0283203125, + "learning_rate": 0.026836275644220822, + "loss": 0.8018, + "num_input_tokens_seen": 22559600, + "step": 38875 + }, + { + "epoch": 5.79088471849866, + "grad_norm": 0.0184326171875, + "learning_rate": 0.026835077903737542, + "loss": 0.7887, + "num_input_tokens_seen": 22562512, + "step": 38880 + }, + { + "epoch": 5.791629431039619, + "grad_norm": 0.021484375, + "learning_rate": 0.026833879963312646, + "loss": 0.8143, + "num_input_tokens_seen": 22565680, + "step": 38885 + }, + { + "epoch": 5.792374143580578, + "grad_norm": 0.0186767578125, + "learning_rate": 0.026832681822966376, + "loss": 0.7745, + "num_input_tokens_seen": 22568528, + "step": 38890 + }, + { + "epoch": 5.793118856121537, + "grad_norm": 0.021728515625, + "learning_rate": 0.026831483482718972, + "loss": 0.842, + "num_input_tokens_seen": 22571472, + "step": 38895 + }, + { + "epoch": 5.793863568662497, + "grad_norm": 0.0185546875, + "learning_rate": 0.026830284942590682, + "loss": 0.8089, + "num_input_tokens_seen": 22574320, + "step": 38900 + }, + { + "epoch": 5.794608281203455, + "grad_norm": 0.025146484375, + "learning_rate": 0.02682908620260175, + "loss": 0.7978, + "num_input_tokens_seen": 22577232, + "step": 38905 + }, + { + "epoch": 5.795352993744415, + "grad_norm": 0.0283203125, + "learning_rate": 0.026827887262772427, + "loss": 0.8186, + "num_input_tokens_seen": 22580560, + "step": 38910 + }, + { + "epoch": 5.796097706285374, + "grad_norm": 0.0125732421875, + "learning_rate": 0.02682668812312297, + "loss": 0.7926, + "num_input_tokens_seen": 22583440, + "step": 38915 + }, + { + "epoch": 5.796842418826333, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02682548878367364, + "loss": 0.8001, + "num_input_tokens_seen": 22586064, + "step": 38920 + }, + { + "epoch": 5.797587131367292, + "grad_norm": 0.01171875, + "learning_rate": 0.0268242892444447, + "loss": 0.7946, + "num_input_tokens_seen": 22588816, + "step": 38925 + }, + { + "epoch": 5.798331843908251, + "grad_norm": 0.0177001953125, + "learning_rate": 0.026823089505456404, + "loss": 0.7801, + "num_input_tokens_seen": 22591472, + "step": 38930 + }, + { + "epoch": 5.799076556449211, + "grad_norm": 0.01263427734375, + "learning_rate": 0.02682188956672903, + "loss": 0.8126, + "num_input_tokens_seen": 22594352, + "step": 38935 + }, + { + "epoch": 5.79982126899017, + "grad_norm": 0.024169921875, + "learning_rate": 0.026820689428282845, + "loss": 0.8087, + "num_input_tokens_seen": 22597136, + "step": 38940 + }, + { + "epoch": 5.800565981531129, + "grad_norm": 0.0206298828125, + "learning_rate": 0.026819489090138125, + "loss": 0.8115, + "num_input_tokens_seen": 22600144, + "step": 38945 + }, + { + "epoch": 5.801310694072088, + "grad_norm": 0.01953125, + "learning_rate": 0.02681828855231515, + "loss": 0.781, + "num_input_tokens_seen": 22603056, + "step": 38950 + }, + { + "epoch": 5.802055406613047, + "grad_norm": 0.03173828125, + "learning_rate": 0.026817087814834207, + "loss": 0.7926, + "num_input_tokens_seen": 22605936, + "step": 38955 + }, + { + "epoch": 5.802800119154006, + "grad_norm": 0.030029296875, + "learning_rate": 0.02681588687771557, + "loss": 0.8073, + "num_input_tokens_seen": 22608848, + "step": 38960 + }, + { + "epoch": 5.803544831694966, + "grad_norm": 0.020751953125, + "learning_rate": 0.026814685740979536, + "loss": 0.8072, + "num_input_tokens_seen": 22611888, + "step": 38965 + }, + { + "epoch": 5.804289544235925, + "grad_norm": 0.0137939453125, + "learning_rate": 0.026813484404646395, + "loss": 0.7966, + "num_input_tokens_seen": 22615056, + "step": 38970 + }, + { + "epoch": 5.805034256776884, + "grad_norm": 0.0283203125, + "learning_rate": 0.02681228286873644, + "loss": 0.8059, + "num_input_tokens_seen": 22618064, + "step": 38975 + }, + { + "epoch": 5.805778969317843, + "grad_norm": 0.041015625, + "learning_rate": 0.026811081133269972, + "loss": 0.7952, + "num_input_tokens_seen": 22621072, + "step": 38980 + }, + { + "epoch": 5.806523681858803, + "grad_norm": 0.0223388671875, + "learning_rate": 0.026809879198267293, + "loss": 0.8, + "num_input_tokens_seen": 22624080, + "step": 38985 + }, + { + "epoch": 5.807268394399761, + "grad_norm": 0.0267333984375, + "learning_rate": 0.026808677063748707, + "loss": 0.8594, + "num_input_tokens_seen": 22628304, + "step": 38990 + }, + { + "epoch": 5.808013106940721, + "grad_norm": 0.01507568359375, + "learning_rate": 0.026807474729734524, + "loss": 0.8072, + "num_input_tokens_seen": 22631024, + "step": 38995 + }, + { + "epoch": 5.80875781948168, + "grad_norm": 0.021728515625, + "learning_rate": 0.026806272196245057, + "loss": 0.8004, + "num_input_tokens_seen": 22634288, + "step": 39000 + }, + { + "epoch": 5.809502532022639, + "grad_norm": 0.01806640625, + "learning_rate": 0.02680506946330062, + "loss": 0.8033, + "num_input_tokens_seen": 22637264, + "step": 39005 + }, + { + "epoch": 5.810247244563598, + "grad_norm": 0.01507568359375, + "learning_rate": 0.02680386653092153, + "loss": 0.8062, + "num_input_tokens_seen": 22640048, + "step": 39010 + }, + { + "epoch": 5.810991957104558, + "grad_norm": 0.0230712890625, + "learning_rate": 0.026802663399128117, + "loss": 0.8258, + "num_input_tokens_seen": 22642800, + "step": 39015 + }, + { + "epoch": 5.811736669645517, + "grad_norm": 0.036865234375, + "learning_rate": 0.026801460067940697, + "loss": 0.7951, + "num_input_tokens_seen": 22645648, + "step": 39020 + }, + { + "epoch": 5.812481382186476, + "grad_norm": 0.0218505859375, + "learning_rate": 0.026800256537379604, + "loss": 0.81, + "num_input_tokens_seen": 22648752, + "step": 39025 + }, + { + "epoch": 5.813226094727435, + "grad_norm": 0.0126953125, + "learning_rate": 0.02679905280746517, + "loss": 0.8051, + "num_input_tokens_seen": 22651664, + "step": 39030 + }, + { + "epoch": 5.813970807268395, + "grad_norm": 0.020751953125, + "learning_rate": 0.026797848878217737, + "loss": 0.8078, + "num_input_tokens_seen": 22654672, + "step": 39035 + }, + { + "epoch": 5.814715519809353, + "grad_norm": 0.018798828125, + "learning_rate": 0.026796644749657633, + "loss": 0.7979, + "num_input_tokens_seen": 22657648, + "step": 39040 + }, + { + "epoch": 5.815460232350313, + "grad_norm": 0.0130615234375, + "learning_rate": 0.026795440421805206, + "loss": 0.8, + "num_input_tokens_seen": 22660592, + "step": 39045 + }, + { + "epoch": 5.816204944891272, + "grad_norm": 0.0252685546875, + "learning_rate": 0.0267942358946808, + "loss": 0.8125, + "num_input_tokens_seen": 22663280, + "step": 39050 + }, + { + "epoch": 5.8169496574322315, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02679303116830477, + "loss": 0.7983, + "num_input_tokens_seen": 22666416, + "step": 39055 + }, + { + "epoch": 5.81769436997319, + "grad_norm": 0.018798828125, + "learning_rate": 0.02679182624269746, + "loss": 0.8067, + "num_input_tokens_seen": 22669296, + "step": 39060 + }, + { + "epoch": 5.81843908251415, + "grad_norm": 0.02099609375, + "learning_rate": 0.026790621117879228, + "loss": 0.7933, + "num_input_tokens_seen": 22672208, + "step": 39065 + }, + { + "epoch": 5.819183795055109, + "grad_norm": 0.01953125, + "learning_rate": 0.026789415793870442, + "loss": 0.7911, + "num_input_tokens_seen": 22675120, + "step": 39070 + }, + { + "epoch": 5.819928507596068, + "grad_norm": 0.0289306640625, + "learning_rate": 0.026788210270691455, + "loss": 0.8206, + "num_input_tokens_seen": 22678064, + "step": 39075 + }, + { + "epoch": 5.820673220137027, + "grad_norm": 0.0269775390625, + "learning_rate": 0.02678700454836264, + "loss": 0.8041, + "num_input_tokens_seen": 22680656, + "step": 39080 + }, + { + "epoch": 5.821417932677987, + "grad_norm": 0.0224609375, + "learning_rate": 0.026785798626904358, + "loss": 0.791, + "num_input_tokens_seen": 22683632, + "step": 39085 + }, + { + "epoch": 5.822162645218945, + "grad_norm": 0.0203857421875, + "learning_rate": 0.026784592506336986, + "loss": 0.7954, + "num_input_tokens_seen": 22686576, + "step": 39090 + }, + { + "epoch": 5.822907357759904, + "grad_norm": 0.019775390625, + "learning_rate": 0.026783386186680906, + "loss": 0.8052, + "num_input_tokens_seen": 22689168, + "step": 39095 + }, + { + "epoch": 5.823652070300864, + "grad_norm": 0.0205078125, + "learning_rate": 0.02678217966795649, + "loss": 0.7875, + "num_input_tokens_seen": 22691856, + "step": 39100 + }, + { + "epoch": 5.8243967828418235, + "grad_norm": 0.013427734375, + "learning_rate": 0.026780972950184122, + "loss": 0.8047, + "num_input_tokens_seen": 22694800, + "step": 39105 + }, + { + "epoch": 5.825141495382782, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02677976603338419, + "loss": 0.8012, + "num_input_tokens_seen": 22697808, + "step": 39110 + }, + { + "epoch": 5.825886207923741, + "grad_norm": 0.025146484375, + "learning_rate": 0.026778558917577088, + "loss": 0.8289, + "num_input_tokens_seen": 22700848, + "step": 39115 + }, + { + "epoch": 5.826630920464701, + "grad_norm": 0.0206298828125, + "learning_rate": 0.0267773516027832, + "loss": 0.8022, + "num_input_tokens_seen": 22703760, + "step": 39120 + }, + { + "epoch": 5.82737563300566, + "grad_norm": 0.0260009765625, + "learning_rate": 0.026776144089022926, + "loss": 0.7875, + "num_input_tokens_seen": 22706544, + "step": 39125 + }, + { + "epoch": 5.828120345546619, + "grad_norm": 0.0205078125, + "learning_rate": 0.026774936376316668, + "loss": 0.7921, + "num_input_tokens_seen": 22709712, + "step": 39130 + }, + { + "epoch": 5.828865058087578, + "grad_norm": 0.02099609375, + "learning_rate": 0.02677372846468483, + "loss": 0.7916, + "num_input_tokens_seen": 22712464, + "step": 39135 + }, + { + "epoch": 5.8296097706285375, + "grad_norm": 0.018798828125, + "learning_rate": 0.026772520354147813, + "loss": 0.7955, + "num_input_tokens_seen": 22715472, + "step": 39140 + }, + { + "epoch": 5.830354483169496, + "grad_norm": 0.019775390625, + "learning_rate": 0.026771312044726027, + "loss": 0.7819, + "num_input_tokens_seen": 22718448, + "step": 39145 + }, + { + "epoch": 5.831099195710456, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02677010353643989, + "loss": 0.8089, + "num_input_tokens_seen": 22721104, + "step": 39150 + }, + { + "epoch": 5.831843908251415, + "grad_norm": 0.01397705078125, + "learning_rate": 0.02676889482930982, + "loss": 0.8073, + "num_input_tokens_seen": 22724304, + "step": 39155 + }, + { + "epoch": 5.832588620792374, + "grad_norm": 0.020263671875, + "learning_rate": 0.026767685923356228, + "loss": 0.8145, + "num_input_tokens_seen": 22727088, + "step": 39160 + }, + { + "epoch": 5.833333333333333, + "grad_norm": 0.025390625, + "learning_rate": 0.026766476818599547, + "loss": 0.7866, + "num_input_tokens_seen": 22730000, + "step": 39165 + }, + { + "epoch": 5.834078045874293, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02676526751506019, + "loss": 0.7909, + "num_input_tokens_seen": 22732912, + "step": 39170 + }, + { + "epoch": 5.834822758415251, + "grad_norm": 0.0242919921875, + "learning_rate": 0.026764058012758607, + "loss": 0.8319, + "num_input_tokens_seen": 22735760, + "step": 39175 + }, + { + "epoch": 5.835567470956211, + "grad_norm": 0.0196533203125, + "learning_rate": 0.026762848311715215, + "loss": 0.7853, + "num_input_tokens_seen": 22738960, + "step": 39180 + }, + { + "epoch": 5.83631218349717, + "grad_norm": 0.023681640625, + "learning_rate": 0.02676163841195046, + "loss": 0.801, + "num_input_tokens_seen": 22741616, + "step": 39185 + }, + { + "epoch": 5.8370568960381295, + "grad_norm": 0.01507568359375, + "learning_rate": 0.026760428313484776, + "loss": 0.8165, + "num_input_tokens_seen": 22744560, + "step": 39190 + }, + { + "epoch": 5.837801608579088, + "grad_norm": 0.021240234375, + "learning_rate": 0.026759218016338605, + "loss": 0.8048, + "num_input_tokens_seen": 22747536, + "step": 39195 + }, + { + "epoch": 5.838546321120048, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0267580075205324, + "loss": 0.7904, + "num_input_tokens_seen": 22750480, + "step": 39200 + }, + { + "epoch": 5.839291033661007, + "grad_norm": 0.0257568359375, + "learning_rate": 0.026756796826086608, + "loss": 0.8107, + "num_input_tokens_seen": 22753264, + "step": 39205 + }, + { + "epoch": 5.840035746201966, + "grad_norm": 0.0216064453125, + "learning_rate": 0.026755585933021683, + "loss": 0.7924, + "num_input_tokens_seen": 22756144, + "step": 39210 + }, + { + "epoch": 5.840780458742925, + "grad_norm": 0.0113525390625, + "learning_rate": 0.02675437484135808, + "loss": 0.8096, + "num_input_tokens_seen": 22758896, + "step": 39215 + }, + { + "epoch": 5.841525171283885, + "grad_norm": 0.0228271484375, + "learning_rate": 0.02675316355111626, + "loss": 0.8034, + "num_input_tokens_seen": 22761680, + "step": 39220 + }, + { + "epoch": 5.8422698838248435, + "grad_norm": 0.0244140625, + "learning_rate": 0.026751952062316692, + "loss": 0.8014, + "num_input_tokens_seen": 22764496, + "step": 39225 + }, + { + "epoch": 5.843014596365803, + "grad_norm": 0.021728515625, + "learning_rate": 0.026750740374979834, + "loss": 0.7825, + "num_input_tokens_seen": 22767184, + "step": 39230 + }, + { + "epoch": 5.843759308906762, + "grad_norm": 0.0240478515625, + "learning_rate": 0.026749528489126158, + "loss": 0.8025, + "num_input_tokens_seen": 22769968, + "step": 39235 + }, + { + "epoch": 5.8445040214477215, + "grad_norm": 0.03271484375, + "learning_rate": 0.026748316404776148, + "loss": 0.8145, + "num_input_tokens_seen": 22772592, + "step": 39240 + }, + { + "epoch": 5.84524873398868, + "grad_norm": 0.0218505859375, + "learning_rate": 0.026747104121950267, + "loss": 0.8055, + "num_input_tokens_seen": 22775504, + "step": 39245 + }, + { + "epoch": 5.84599344652964, + "grad_norm": 0.039794921875, + "learning_rate": 0.026745891640669, + "loss": 0.809, + "num_input_tokens_seen": 22778544, + "step": 39250 + }, + { + "epoch": 5.846738159070599, + "grad_norm": 0.036376953125, + "learning_rate": 0.026744678960952836, + "loss": 0.8195, + "num_input_tokens_seen": 22781456, + "step": 39255 + }, + { + "epoch": 5.847482871611557, + "grad_norm": 0.0235595703125, + "learning_rate": 0.026743466082822256, + "loss": 0.7969, + "num_input_tokens_seen": 22784304, + "step": 39260 + }, + { + "epoch": 5.848227584152517, + "grad_norm": 0.02783203125, + "learning_rate": 0.02674225300629775, + "loss": 0.7929, + "num_input_tokens_seen": 22787056, + "step": 39265 + }, + { + "epoch": 5.848972296693477, + "grad_norm": 0.0205078125, + "learning_rate": 0.026741039731399813, + "loss": 0.7795, + "num_input_tokens_seen": 22789744, + "step": 39270 + }, + { + "epoch": 5.8497170092344355, + "grad_norm": 0.0196533203125, + "learning_rate": 0.026739826258148944, + "loss": 0.7917, + "num_input_tokens_seen": 22792496, + "step": 39275 + }, + { + "epoch": 5.850461721775394, + "grad_norm": 0.0240478515625, + "learning_rate": 0.026738612586565645, + "loss": 0.8157, + "num_input_tokens_seen": 22795312, + "step": 39280 + }, + { + "epoch": 5.851206434316354, + "grad_norm": 0.03857421875, + "learning_rate": 0.02673739871667042, + "loss": 0.8263, + "num_input_tokens_seen": 22798544, + "step": 39285 + }, + { + "epoch": 5.8519511468573135, + "grad_norm": 0.0166015625, + "learning_rate": 0.026736184648483766, + "loss": 0.791, + "num_input_tokens_seen": 22801104, + "step": 39290 + }, + { + "epoch": 5.852695859398272, + "grad_norm": 0.0218505859375, + "learning_rate": 0.026734970382026203, + "loss": 0.7933, + "num_input_tokens_seen": 22804144, + "step": 39295 + }, + { + "epoch": 5.853440571939231, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02673375591731824, + "loss": 0.8163, + "num_input_tokens_seen": 22807152, + "step": 39300 + }, + { + "epoch": 5.854185284480191, + "grad_norm": 0.01434326171875, + "learning_rate": 0.0267325412543804, + "loss": 0.8057, + "num_input_tokens_seen": 22810032, + "step": 39305 + }, + { + "epoch": 5.8549299970211495, + "grad_norm": 0.039306640625, + "learning_rate": 0.026731326393233196, + "loss": 0.787, + "num_input_tokens_seen": 22812752, + "step": 39310 + }, + { + "epoch": 5.855674709562109, + "grad_norm": 0.0162353515625, + "learning_rate": 0.02673011133389716, + "loss": 0.7821, + "num_input_tokens_seen": 22815408, + "step": 39315 + }, + { + "epoch": 5.856419422103068, + "grad_norm": 0.020751953125, + "learning_rate": 0.026728896076392814, + "loss": 0.8081, + "num_input_tokens_seen": 22818704, + "step": 39320 + }, + { + "epoch": 5.8571641346440275, + "grad_norm": 0.021484375, + "learning_rate": 0.026727680620740688, + "loss": 0.7817, + "num_input_tokens_seen": 22821680, + "step": 39325 + }, + { + "epoch": 5.857908847184986, + "grad_norm": 0.02294921875, + "learning_rate": 0.02672646496696132, + "loss": 0.8138, + "num_input_tokens_seen": 22824560, + "step": 39330 + }, + { + "epoch": 5.858653559725946, + "grad_norm": 0.0263671875, + "learning_rate": 0.02672524911507524, + "loss": 0.81, + "num_input_tokens_seen": 22827344, + "step": 39335 + }, + { + "epoch": 5.859398272266905, + "grad_norm": 0.024169921875, + "learning_rate": 0.026724033065102997, + "loss": 0.8271, + "num_input_tokens_seen": 22830032, + "step": 39340 + }, + { + "epoch": 5.860142984807864, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02672281681706513, + "loss": 0.8066, + "num_input_tokens_seen": 22833264, + "step": 39345 + }, + { + "epoch": 5.860887697348823, + "grad_norm": 0.029052734375, + "learning_rate": 0.026721600370982188, + "loss": 0.8297, + "num_input_tokens_seen": 22836592, + "step": 39350 + }, + { + "epoch": 5.861632409889783, + "grad_norm": 0.021728515625, + "learning_rate": 0.026720383726874717, + "loss": 0.8155, + "num_input_tokens_seen": 22839664, + "step": 39355 + }, + { + "epoch": 5.8623771224307415, + "grad_norm": 0.026123046875, + "learning_rate": 0.02671916688476328, + "loss": 0.8062, + "num_input_tokens_seen": 22842480, + "step": 39360 + }, + { + "epoch": 5.863121834971701, + "grad_norm": 0.0234375, + "learning_rate": 0.026717949844668427, + "loss": 0.7938, + "num_input_tokens_seen": 22845520, + "step": 39365 + }, + { + "epoch": 5.86386654751266, + "grad_norm": 0.023681640625, + "learning_rate": 0.026716732606610723, + "loss": 0.8083, + "num_input_tokens_seen": 22848592, + "step": 39370 + }, + { + "epoch": 5.8646112600536195, + "grad_norm": 0.014892578125, + "learning_rate": 0.026715515170610726, + "loss": 0.8085, + "num_input_tokens_seen": 22851088, + "step": 39375 + }, + { + "epoch": 5.865355972594578, + "grad_norm": 0.03076171875, + "learning_rate": 0.026714297536689014, + "loss": 0.8047, + "num_input_tokens_seen": 22853840, + "step": 39380 + }, + { + "epoch": 5.866100685135538, + "grad_norm": 0.0147705078125, + "learning_rate": 0.026713079704866145, + "loss": 0.7945, + "num_input_tokens_seen": 22856816, + "step": 39385 + }, + { + "epoch": 5.866845397676497, + "grad_norm": 0.01422119140625, + "learning_rate": 0.0267118616751627, + "loss": 0.7945, + "num_input_tokens_seen": 22859632, + "step": 39390 + }, + { + "epoch": 5.867590110217456, + "grad_norm": 0.03564453125, + "learning_rate": 0.02671064344759926, + "loss": 0.8012, + "num_input_tokens_seen": 22862320, + "step": 39395 + }, + { + "epoch": 5.868334822758415, + "grad_norm": 0.019775390625, + "learning_rate": 0.026709425022196396, + "loss": 0.8058, + "num_input_tokens_seen": 22865008, + "step": 39400 + }, + { + "epoch": 5.869079535299375, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0267082063989747, + "loss": 0.7924, + "num_input_tokens_seen": 22867856, + "step": 39405 + }, + { + "epoch": 5.8698242478403335, + "grad_norm": 0.054931640625, + "learning_rate": 0.02670698757795475, + "loss": 0.8023, + "num_input_tokens_seen": 22871888, + "step": 39410 + }, + { + "epoch": 5.870568960381293, + "grad_norm": 0.0211181640625, + "learning_rate": 0.026705768559157152, + "loss": 0.8003, + "num_input_tokens_seen": 22874608, + "step": 39415 + }, + { + "epoch": 5.871313672922252, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02670454934260249, + "loss": 0.8046, + "num_input_tokens_seen": 22877520, + "step": 39420 + }, + { + "epoch": 5.872058385463212, + "grad_norm": 0.019775390625, + "learning_rate": 0.02670332992831136, + "loss": 0.7973, + "num_input_tokens_seen": 22880496, + "step": 39425 + }, + { + "epoch": 5.87280309800417, + "grad_norm": 0.024658203125, + "learning_rate": 0.026702110316304366, + "loss": 0.8005, + "num_input_tokens_seen": 22883056, + "step": 39430 + }, + { + "epoch": 5.87354781054513, + "grad_norm": 0.01434326171875, + "learning_rate": 0.026700890506602112, + "loss": 0.8117, + "num_input_tokens_seen": 22886160, + "step": 39435 + }, + { + "epoch": 5.874292523086089, + "grad_norm": 0.0291748046875, + "learning_rate": 0.026699670499225205, + "loss": 0.7922, + "num_input_tokens_seen": 22889264, + "step": 39440 + }, + { + "epoch": 5.8750372356270475, + "grad_norm": 0.032958984375, + "learning_rate": 0.026698450294194254, + "loss": 0.8034, + "num_input_tokens_seen": 22892400, + "step": 39445 + }, + { + "epoch": 5.875781948168007, + "grad_norm": 0.0228271484375, + "learning_rate": 0.02669722989152988, + "loss": 0.7932, + "num_input_tokens_seen": 22895280, + "step": 39450 + }, + { + "epoch": 5.876526660708967, + "grad_norm": 0.0303955078125, + "learning_rate": 0.026696009291252692, + "loss": 0.8116, + "num_input_tokens_seen": 22898384, + "step": 39455 + }, + { + "epoch": 5.8772713732499255, + "grad_norm": 0.0255126953125, + "learning_rate": 0.026694788493383317, + "loss": 0.8018, + "num_input_tokens_seen": 22901232, + "step": 39460 + }, + { + "epoch": 5.878016085790884, + "grad_norm": 0.0230712890625, + "learning_rate": 0.026693567497942373, + "loss": 0.8129, + "num_input_tokens_seen": 22904304, + "step": 39465 + }, + { + "epoch": 5.878760798331844, + "grad_norm": 0.0206298828125, + "learning_rate": 0.026692346304950494, + "loss": 0.7973, + "num_input_tokens_seen": 22907184, + "step": 39470 + }, + { + "epoch": 5.879505510872804, + "grad_norm": 0.021728515625, + "learning_rate": 0.026691124914428304, + "loss": 0.8169, + "num_input_tokens_seen": 22909808, + "step": 39475 + }, + { + "epoch": 5.880250223413762, + "grad_norm": 0.022216796875, + "learning_rate": 0.026689903326396442, + "loss": 0.7885, + "num_input_tokens_seen": 22912464, + "step": 39480 + }, + { + "epoch": 5.880994935954721, + "grad_norm": 0.041748046875, + "learning_rate": 0.026688681540875548, + "loss": 0.8192, + "num_input_tokens_seen": 22915280, + "step": 39485 + }, + { + "epoch": 5.881739648495681, + "grad_norm": 0.0225830078125, + "learning_rate": 0.026687459557886255, + "loss": 0.8026, + "num_input_tokens_seen": 22918160, + "step": 39490 + }, + { + "epoch": 5.8824843610366395, + "grad_norm": 0.0213623046875, + "learning_rate": 0.026686237377449216, + "loss": 0.797, + "num_input_tokens_seen": 22921008, + "step": 39495 + }, + { + "epoch": 5.883229073577599, + "grad_norm": 0.0286865234375, + "learning_rate": 0.026685014999585067, + "loss": 0.7899, + "num_input_tokens_seen": 22923504, + "step": 39500 + }, + { + "epoch": 5.883973786118558, + "grad_norm": 0.0233154296875, + "learning_rate": 0.02668379242431447, + "loss": 0.8175, + "num_input_tokens_seen": 22926416, + "step": 39505 + }, + { + "epoch": 5.884718498659518, + "grad_norm": 0.0179443359375, + "learning_rate": 0.026682569651658074, + "loss": 0.7919, + "num_input_tokens_seen": 22929168, + "step": 39510 + }, + { + "epoch": 5.885463211200476, + "grad_norm": 0.0306396484375, + "learning_rate": 0.02668134668163654, + "loss": 0.7864, + "num_input_tokens_seen": 22931952, + "step": 39515 + }, + { + "epoch": 5.886207923741436, + "grad_norm": 0.0361328125, + "learning_rate": 0.026680123514270522, + "loss": 0.7986, + "num_input_tokens_seen": 22935088, + "step": 39520 + }, + { + "epoch": 5.886952636282395, + "grad_norm": 0.01458740234375, + "learning_rate": 0.026678900149580694, + "loss": 0.8155, + "num_input_tokens_seen": 22937872, + "step": 39525 + }, + { + "epoch": 5.887697348823354, + "grad_norm": 0.01611328125, + "learning_rate": 0.026677676587587715, + "loss": 0.7932, + "num_input_tokens_seen": 22940784, + "step": 39530 + }, + { + "epoch": 5.888442061364313, + "grad_norm": 0.0218505859375, + "learning_rate": 0.026676452828312255, + "loss": 0.7656, + "num_input_tokens_seen": 22943664, + "step": 39535 + }, + { + "epoch": 5.889186773905273, + "grad_norm": 0.0211181640625, + "learning_rate": 0.026675228871774998, + "loss": 0.7899, + "num_input_tokens_seen": 22946320, + "step": 39540 + }, + { + "epoch": 5.8899314864462315, + "grad_norm": 0.02490234375, + "learning_rate": 0.02667400471799661, + "loss": 0.8167, + "num_input_tokens_seen": 22949648, + "step": 39545 + }, + { + "epoch": 5.890676198987191, + "grad_norm": 0.02001953125, + "learning_rate": 0.02667278036699778, + "loss": 0.8113, + "num_input_tokens_seen": 22952560, + "step": 39550 + }, + { + "epoch": 5.89142091152815, + "grad_norm": 0.0230712890625, + "learning_rate": 0.026671555818799188, + "loss": 0.8057, + "num_input_tokens_seen": 22955408, + "step": 39555 + }, + { + "epoch": 5.89216562406911, + "grad_norm": 0.0242919921875, + "learning_rate": 0.026670331073421524, + "loss": 0.8105, + "num_input_tokens_seen": 22958576, + "step": 39560 + }, + { + "epoch": 5.892910336610068, + "grad_norm": 0.0244140625, + "learning_rate": 0.02666910613088548, + "loss": 0.8101, + "num_input_tokens_seen": 22961616, + "step": 39565 + }, + { + "epoch": 5.893655049151028, + "grad_norm": 0.036376953125, + "learning_rate": 0.026667880991211743, + "loss": 0.8301, + "num_input_tokens_seen": 22964528, + "step": 39570 + }, + { + "epoch": 5.894399761691987, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02666665565442102, + "loss": 0.8108, + "num_input_tokens_seen": 22967344, + "step": 39575 + }, + { + "epoch": 5.895144474232946, + "grad_norm": 0.018310546875, + "learning_rate": 0.026665430120534003, + "loss": 0.8186, + "num_input_tokens_seen": 22970416, + "step": 39580 + }, + { + "epoch": 5.895889186773905, + "grad_norm": 0.01483154296875, + "learning_rate": 0.0266642043895714, + "loss": 0.7934, + "num_input_tokens_seen": 22973296, + "step": 39585 + }, + { + "epoch": 5.896633899314865, + "grad_norm": 0.0400390625, + "learning_rate": 0.026662978461553924, + "loss": 0.7904, + "num_input_tokens_seen": 22976272, + "step": 39590 + }, + { + "epoch": 5.897378611855824, + "grad_norm": 0.023681640625, + "learning_rate": 0.026661752336502278, + "loss": 0.8076, + "num_input_tokens_seen": 22978992, + "step": 39595 + }, + { + "epoch": 5.898123324396783, + "grad_norm": 0.0196533203125, + "learning_rate": 0.026660526014437178, + "loss": 0.805, + "num_input_tokens_seen": 22981840, + "step": 39600 + }, + { + "epoch": 5.898868036937742, + "grad_norm": 0.01904296875, + "learning_rate": 0.026659299495379343, + "loss": 0.7869, + "num_input_tokens_seen": 22984752, + "step": 39605 + }, + { + "epoch": 5.899612749478701, + "grad_norm": 0.028076171875, + "learning_rate": 0.026658072779349496, + "loss": 0.8127, + "num_input_tokens_seen": 22988048, + "step": 39610 + }, + { + "epoch": 5.90035746201966, + "grad_norm": 0.0184326171875, + "learning_rate": 0.02665684586636835, + "loss": 0.8102, + "num_input_tokens_seen": 22990864, + "step": 39615 + }, + { + "epoch": 5.90110217456062, + "grad_norm": 0.0244140625, + "learning_rate": 0.026655618756456646, + "loss": 0.7819, + "num_input_tokens_seen": 22993584, + "step": 39620 + }, + { + "epoch": 5.901846887101579, + "grad_norm": 0.01348876953125, + "learning_rate": 0.026654391449635106, + "loss": 0.7904, + "num_input_tokens_seen": 22996592, + "step": 39625 + }, + { + "epoch": 5.9025915996425375, + "grad_norm": 0.019287109375, + "learning_rate": 0.02665316394592447, + "loss": 0.801, + "num_input_tokens_seen": 22999888, + "step": 39630 + }, + { + "epoch": 5.903336312183497, + "grad_norm": 0.021728515625, + "learning_rate": 0.026651936245345474, + "loss": 0.8087, + "num_input_tokens_seen": 23002736, + "step": 39635 + }, + { + "epoch": 5.904081024724457, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02665070834791886, + "loss": 0.8195, + "num_input_tokens_seen": 23005392, + "step": 39640 + }, + { + "epoch": 5.904825737265416, + "grad_norm": 0.02099609375, + "learning_rate": 0.026649480253665363, + "loss": 0.8016, + "num_input_tokens_seen": 23008304, + "step": 39645 + }, + { + "epoch": 5.905570449806374, + "grad_norm": 0.0206298828125, + "learning_rate": 0.026648251962605743, + "loss": 0.7654, + "num_input_tokens_seen": 23011056, + "step": 39650 + }, + { + "epoch": 5.906315162347334, + "grad_norm": 0.0167236328125, + "learning_rate": 0.02664702347476074, + "loss": 0.7909, + "num_input_tokens_seen": 23013904, + "step": 39655 + }, + { + "epoch": 5.907059874888293, + "grad_norm": 0.0234375, + "learning_rate": 0.02664579479015112, + "loss": 0.8039, + "num_input_tokens_seen": 23016944, + "step": 39660 + }, + { + "epoch": 5.907804587429252, + "grad_norm": 0.0194091796875, + "learning_rate": 0.026644565908797626, + "loss": 0.7778, + "num_input_tokens_seen": 23019824, + "step": 39665 + }, + { + "epoch": 5.908549299970211, + "grad_norm": 0.020263671875, + "learning_rate": 0.026643336830721027, + "loss": 0.7866, + "num_input_tokens_seen": 23022704, + "step": 39670 + }, + { + "epoch": 5.909294012511171, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02664210755594209, + "loss": 0.7674, + "num_input_tokens_seen": 23025424, + "step": 39675 + }, + { + "epoch": 5.91003872505213, + "grad_norm": 0.048828125, + "learning_rate": 0.026640878084481574, + "loss": 0.7885, + "num_input_tokens_seen": 23028176, + "step": 39680 + }, + { + "epoch": 5.910783437593089, + "grad_norm": 0.01556396484375, + "learning_rate": 0.026639648416360254, + "loss": 0.8318, + "num_input_tokens_seen": 23030992, + "step": 39685 + }, + { + "epoch": 5.911528150134048, + "grad_norm": 0.0185546875, + "learning_rate": 0.02663841855159891, + "loss": 0.8073, + "num_input_tokens_seen": 23033552, + "step": 39690 + }, + { + "epoch": 5.912272862675008, + "grad_norm": 0.02294921875, + "learning_rate": 0.02663718849021831, + "loss": 0.8156, + "num_input_tokens_seen": 23036336, + "step": 39695 + }, + { + "epoch": 5.913017575215966, + "grad_norm": 0.0262451171875, + "learning_rate": 0.026635958232239233, + "loss": 0.792, + "num_input_tokens_seen": 23039152, + "step": 39700 + }, + { + "epoch": 5.913762287756926, + "grad_norm": 0.0218505859375, + "learning_rate": 0.026634727777682474, + "loss": 0.8269, + "num_input_tokens_seen": 23042128, + "step": 39705 + }, + { + "epoch": 5.914507000297885, + "grad_norm": 0.01165771484375, + "learning_rate": 0.026633497126568813, + "loss": 0.8115, + "num_input_tokens_seen": 23044976, + "step": 39710 + }, + { + "epoch": 5.915251712838844, + "grad_norm": 0.03076171875, + "learning_rate": 0.02663226627891904, + "loss": 0.788, + "num_input_tokens_seen": 23047888, + "step": 39715 + }, + { + "epoch": 5.915996425379803, + "grad_norm": 0.0244140625, + "learning_rate": 0.026631035234753953, + "loss": 0.8026, + "num_input_tokens_seen": 23050672, + "step": 39720 + }, + { + "epoch": 5.916741137920763, + "grad_norm": 0.0279541015625, + "learning_rate": 0.026629803994094346, + "loss": 0.8093, + "num_input_tokens_seen": 23053328, + "step": 39725 + }, + { + "epoch": 5.917485850461722, + "grad_norm": 0.0281982421875, + "learning_rate": 0.02662857255696102, + "loss": 0.8147, + "num_input_tokens_seen": 23056656, + "step": 39730 + }, + { + "epoch": 5.918230563002681, + "grad_norm": 0.036865234375, + "learning_rate": 0.02662734092337478, + "loss": 0.7891, + "num_input_tokens_seen": 23059440, + "step": 39735 + }, + { + "epoch": 5.91897527554364, + "grad_norm": 0.0235595703125, + "learning_rate": 0.026626109093356432, + "loss": 0.805, + "num_input_tokens_seen": 23062416, + "step": 39740 + }, + { + "epoch": 5.9197199880846, + "grad_norm": 0.03125, + "learning_rate": 0.02662487706692679, + "loss": 0.8246, + "num_input_tokens_seen": 23065040, + "step": 39745 + }, + { + "epoch": 5.920464700625558, + "grad_norm": 0.03564453125, + "learning_rate": 0.02662364484410666, + "loss": 0.8181, + "num_input_tokens_seen": 23068112, + "step": 39750 + }, + { + "epoch": 5.921209413166518, + "grad_norm": 0.021728515625, + "learning_rate": 0.026622412424916867, + "loss": 0.7969, + "num_input_tokens_seen": 23071056, + "step": 39755 + }, + { + "epoch": 5.921954125707477, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02662117980937823, + "loss": 0.7892, + "num_input_tokens_seen": 23073904, + "step": 39760 + }, + { + "epoch": 5.9226988382484365, + "grad_norm": 0.01373291015625, + "learning_rate": 0.02661994699751157, + "loss": 0.8092, + "num_input_tokens_seen": 23076880, + "step": 39765 + }, + { + "epoch": 5.923443550789395, + "grad_norm": 0.026611328125, + "learning_rate": 0.02661871398933771, + "loss": 0.8188, + "num_input_tokens_seen": 23079600, + "step": 39770 + }, + { + "epoch": 5.924188263330355, + "grad_norm": 0.02978515625, + "learning_rate": 0.026617480784877492, + "loss": 0.8185, + "num_input_tokens_seen": 23082640, + "step": 39775 + }, + { + "epoch": 5.924932975871314, + "grad_norm": 0.0361328125, + "learning_rate": 0.02661624738415174, + "loss": 0.813, + "num_input_tokens_seen": 23085456, + "step": 39780 + }, + { + "epoch": 5.925677688412273, + "grad_norm": 0.023681640625, + "learning_rate": 0.026615013787181303, + "loss": 0.8134, + "num_input_tokens_seen": 23087984, + "step": 39785 + }, + { + "epoch": 5.926422400953232, + "grad_norm": 0.0201416015625, + "learning_rate": 0.026613779993987004, + "loss": 0.802, + "num_input_tokens_seen": 23090640, + "step": 39790 + }, + { + "epoch": 5.927167113494191, + "grad_norm": 0.01708984375, + "learning_rate": 0.0266125460045897, + "loss": 0.8163, + "num_input_tokens_seen": 23093808, + "step": 39795 + }, + { + "epoch": 5.92791182603515, + "grad_norm": 0.02294921875, + "learning_rate": 0.026611311819010233, + "loss": 0.8058, + "num_input_tokens_seen": 23096688, + "step": 39800 + }, + { + "epoch": 5.92865653857611, + "grad_norm": 0.0213623046875, + "learning_rate": 0.026610077437269453, + "loss": 0.7926, + "num_input_tokens_seen": 23099376, + "step": 39805 + }, + { + "epoch": 5.929401251117069, + "grad_norm": 0.01080322265625, + "learning_rate": 0.026608842859388217, + "loss": 0.8104, + "num_input_tokens_seen": 23102256, + "step": 39810 + }, + { + "epoch": 5.930145963658028, + "grad_norm": 0.0252685546875, + "learning_rate": 0.026607608085387376, + "loss": 0.826, + "num_input_tokens_seen": 23105008, + "step": 39815 + }, + { + "epoch": 5.930890676198987, + "grad_norm": 0.015869140625, + "learning_rate": 0.026606373115287796, + "loss": 0.811, + "num_input_tokens_seen": 23107760, + "step": 39820 + }, + { + "epoch": 5.931635388739946, + "grad_norm": 0.019775390625, + "learning_rate": 0.026605137949110336, + "loss": 0.8008, + "num_input_tokens_seen": 23110544, + "step": 39825 + }, + { + "epoch": 5.932380101280906, + "grad_norm": 0.03125, + "learning_rate": 0.02660390258687587, + "loss": 0.8085, + "num_input_tokens_seen": 23113200, + "step": 39830 + }, + { + "epoch": 5.933124813821864, + "grad_norm": 0.0201416015625, + "learning_rate": 0.026602667028605257, + "loss": 0.8064, + "num_input_tokens_seen": 23116112, + "step": 39835 + }, + { + "epoch": 5.933869526362824, + "grad_norm": 0.0194091796875, + "learning_rate": 0.026601431274319385, + "loss": 0.8119, + "num_input_tokens_seen": 23119024, + "step": 39840 + }, + { + "epoch": 5.934614238903783, + "grad_norm": 0.0201416015625, + "learning_rate": 0.026600195324039115, + "loss": 0.8089, + "num_input_tokens_seen": 23122320, + "step": 39845 + }, + { + "epoch": 5.9353589514447425, + "grad_norm": 0.0361328125, + "learning_rate": 0.026598959177785337, + "loss": 0.8145, + "num_input_tokens_seen": 23125136, + "step": 39850 + }, + { + "epoch": 5.936103663985701, + "grad_norm": 0.010986328125, + "learning_rate": 0.026597722835578937, + "loss": 0.8167, + "num_input_tokens_seen": 23127952, + "step": 39855 + }, + { + "epoch": 5.936848376526661, + "grad_norm": 0.01318359375, + "learning_rate": 0.026596486297440797, + "loss": 0.8105, + "num_input_tokens_seen": 23130704, + "step": 39860 + }, + { + "epoch": 5.93759308906762, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0265952495633918, + "loss": 0.8055, + "num_input_tokens_seen": 23133296, + "step": 39865 + }, + { + "epoch": 5.938337801608579, + "grad_norm": 0.0191650390625, + "learning_rate": 0.026594012633452853, + "loss": 0.8042, + "num_input_tokens_seen": 23136176, + "step": 39870 + }, + { + "epoch": 5.939082514149538, + "grad_norm": 0.02783203125, + "learning_rate": 0.026592775507644842, + "loss": 0.8079, + "num_input_tokens_seen": 23138928, + "step": 39875 + }, + { + "epoch": 5.939827226690498, + "grad_norm": 0.0205078125, + "learning_rate": 0.026591538185988677, + "loss": 0.8031, + "num_input_tokens_seen": 23141904, + "step": 39880 + }, + { + "epoch": 5.940571939231456, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02659030066850525, + "loss": 0.7991, + "num_input_tokens_seen": 23144688, + "step": 39885 + }, + { + "epoch": 5.941316651772416, + "grad_norm": 0.01434326171875, + "learning_rate": 0.026589062955215473, + "loss": 0.7974, + "num_input_tokens_seen": 23147792, + "step": 39890 + }, + { + "epoch": 5.942061364313375, + "grad_norm": 0.0169677734375, + "learning_rate": 0.026587825046140257, + "loss": 0.7866, + "num_input_tokens_seen": 23150576, + "step": 39895 + }, + { + "epoch": 5.9428060768543345, + "grad_norm": 0.0279541015625, + "learning_rate": 0.026586586941300516, + "loss": 0.8279, + "num_input_tokens_seen": 23153680, + "step": 39900 + }, + { + "epoch": 5.943550789395293, + "grad_norm": 0.01080322265625, + "learning_rate": 0.026585348640717165, + "loss": 0.7971, + "num_input_tokens_seen": 23156976, + "step": 39905 + }, + { + "epoch": 5.944295501936253, + "grad_norm": 0.0211181640625, + "learning_rate": 0.02658411014441112, + "loss": 0.8037, + "num_input_tokens_seen": 23160304, + "step": 39910 + }, + { + "epoch": 5.945040214477212, + "grad_norm": 0.0234375, + "learning_rate": 0.026582871452403307, + "loss": 0.8003, + "num_input_tokens_seen": 23163568, + "step": 39915 + }, + { + "epoch": 5.945784927018171, + "grad_norm": 0.01214599609375, + "learning_rate": 0.026581632564714658, + "loss": 0.8023, + "num_input_tokens_seen": 23166288, + "step": 39920 + }, + { + "epoch": 5.94652963955913, + "grad_norm": 0.01904296875, + "learning_rate": 0.026580393481366097, + "loss": 0.8107, + "num_input_tokens_seen": 23169072, + "step": 39925 + }, + { + "epoch": 5.94727435210009, + "grad_norm": 0.011962890625, + "learning_rate": 0.026579154202378556, + "loss": 0.8044, + "num_input_tokens_seen": 23171920, + "step": 39930 + }, + { + "epoch": 5.9480190646410485, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02657791472777297, + "loss": 0.7993, + "num_input_tokens_seen": 23174672, + "step": 39935 + }, + { + "epoch": 5.948763777182008, + "grad_norm": 0.0224609375, + "learning_rate": 0.02657667505757029, + "loss": 0.8109, + "num_input_tokens_seen": 23177456, + "step": 39940 + }, + { + "epoch": 5.949508489722967, + "grad_norm": 0.0181884765625, + "learning_rate": 0.026575435191791445, + "loss": 0.7898, + "num_input_tokens_seen": 23180400, + "step": 39945 + }, + { + "epoch": 5.9502532022639265, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02657419513045739, + "loss": 0.7839, + "num_input_tokens_seen": 23183056, + "step": 39950 + }, + { + "epoch": 5.950997914804885, + "grad_norm": 0.01202392578125, + "learning_rate": 0.02657295487358907, + "loss": 0.8004, + "num_input_tokens_seen": 23185840, + "step": 39955 + }, + { + "epoch": 5.951742627345844, + "grad_norm": 0.01251220703125, + "learning_rate": 0.02657171442120744, + "loss": 0.8252, + "num_input_tokens_seen": 23188816, + "step": 39960 + }, + { + "epoch": 5.952487339886804, + "grad_norm": 0.0194091796875, + "learning_rate": 0.026570473773333454, + "loss": 0.7979, + "num_input_tokens_seen": 23191376, + "step": 39965 + }, + { + "epoch": 5.953232052427763, + "grad_norm": 0.01165771484375, + "learning_rate": 0.026569232929988074, + "loss": 0.7989, + "num_input_tokens_seen": 23194224, + "step": 39970 + }, + { + "epoch": 5.953976764968722, + "grad_norm": 0.0206298828125, + "learning_rate": 0.026567991891192265, + "loss": 0.8191, + "num_input_tokens_seen": 23197008, + "step": 39975 + }, + { + "epoch": 5.954721477509681, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02656675065696699, + "loss": 0.7862, + "num_input_tokens_seen": 23200080, + "step": 39980 + }, + { + "epoch": 5.9554661900506405, + "grad_norm": 0.0146484375, + "learning_rate": 0.02656550922733322, + "loss": 0.7918, + "num_input_tokens_seen": 23203024, + "step": 39985 + }, + { + "epoch": 5.9562109025916, + "grad_norm": 0.019287109375, + "learning_rate": 0.02656426760231192, + "loss": 0.8049, + "num_input_tokens_seen": 23206288, + "step": 39990 + }, + { + "epoch": 5.956955615132559, + "grad_norm": 0.0203857421875, + "learning_rate": 0.026563025781924078, + "loss": 0.8186, + "num_input_tokens_seen": 23209136, + "step": 39995 + }, + { + "epoch": 5.957700327673518, + "grad_norm": 0.020263671875, + "learning_rate": 0.02656178376619066, + "loss": 0.7986, + "num_input_tokens_seen": 23212272, + "step": 40000 + }, + { + "epoch": 5.958445040214477, + "grad_norm": 0.0172119140625, + "learning_rate": 0.026560541555132662, + "loss": 0.8036, + "num_input_tokens_seen": 23215088, + "step": 40005 + }, + { + "epoch": 5.959189752755436, + "grad_norm": 0.024658203125, + "learning_rate": 0.026559299148771066, + "loss": 0.7955, + "num_input_tokens_seen": 23218000, + "step": 40010 + }, + { + "epoch": 5.959934465296396, + "grad_norm": 0.019287109375, + "learning_rate": 0.026558056547126858, + "loss": 0.8193, + "num_input_tokens_seen": 23220880, + "step": 40015 + }, + { + "epoch": 5.9606791778373545, + "grad_norm": 0.0181884765625, + "learning_rate": 0.02655681375022103, + "loss": 0.8041, + "num_input_tokens_seen": 23223728, + "step": 40020 + }, + { + "epoch": 5.961423890378314, + "grad_norm": 0.0130615234375, + "learning_rate": 0.026555570758074583, + "loss": 0.8216, + "num_input_tokens_seen": 23226608, + "step": 40025 + }, + { + "epoch": 5.962168602919273, + "grad_norm": 0.023193359375, + "learning_rate": 0.026554327570708504, + "loss": 0.8206, + "num_input_tokens_seen": 23229264, + "step": 40030 + }, + { + "epoch": 5.9629133154602325, + "grad_norm": 0.02001953125, + "learning_rate": 0.026553084188143807, + "loss": 0.794, + "num_input_tokens_seen": 23232368, + "step": 40035 + }, + { + "epoch": 5.963658028001191, + "grad_norm": 0.01025390625, + "learning_rate": 0.0265518406104015, + "loss": 0.7869, + "num_input_tokens_seen": 23235120, + "step": 40040 + }, + { + "epoch": 5.964402740542151, + "grad_norm": 0.0264892578125, + "learning_rate": 0.026550596837502587, + "loss": 0.8063, + "num_input_tokens_seen": 23238192, + "step": 40045 + }, + { + "epoch": 5.96514745308311, + "grad_norm": 0.0211181640625, + "learning_rate": 0.026549352869468076, + "loss": 0.8037, + "num_input_tokens_seen": 23241328, + "step": 40050 + }, + { + "epoch": 5.965892165624069, + "grad_norm": 0.0203857421875, + "learning_rate": 0.026548108706318987, + "loss": 0.796, + "num_input_tokens_seen": 23244304, + "step": 40055 + }, + { + "epoch": 5.966636878165028, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02654686434807634, + "loss": 0.7927, + "num_input_tokens_seen": 23247056, + "step": 40060 + }, + { + "epoch": 5.967381590705988, + "grad_norm": 0.027099609375, + "learning_rate": 0.026545619794761156, + "loss": 0.8145, + "num_input_tokens_seen": 23250128, + "step": 40065 + }, + { + "epoch": 5.9681263032469465, + "grad_norm": 0.022705078125, + "learning_rate": 0.026544375046394453, + "loss": 0.8101, + "num_input_tokens_seen": 23253008, + "step": 40070 + }, + { + "epoch": 5.968871015787906, + "grad_norm": 0.01904296875, + "learning_rate": 0.026543130102997272, + "loss": 0.7879, + "num_input_tokens_seen": 23255504, + "step": 40075 + }, + { + "epoch": 5.969615728328865, + "grad_norm": 0.018798828125, + "learning_rate": 0.02654188496459064, + "loss": 0.7991, + "num_input_tokens_seen": 23258288, + "step": 40080 + }, + { + "epoch": 5.9703604408698245, + "grad_norm": 0.018798828125, + "learning_rate": 0.026540639631195594, + "loss": 0.7988, + "num_input_tokens_seen": 23261168, + "step": 40085 + }, + { + "epoch": 5.971105153410783, + "grad_norm": 0.0177001953125, + "learning_rate": 0.02653939410283317, + "loss": 0.8119, + "num_input_tokens_seen": 23263888, + "step": 40090 + }, + { + "epoch": 5.971849865951743, + "grad_norm": 0.022705078125, + "learning_rate": 0.02653814837952441, + "loss": 0.7986, + "num_input_tokens_seen": 23266992, + "step": 40095 + }, + { + "epoch": 5.972594578492702, + "grad_norm": 0.01806640625, + "learning_rate": 0.026536902461290354, + "loss": 0.8091, + "num_input_tokens_seen": 23269904, + "step": 40100 + }, + { + "epoch": 5.973339291033661, + "grad_norm": 0.0159912109375, + "learning_rate": 0.026535656348152064, + "loss": 0.7886, + "num_input_tokens_seen": 23272752, + "step": 40105 + }, + { + "epoch": 5.97408400357462, + "grad_norm": 0.01239013671875, + "learning_rate": 0.026534410040130584, + "loss": 0.8137, + "num_input_tokens_seen": 23275600, + "step": 40110 + }, + { + "epoch": 5.97482871611558, + "grad_norm": 0.01708984375, + "learning_rate": 0.026533163537246964, + "loss": 0.8063, + "num_input_tokens_seen": 23278416, + "step": 40115 + }, + { + "epoch": 5.9755734286565385, + "grad_norm": 0.0186767578125, + "learning_rate": 0.02653191683952227, + "loss": 0.8015, + "num_input_tokens_seen": 23281424, + "step": 40120 + }, + { + "epoch": 5.976318141197497, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02653066994697757, + "loss": 0.8029, + "num_input_tokens_seen": 23284240, + "step": 40125 + }, + { + "epoch": 5.977062853738457, + "grad_norm": 0.023681640625, + "learning_rate": 0.02652942285963391, + "loss": 0.818, + "num_input_tokens_seen": 23287056, + "step": 40130 + }, + { + "epoch": 5.977807566279417, + "grad_norm": 0.02001953125, + "learning_rate": 0.026528175577512372, + "loss": 0.8122, + "num_input_tokens_seen": 23289904, + "step": 40135 + }, + { + "epoch": 5.978552278820375, + "grad_norm": 0.018798828125, + "learning_rate": 0.026526928100634027, + "loss": 0.7966, + "num_input_tokens_seen": 23292912, + "step": 40140 + }, + { + "epoch": 5.979296991361334, + "grad_norm": 0.0185546875, + "learning_rate": 0.026525680429019944, + "loss": 0.8063, + "num_input_tokens_seen": 23295952, + "step": 40145 + }, + { + "epoch": 5.980041703902294, + "grad_norm": 0.033203125, + "learning_rate": 0.026524432562691203, + "loss": 0.8201, + "num_input_tokens_seen": 23298896, + "step": 40150 + }, + { + "epoch": 5.980786416443253, + "grad_norm": 0.01385498046875, + "learning_rate": 0.026523184501668892, + "loss": 0.8016, + "num_input_tokens_seen": 23301840, + "step": 40155 + }, + { + "epoch": 5.981531128984212, + "grad_norm": 0.0281982421875, + "learning_rate": 0.026521936245974084, + "loss": 0.8002, + "num_input_tokens_seen": 23304752, + "step": 40160 + }, + { + "epoch": 5.982275841525171, + "grad_norm": 0.017578125, + "learning_rate": 0.02652068779562788, + "loss": 0.7952, + "num_input_tokens_seen": 23307760, + "step": 40165 + }, + { + "epoch": 5.9830205540661305, + "grad_norm": 0.019287109375, + "learning_rate": 0.026519439150651358, + "loss": 0.808, + "num_input_tokens_seen": 23310608, + "step": 40170 + }, + { + "epoch": 5.983765266607089, + "grad_norm": 0.0185546875, + "learning_rate": 0.026518190311065625, + "loss": 0.7981, + "num_input_tokens_seen": 23313456, + "step": 40175 + }, + { + "epoch": 5.984509979148049, + "grad_norm": 0.0191650390625, + "learning_rate": 0.026516941276891772, + "loss": 0.8119, + "num_input_tokens_seen": 23316752, + "step": 40180 + }, + { + "epoch": 5.985254691689008, + "grad_norm": 0.01312255859375, + "learning_rate": 0.0265156920481509, + "loss": 0.789, + "num_input_tokens_seen": 23319568, + "step": 40185 + }, + { + "epoch": 5.985999404229967, + "grad_norm": 0.0205078125, + "learning_rate": 0.026514442624864118, + "loss": 0.8002, + "num_input_tokens_seen": 23322416, + "step": 40190 + }, + { + "epoch": 5.986744116770926, + "grad_norm": 0.018798828125, + "learning_rate": 0.026513193007052526, + "loss": 0.7795, + "num_input_tokens_seen": 23325232, + "step": 40195 + }, + { + "epoch": 5.987488829311886, + "grad_norm": 0.017333984375, + "learning_rate": 0.026511943194737243, + "loss": 0.7847, + "num_input_tokens_seen": 23328272, + "step": 40200 + }, + { + "epoch": 5.9882335418528445, + "grad_norm": 0.017578125, + "learning_rate": 0.026510693187939378, + "loss": 0.7838, + "num_input_tokens_seen": 23331376, + "step": 40205 + }, + { + "epoch": 5.988978254393804, + "grad_norm": 0.0205078125, + "learning_rate": 0.02650944298668005, + "loss": 0.8135, + "num_input_tokens_seen": 23334352, + "step": 40210 + }, + { + "epoch": 5.989722966934763, + "grad_norm": 0.021484375, + "learning_rate": 0.02650819259098038, + "loss": 0.7883, + "num_input_tokens_seen": 23337296, + "step": 40215 + }, + { + "epoch": 5.990467679475723, + "grad_norm": 0.0198974609375, + "learning_rate": 0.0265069420008615, + "loss": 0.8037, + "num_input_tokens_seen": 23340432, + "step": 40220 + }, + { + "epoch": 5.991212392016681, + "grad_norm": 0.0244140625, + "learning_rate": 0.026505691216344524, + "loss": 0.8193, + "num_input_tokens_seen": 23343120, + "step": 40225 + }, + { + "epoch": 5.991957104557641, + "grad_norm": 0.01519775390625, + "learning_rate": 0.026504440237450584, + "loss": 0.8121, + "num_input_tokens_seen": 23345936, + "step": 40230 + }, + { + "epoch": 5.9927018170986, + "grad_norm": 0.035400390625, + "learning_rate": 0.026503189064200825, + "loss": 0.8388, + "num_input_tokens_seen": 23348912, + "step": 40235 + }, + { + "epoch": 5.993446529639559, + "grad_norm": 0.025634765625, + "learning_rate": 0.02650193769661638, + "loss": 0.7864, + "num_input_tokens_seen": 23352208, + "step": 40240 + }, + { + "epoch": 5.994191242180518, + "grad_norm": 0.0301513671875, + "learning_rate": 0.026500686134718385, + "loss": 0.8144, + "num_input_tokens_seen": 23355376, + "step": 40245 + }, + { + "epoch": 5.994935954721478, + "grad_norm": 0.025390625, + "learning_rate": 0.026499434378527986, + "loss": 0.7952, + "num_input_tokens_seen": 23358160, + "step": 40250 + }, + { + "epoch": 5.9956806672624365, + "grad_norm": 0.0247802734375, + "learning_rate": 0.02649818242806633, + "loss": 0.815, + "num_input_tokens_seen": 23361040, + "step": 40255 + }, + { + "epoch": 5.996425379803396, + "grad_norm": 0.02490234375, + "learning_rate": 0.026496930283354567, + "loss": 0.8053, + "num_input_tokens_seen": 23364016, + "step": 40260 + }, + { + "epoch": 5.997170092344355, + "grad_norm": 0.01251220703125, + "learning_rate": 0.026495677944413855, + "loss": 0.8148, + "num_input_tokens_seen": 23366800, + "step": 40265 + }, + { + "epoch": 5.997914804885315, + "grad_norm": 0.0111083984375, + "learning_rate": 0.026494425411265347, + "loss": 0.8127, + "num_input_tokens_seen": 23369296, + "step": 40270 + }, + { + "epoch": 5.998659517426273, + "grad_norm": 0.0264892578125, + "learning_rate": 0.026493172683930203, + "loss": 0.8071, + "num_input_tokens_seen": 23372112, + "step": 40275 + }, + { + "epoch": 5.999404229967233, + "grad_norm": 0.0230712890625, + "learning_rate": 0.026491919762429587, + "loss": 0.815, + "num_input_tokens_seen": 23374992, + "step": 40280 + }, + { + "epoch": 6.0, + "eval_loss": 0.8028715252876282, + "eval_runtime": 70.6804, + "eval_samples_per_second": 42.218, + "eval_steps_per_second": 10.555, + "num_input_tokens_seen": 23376776, + "step": 40284 + }, + { + "epoch": 6.000148942508192, + "grad_norm": 0.02294921875, + "learning_rate": 0.026490666646784667, + "loss": 0.7954, + "num_input_tokens_seen": 23377352, + "step": 40285 + }, + { + "epoch": 6.000893655049151, + "grad_norm": 0.0216064453125, + "learning_rate": 0.026489413337016616, + "loss": 0.8146, + "num_input_tokens_seen": 23379944, + "step": 40290 + }, + { + "epoch": 6.00163836759011, + "grad_norm": 0.0234375, + "learning_rate": 0.026488159833146604, + "loss": 0.8028, + "num_input_tokens_seen": 23382792, + "step": 40295 + }, + { + "epoch": 6.00238308013107, + "grad_norm": 0.0230712890625, + "learning_rate": 0.026486906135195804, + "loss": 0.8099, + "num_input_tokens_seen": 23385960, + "step": 40300 + }, + { + "epoch": 6.003127792672029, + "grad_norm": 0.0203857421875, + "learning_rate": 0.0264856522431854, + "loss": 0.8159, + "num_input_tokens_seen": 23389160, + "step": 40305 + }, + { + "epoch": 6.003872505212988, + "grad_norm": 0.0189208984375, + "learning_rate": 0.026484398157136574, + "loss": 0.7987, + "num_input_tokens_seen": 23392040, + "step": 40310 + }, + { + "epoch": 6.004617217753947, + "grad_norm": 0.022216796875, + "learning_rate": 0.026483143877070518, + "loss": 0.8204, + "num_input_tokens_seen": 23394984, + "step": 40315 + }, + { + "epoch": 6.005361930294906, + "grad_norm": 0.01177978515625, + "learning_rate": 0.026481889403008413, + "loss": 0.8126, + "num_input_tokens_seen": 23397768, + "step": 40320 + }, + { + "epoch": 6.006106642835865, + "grad_norm": 0.0184326171875, + "learning_rate": 0.026480634734971458, + "loss": 0.8167, + "num_input_tokens_seen": 23400648, + "step": 40325 + }, + { + "epoch": 6.006851355376824, + "grad_norm": 0.031005859375, + "learning_rate": 0.026479379872980845, + "loss": 0.8028, + "num_input_tokens_seen": 23403720, + "step": 40330 + }, + { + "epoch": 6.007596067917784, + "grad_norm": 0.01153564453125, + "learning_rate": 0.026478124817057778, + "loss": 0.8111, + "num_input_tokens_seen": 23406664, + "step": 40335 + }, + { + "epoch": 6.0083407804587425, + "grad_norm": 0.01300048828125, + "learning_rate": 0.02647686956722346, + "loss": 0.7979, + "num_input_tokens_seen": 23409352, + "step": 40340 + }, + { + "epoch": 6.009085492999702, + "grad_norm": 0.02197265625, + "learning_rate": 0.02647561412349909, + "loss": 0.7953, + "num_input_tokens_seen": 23412104, + "step": 40345 + }, + { + "epoch": 6.009830205540661, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02647435848590589, + "loss": 0.8078, + "num_input_tokens_seen": 23414888, + "step": 40350 + }, + { + "epoch": 6.010574918081621, + "grad_norm": 0.02001953125, + "learning_rate": 0.02647310265446506, + "loss": 0.8053, + "num_input_tokens_seen": 23417608, + "step": 40355 + }, + { + "epoch": 6.011319630622579, + "grad_norm": 0.022705078125, + "learning_rate": 0.026471846629197823, + "loss": 0.7956, + "num_input_tokens_seen": 23420584, + "step": 40360 + }, + { + "epoch": 6.012064343163539, + "grad_norm": 0.018798828125, + "learning_rate": 0.026470590410125394, + "loss": 0.7988, + "num_input_tokens_seen": 23423528, + "step": 40365 + }, + { + "epoch": 6.012809055704498, + "grad_norm": 0.0167236328125, + "learning_rate": 0.026469333997269, + "loss": 0.806, + "num_input_tokens_seen": 23426280, + "step": 40370 + }, + { + "epoch": 6.013553768245457, + "grad_norm": 0.02001953125, + "learning_rate": 0.026468077390649866, + "loss": 0.794, + "num_input_tokens_seen": 23429416, + "step": 40375 + }, + { + "epoch": 6.014298480786416, + "grad_norm": 0.0125732421875, + "learning_rate": 0.026466820590289222, + "loss": 0.8207, + "num_input_tokens_seen": 23432360, + "step": 40380 + }, + { + "epoch": 6.015043193327376, + "grad_norm": 0.025390625, + "learning_rate": 0.026465563596208296, + "loss": 0.7922, + "num_input_tokens_seen": 23435336, + "step": 40385 + }, + { + "epoch": 6.015787905868335, + "grad_norm": 0.02392578125, + "learning_rate": 0.026464306408428326, + "loss": 0.7879, + "num_input_tokens_seen": 23438408, + "step": 40390 + }, + { + "epoch": 6.016532618409294, + "grad_norm": 0.017578125, + "learning_rate": 0.026463049026970553, + "loss": 0.8054, + "num_input_tokens_seen": 23441416, + "step": 40395 + }, + { + "epoch": 6.017277330950253, + "grad_norm": 0.0186767578125, + "learning_rate": 0.02646179145185621, + "loss": 0.8142, + "num_input_tokens_seen": 23444392, + "step": 40400 + }, + { + "epoch": 6.018022043491213, + "grad_norm": 0.0260009765625, + "learning_rate": 0.026460533683106555, + "loss": 0.7965, + "num_input_tokens_seen": 23447272, + "step": 40405 + }, + { + "epoch": 6.018766756032171, + "grad_norm": 0.018798828125, + "learning_rate": 0.026459275720742834, + "loss": 0.8062, + "num_input_tokens_seen": 23450344, + "step": 40410 + }, + { + "epoch": 6.019511468573131, + "grad_norm": 0.0111083984375, + "learning_rate": 0.026458017564786294, + "loss": 0.8125, + "num_input_tokens_seen": 23453192, + "step": 40415 + }, + { + "epoch": 6.02025618111409, + "grad_norm": 0.01214599609375, + "learning_rate": 0.026456759215258193, + "loss": 0.8032, + "num_input_tokens_seen": 23456040, + "step": 40420 + }, + { + "epoch": 6.021000893655049, + "grad_norm": 0.01708984375, + "learning_rate": 0.026455500672179787, + "loss": 0.7908, + "num_input_tokens_seen": 23458664, + "step": 40425 + }, + { + "epoch": 6.021745606196008, + "grad_norm": 0.02978515625, + "learning_rate": 0.026454241935572345, + "loss": 0.8181, + "num_input_tokens_seen": 23461512, + "step": 40430 + }, + { + "epoch": 6.022490318736968, + "grad_norm": 0.01953125, + "learning_rate": 0.026452983005457124, + "loss": 0.7959, + "num_input_tokens_seen": 23464136, + "step": 40435 + }, + { + "epoch": 6.023235031277927, + "grad_norm": 0.022705078125, + "learning_rate": 0.026451723881855394, + "loss": 0.8031, + "num_input_tokens_seen": 23466920, + "step": 40440 + }, + { + "epoch": 6.023979743818886, + "grad_norm": 0.021240234375, + "learning_rate": 0.026450464564788428, + "loss": 0.8144, + "num_input_tokens_seen": 23469640, + "step": 40445 + }, + { + "epoch": 6.024724456359845, + "grad_norm": 0.0189208984375, + "learning_rate": 0.026449205054277503, + "loss": 0.8042, + "num_input_tokens_seen": 23472552, + "step": 40450 + }, + { + "epoch": 6.025469168900805, + "grad_norm": 0.0198974609375, + "learning_rate": 0.026447945350343897, + "loss": 0.8077, + "num_input_tokens_seen": 23475688, + "step": 40455 + }, + { + "epoch": 6.026213881441763, + "grad_norm": 0.0272216796875, + "learning_rate": 0.026446685453008885, + "loss": 0.8154, + "num_input_tokens_seen": 23478664, + "step": 40460 + }, + { + "epoch": 6.026958593982723, + "grad_norm": 0.01123046875, + "learning_rate": 0.02644542536229376, + "loss": 0.7999, + "num_input_tokens_seen": 23481352, + "step": 40465 + }, + { + "epoch": 6.027703306523682, + "grad_norm": 0.0101318359375, + "learning_rate": 0.0264441650782198, + "loss": 0.7985, + "num_input_tokens_seen": 23484040, + "step": 40470 + }, + { + "epoch": 6.0284480190646414, + "grad_norm": 0.0130615234375, + "learning_rate": 0.026442904600808313, + "loss": 0.816, + "num_input_tokens_seen": 23486952, + "step": 40475 + }, + { + "epoch": 6.0291927316056, + "grad_norm": 0.021728515625, + "learning_rate": 0.026441643930080572, + "loss": 0.8003, + "num_input_tokens_seen": 23489768, + "step": 40480 + }, + { + "epoch": 6.02993744414656, + "grad_norm": 0.0174560546875, + "learning_rate": 0.026440383066057893, + "loss": 0.8164, + "num_input_tokens_seen": 23492904, + "step": 40485 + }, + { + "epoch": 6.030682156687519, + "grad_norm": 0.022705078125, + "learning_rate": 0.026439122008761567, + "loss": 0.8122, + "num_input_tokens_seen": 23495624, + "step": 40490 + }, + { + "epoch": 6.031426869228477, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0264378607582129, + "loss": 0.7888, + "num_input_tokens_seen": 23498664, + "step": 40495 + }, + { + "epoch": 6.032171581769437, + "grad_norm": 0.02197265625, + "learning_rate": 0.026436599314433196, + "loss": 0.8028, + "num_input_tokens_seen": 23501704, + "step": 40500 + }, + { + "epoch": 6.032916294310396, + "grad_norm": 0.031005859375, + "learning_rate": 0.026435337677443774, + "loss": 0.807, + "num_input_tokens_seen": 23504936, + "step": 40505 + }, + { + "epoch": 6.033661006851355, + "grad_norm": 0.0186767578125, + "learning_rate": 0.026434075847265946, + "loss": 0.8056, + "num_input_tokens_seen": 23507528, + "step": 40510 + }, + { + "epoch": 6.034405719392314, + "grad_norm": 0.022705078125, + "learning_rate": 0.026432813823921025, + "loss": 0.8005, + "num_input_tokens_seen": 23510312, + "step": 40515 + }, + { + "epoch": 6.035150431933274, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02643155160743034, + "loss": 0.8055, + "num_input_tokens_seen": 23513416, + "step": 40520 + }, + { + "epoch": 6.035895144474233, + "grad_norm": 0.0201416015625, + "learning_rate": 0.026430289197815204, + "loss": 0.8154, + "num_input_tokens_seen": 23516552, + "step": 40525 + }, + { + "epoch": 6.036639857015192, + "grad_norm": 0.01708984375, + "learning_rate": 0.026429026595096952, + "loss": 0.8237, + "num_input_tokens_seen": 23519336, + "step": 40530 + }, + { + "epoch": 6.037384569556151, + "grad_norm": 0.01806640625, + "learning_rate": 0.02642776379929691, + "loss": 0.7911, + "num_input_tokens_seen": 23522504, + "step": 40535 + }, + { + "epoch": 6.038129282097111, + "grad_norm": 0.0233154296875, + "learning_rate": 0.026426500810436414, + "loss": 0.806, + "num_input_tokens_seen": 23525032, + "step": 40540 + }, + { + "epoch": 6.038873994638069, + "grad_norm": 0.025390625, + "learning_rate": 0.026425237628536796, + "loss": 0.7945, + "num_input_tokens_seen": 23528072, + "step": 40545 + }, + { + "epoch": 6.039618707179029, + "grad_norm": 0.01806640625, + "learning_rate": 0.026423974253619406, + "loss": 0.7861, + "num_input_tokens_seen": 23530888, + "step": 40550 + }, + { + "epoch": 6.040363419719988, + "grad_norm": 0.02734375, + "learning_rate": 0.026422710685705578, + "loss": 0.8113, + "num_input_tokens_seen": 23534024, + "step": 40555 + }, + { + "epoch": 6.0411081322609474, + "grad_norm": 0.019287109375, + "learning_rate": 0.026421446924816665, + "loss": 0.7755, + "num_input_tokens_seen": 23536744, + "step": 40560 + }, + { + "epoch": 6.041852844801906, + "grad_norm": 0.0260009765625, + "learning_rate": 0.026420182970974013, + "loss": 0.7917, + "num_input_tokens_seen": 23539720, + "step": 40565 + }, + { + "epoch": 6.042597557342866, + "grad_norm": 0.019775390625, + "learning_rate": 0.02641891882419898, + "loss": 0.7819, + "num_input_tokens_seen": 23542664, + "step": 40570 + }, + { + "epoch": 6.043342269883825, + "grad_norm": 0.0181884765625, + "learning_rate": 0.02641765448451291, + "loss": 0.8231, + "num_input_tokens_seen": 23545512, + "step": 40575 + }, + { + "epoch": 6.044086982424784, + "grad_norm": 0.022216796875, + "learning_rate": 0.02641638995193718, + "loss": 0.7948, + "num_input_tokens_seen": 23548424, + "step": 40580 + }, + { + "epoch": 6.044831694965743, + "grad_norm": 0.0125732421875, + "learning_rate": 0.026415125226493145, + "loss": 0.7866, + "num_input_tokens_seen": 23551400, + "step": 40585 + }, + { + "epoch": 6.045576407506703, + "grad_norm": 0.018798828125, + "learning_rate": 0.026413860308202173, + "loss": 0.812, + "num_input_tokens_seen": 23554568, + "step": 40590 + }, + { + "epoch": 6.046321120047661, + "grad_norm": 0.0169677734375, + "learning_rate": 0.026412595197085626, + "loss": 0.7883, + "num_input_tokens_seen": 23557256, + "step": 40595 + }, + { + "epoch": 6.047065832588621, + "grad_norm": 0.019287109375, + "learning_rate": 0.02641132989316488, + "loss": 0.7871, + "num_input_tokens_seen": 23560040, + "step": 40600 + }, + { + "epoch": 6.04781054512958, + "grad_norm": 0.017578125, + "learning_rate": 0.02641006439646132, + "loss": 0.8106, + "num_input_tokens_seen": 23562760, + "step": 40605 + }, + { + "epoch": 6.0485552576705395, + "grad_norm": 0.0177001953125, + "learning_rate": 0.026408798706996316, + "loss": 0.8016, + "num_input_tokens_seen": 23565768, + "step": 40610 + }, + { + "epoch": 6.049299970211498, + "grad_norm": 0.0245361328125, + "learning_rate": 0.026407532824791254, + "loss": 0.8247, + "num_input_tokens_seen": 23568712, + "step": 40615 + }, + { + "epoch": 6.050044682752458, + "grad_norm": 0.0230712890625, + "learning_rate": 0.02640626674986752, + "loss": 0.8307, + "num_input_tokens_seen": 23571592, + "step": 40620 + }, + { + "epoch": 6.050789395293417, + "grad_norm": 0.016845703125, + "learning_rate": 0.026405000482246498, + "loss": 0.8055, + "num_input_tokens_seen": 23574600, + "step": 40625 + }, + { + "epoch": 6.051534107834376, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02640373402194959, + "loss": 0.8037, + "num_input_tokens_seen": 23577448, + "step": 40630 + }, + { + "epoch": 6.052278820375335, + "grad_norm": 0.0174560546875, + "learning_rate": 0.02640246736899818, + "loss": 0.7992, + "num_input_tokens_seen": 23580040, + "step": 40635 + }, + { + "epoch": 6.053023532916295, + "grad_norm": 0.0198974609375, + "learning_rate": 0.026401200523413677, + "loss": 0.8002, + "num_input_tokens_seen": 23583048, + "step": 40640 + }, + { + "epoch": 6.0537682454572534, + "grad_norm": 0.0185546875, + "learning_rate": 0.026399933485217478, + "loss": 0.7959, + "num_input_tokens_seen": 23585832, + "step": 40645 + }, + { + "epoch": 6.054512957998213, + "grad_norm": 0.01226806640625, + "learning_rate": 0.026398666254430988, + "loss": 0.7908, + "num_input_tokens_seen": 23588808, + "step": 40650 + }, + { + "epoch": 6.055257670539172, + "grad_norm": 0.017578125, + "learning_rate": 0.026397398831075617, + "loss": 0.8123, + "num_input_tokens_seen": 23591656, + "step": 40655 + }, + { + "epoch": 6.0560023830801315, + "grad_norm": 0.025390625, + "learning_rate": 0.026396131215172775, + "loss": 0.809, + "num_input_tokens_seen": 23594184, + "step": 40660 + }, + { + "epoch": 6.05674709562109, + "grad_norm": 0.0181884765625, + "learning_rate": 0.026394863406743883, + "loss": 0.7866, + "num_input_tokens_seen": 23597032, + "step": 40665 + }, + { + "epoch": 6.057491808162049, + "grad_norm": 0.01806640625, + "learning_rate": 0.026393595405810353, + "loss": 0.8011, + "num_input_tokens_seen": 23599880, + "step": 40670 + }, + { + "epoch": 6.058236520703009, + "grad_norm": 0.025390625, + "learning_rate": 0.02639232721239361, + "loss": 0.7865, + "num_input_tokens_seen": 23602728, + "step": 40675 + }, + { + "epoch": 6.058981233243967, + "grad_norm": 0.0174560546875, + "learning_rate": 0.026391058826515074, + "loss": 0.7972, + "num_input_tokens_seen": 23605640, + "step": 40680 + }, + { + "epoch": 6.059725945784927, + "grad_norm": 0.01251220703125, + "learning_rate": 0.02638979024819618, + "loss": 0.8177, + "num_input_tokens_seen": 23608264, + "step": 40685 + }, + { + "epoch": 6.060470658325886, + "grad_norm": 0.0140380859375, + "learning_rate": 0.026388521477458352, + "loss": 0.8158, + "num_input_tokens_seen": 23611176, + "step": 40690 + }, + { + "epoch": 6.0612153708668455, + "grad_norm": 0.0235595703125, + "learning_rate": 0.02638725251432303, + "loss": 0.8091, + "num_input_tokens_seen": 23614344, + "step": 40695 + }, + { + "epoch": 6.061960083407804, + "grad_norm": 0.01708984375, + "learning_rate": 0.02638598335881165, + "loss": 0.8151, + "num_input_tokens_seen": 23617192, + "step": 40700 + }, + { + "epoch": 6.062704795948764, + "grad_norm": 0.011474609375, + "learning_rate": 0.026384714010945654, + "loss": 0.7886, + "num_input_tokens_seen": 23620456, + "step": 40705 + }, + { + "epoch": 6.063449508489723, + "grad_norm": 0.013671875, + "learning_rate": 0.026383444470746486, + "loss": 0.8105, + "num_input_tokens_seen": 23623464, + "step": 40710 + }, + { + "epoch": 6.064194221030682, + "grad_norm": 0.0191650390625, + "learning_rate": 0.026382174738235592, + "loss": 0.7948, + "num_input_tokens_seen": 23626696, + "step": 40715 + }, + { + "epoch": 6.064938933571641, + "grad_norm": 0.023193359375, + "learning_rate": 0.026380904813434424, + "loss": 0.812, + "num_input_tokens_seen": 23629608, + "step": 40720 + }, + { + "epoch": 6.065683646112601, + "grad_norm": 0.0118408203125, + "learning_rate": 0.026379634696364437, + "loss": 0.7823, + "num_input_tokens_seen": 23632520, + "step": 40725 + }, + { + "epoch": 6.0664283586535594, + "grad_norm": 0.01239013671875, + "learning_rate": 0.026378364387047087, + "loss": 0.7999, + "num_input_tokens_seen": 23635528, + "step": 40730 + }, + { + "epoch": 6.067173071194519, + "grad_norm": 0.020263671875, + "learning_rate": 0.026377093885503838, + "loss": 0.7989, + "num_input_tokens_seen": 23638376, + "step": 40735 + }, + { + "epoch": 6.067917783735478, + "grad_norm": 0.01708984375, + "learning_rate": 0.026375823191756145, + "loss": 0.7811, + "num_input_tokens_seen": 23641192, + "step": 40740 + }, + { + "epoch": 6.0686624962764375, + "grad_norm": 0.01318359375, + "learning_rate": 0.026374552305825485, + "loss": 0.8358, + "num_input_tokens_seen": 23644040, + "step": 40745 + }, + { + "epoch": 6.069407208817396, + "grad_norm": 0.0189208984375, + "learning_rate": 0.026373281227733327, + "loss": 0.7959, + "num_input_tokens_seen": 23647048, + "step": 40750 + }, + { + "epoch": 6.070151921358356, + "grad_norm": 0.0185546875, + "learning_rate": 0.02637200995750114, + "loss": 0.804, + "num_input_tokens_seen": 23649768, + "step": 40755 + }, + { + "epoch": 6.070896633899315, + "grad_norm": 0.017822265625, + "learning_rate": 0.0263707384951504, + "loss": 0.8259, + "num_input_tokens_seen": 23652648, + "step": 40760 + }, + { + "epoch": 6.071641346440274, + "grad_norm": 0.01177978515625, + "learning_rate": 0.026369466840702595, + "loss": 0.7867, + "num_input_tokens_seen": 23655336, + "step": 40765 + }, + { + "epoch": 6.072386058981233, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0263681949941792, + "loss": 0.8031, + "num_input_tokens_seen": 23658504, + "step": 40770 + }, + { + "epoch": 6.073130771522193, + "grad_norm": 0.02392578125, + "learning_rate": 0.026366922955601706, + "loss": 0.8248, + "num_input_tokens_seen": 23661288, + "step": 40775 + }, + { + "epoch": 6.0738754840631515, + "grad_norm": 0.018798828125, + "learning_rate": 0.0263656507249916, + "loss": 0.794, + "num_input_tokens_seen": 23664328, + "step": 40780 + }, + { + "epoch": 6.074620196604111, + "grad_norm": 0.019287109375, + "learning_rate": 0.026364378302370373, + "loss": 0.7996, + "num_input_tokens_seen": 23667240, + "step": 40785 + }, + { + "epoch": 6.07536490914507, + "grad_norm": 0.0296630859375, + "learning_rate": 0.02636310568775953, + "loss": 0.7946, + "num_input_tokens_seen": 23669960, + "step": 40790 + }, + { + "epoch": 6.0761096216860295, + "grad_norm": 0.01092529296875, + "learning_rate": 0.026361832881180564, + "loss": 0.8111, + "num_input_tokens_seen": 23672904, + "step": 40795 + }, + { + "epoch": 6.076854334226988, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02636055988265498, + "loss": 0.7885, + "num_input_tokens_seen": 23676040, + "step": 40800 + }, + { + "epoch": 6.077599046767948, + "grad_norm": 0.018310546875, + "learning_rate": 0.026359286692204276, + "loss": 0.7916, + "num_input_tokens_seen": 23678984, + "step": 40805 + }, + { + "epoch": 6.078343759308907, + "grad_norm": 0.020263671875, + "learning_rate": 0.026358013309849975, + "loss": 0.8009, + "num_input_tokens_seen": 23681800, + "step": 40810 + }, + { + "epoch": 6.079088471849866, + "grad_norm": 0.035888671875, + "learning_rate": 0.026356739735613584, + "loss": 0.8261, + "num_input_tokens_seen": 23684936, + "step": 40815 + }, + { + "epoch": 6.079833184390825, + "grad_norm": 0.02197265625, + "learning_rate": 0.026355465969516618, + "loss": 0.8069, + "num_input_tokens_seen": 23687816, + "step": 40820 + }, + { + "epoch": 6.080577896931785, + "grad_norm": 0.0240478515625, + "learning_rate": 0.02635419201158059, + "loss": 0.8111, + "num_input_tokens_seen": 23690760, + "step": 40825 + }, + { + "epoch": 6.0813226094727435, + "grad_norm": 0.0244140625, + "learning_rate": 0.026352917861827033, + "loss": 0.8156, + "num_input_tokens_seen": 23693608, + "step": 40830 + }, + { + "epoch": 6.082067322013703, + "grad_norm": 0.01324462890625, + "learning_rate": 0.026351643520277468, + "loss": 0.7926, + "num_input_tokens_seen": 23696360, + "step": 40835 + }, + { + "epoch": 6.082812034554662, + "grad_norm": 0.0185546875, + "learning_rate": 0.02635036898695342, + "loss": 0.8036, + "num_input_tokens_seen": 23699080, + "step": 40840 + }, + { + "epoch": 6.083556747095621, + "grad_norm": 0.01165771484375, + "learning_rate": 0.026349094261876423, + "loss": 0.8129, + "num_input_tokens_seen": 23702184, + "step": 40845 + }, + { + "epoch": 6.08430145963658, + "grad_norm": 0.017822265625, + "learning_rate": 0.026347819345068014, + "loss": 0.8128, + "num_input_tokens_seen": 23704936, + "step": 40850 + }, + { + "epoch": 6.085046172177539, + "grad_norm": 0.0174560546875, + "learning_rate": 0.026346544236549734, + "loss": 0.7866, + "num_input_tokens_seen": 23707496, + "step": 40855 + }, + { + "epoch": 6.085790884718499, + "grad_norm": 0.01129150390625, + "learning_rate": 0.026345268936343116, + "loss": 0.8024, + "num_input_tokens_seen": 23710216, + "step": 40860 + }, + { + "epoch": 6.0865355972594575, + "grad_norm": 0.0120849609375, + "learning_rate": 0.026343993444469714, + "loss": 0.8242, + "num_input_tokens_seen": 23713160, + "step": 40865 + }, + { + "epoch": 6.087280309800417, + "grad_norm": 0.026611328125, + "learning_rate": 0.02634271776095107, + "loss": 0.8146, + "num_input_tokens_seen": 23716200, + "step": 40870 + }, + { + "epoch": 6.088025022341376, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02634144188580874, + "loss": 0.8121, + "num_input_tokens_seen": 23719464, + "step": 40875 + }, + { + "epoch": 6.0887697348823355, + "grad_norm": 0.01953125, + "learning_rate": 0.026340165819064277, + "loss": 0.7726, + "num_input_tokens_seen": 23722312, + "step": 40880 + }, + { + "epoch": 6.089514447423294, + "grad_norm": 0.01904296875, + "learning_rate": 0.026338889560739234, + "loss": 0.7953, + "num_input_tokens_seen": 23724968, + "step": 40885 + }, + { + "epoch": 6.090259159964254, + "grad_norm": 0.0179443359375, + "learning_rate": 0.026337613110855182, + "loss": 0.7877, + "num_input_tokens_seen": 23727752, + "step": 40890 + }, + { + "epoch": 6.091003872505213, + "grad_norm": 0.036376953125, + "learning_rate": 0.02633633646943367, + "loss": 0.8073, + "num_input_tokens_seen": 23730600, + "step": 40895 + }, + { + "epoch": 6.091748585046172, + "grad_norm": 0.0179443359375, + "learning_rate": 0.026335059636496284, + "loss": 0.8004, + "num_input_tokens_seen": 23733544, + "step": 40900 + }, + { + "epoch": 6.092493297587131, + "grad_norm": 0.0198974609375, + "learning_rate": 0.026333782612064583, + "loss": 0.8155, + "num_input_tokens_seen": 23736584, + "step": 40905 + }, + { + "epoch": 6.093238010128091, + "grad_norm": 0.013671875, + "learning_rate": 0.026332505396160145, + "loss": 0.8014, + "num_input_tokens_seen": 23739592, + "step": 40910 + }, + { + "epoch": 6.0939827226690495, + "grad_norm": 0.02099609375, + "learning_rate": 0.02633122798880454, + "loss": 0.7884, + "num_input_tokens_seen": 23742408, + "step": 40915 + }, + { + "epoch": 6.094727435210009, + "grad_norm": 0.03857421875, + "learning_rate": 0.026329950390019362, + "loss": 0.8076, + "num_input_tokens_seen": 23745256, + "step": 40920 + }, + { + "epoch": 6.095472147750968, + "grad_norm": 0.01348876953125, + "learning_rate": 0.026328672599826186, + "loss": 0.7979, + "num_input_tokens_seen": 23748168, + "step": 40925 + }, + { + "epoch": 6.0962168602919276, + "grad_norm": 0.0301513671875, + "learning_rate": 0.026327394618246597, + "loss": 0.788, + "num_input_tokens_seen": 23751112, + "step": 40930 + }, + { + "epoch": 6.096961572832886, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02632611644530219, + "loss": 0.7847, + "num_input_tokens_seen": 23753960, + "step": 40935 + }, + { + "epoch": 6.097706285373846, + "grad_norm": 0.01251220703125, + "learning_rate": 0.026324838081014563, + "loss": 0.8126, + "num_input_tokens_seen": 23756808, + "step": 40940 + }, + { + "epoch": 6.098450997914805, + "grad_norm": 0.0115966796875, + "learning_rate": 0.0263235595254053, + "loss": 0.8036, + "num_input_tokens_seen": 23759816, + "step": 40945 + }, + { + "epoch": 6.099195710455764, + "grad_norm": 0.02392578125, + "learning_rate": 0.026322280778496007, + "loss": 0.8048, + "num_input_tokens_seen": 23762568, + "step": 40950 + }, + { + "epoch": 6.099940422996723, + "grad_norm": 0.0247802734375, + "learning_rate": 0.026321001840308292, + "loss": 0.7913, + "num_input_tokens_seen": 23765512, + "step": 40955 + }, + { + "epoch": 6.100685135537683, + "grad_norm": 0.0263671875, + "learning_rate": 0.026319722710863754, + "loss": 0.786, + "num_input_tokens_seen": 23768296, + "step": 40960 + }, + { + "epoch": 6.1014298480786415, + "grad_norm": 0.024169921875, + "learning_rate": 0.026318443390184007, + "loss": 0.8154, + "num_input_tokens_seen": 23771144, + "step": 40965 + }, + { + "epoch": 6.102174560619601, + "grad_norm": 0.0224609375, + "learning_rate": 0.026317163878290663, + "loss": 0.8091, + "num_input_tokens_seen": 23774248, + "step": 40970 + }, + { + "epoch": 6.10291927316056, + "grad_norm": 0.0184326171875, + "learning_rate": 0.026315884175205338, + "loss": 0.8, + "num_input_tokens_seen": 23777384, + "step": 40975 + }, + { + "epoch": 6.10366398570152, + "grad_norm": 0.021728515625, + "learning_rate": 0.026314604280949647, + "loss": 0.7829, + "num_input_tokens_seen": 23780456, + "step": 40980 + }, + { + "epoch": 6.104408698242478, + "grad_norm": 0.025390625, + "learning_rate": 0.02631332419554522, + "loss": 0.7886, + "num_input_tokens_seen": 23783464, + "step": 40985 + }, + { + "epoch": 6.105153410783438, + "grad_norm": 0.031494140625, + "learning_rate": 0.026312043919013678, + "loss": 0.787, + "num_input_tokens_seen": 23786344, + "step": 40990 + }, + { + "epoch": 6.105898123324397, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02631076345137665, + "loss": 0.7942, + "num_input_tokens_seen": 23789480, + "step": 40995 + }, + { + "epoch": 6.106642835865356, + "grad_norm": 0.01324462890625, + "learning_rate": 0.02630948279265577, + "loss": 0.81, + "num_input_tokens_seen": 23792840, + "step": 41000 + }, + { + "epoch": 6.107387548406315, + "grad_norm": 0.0281982421875, + "learning_rate": 0.02630820194287267, + "loss": 0.8163, + "num_input_tokens_seen": 23795432, + "step": 41005 + }, + { + "epoch": 6.108132260947274, + "grad_norm": 0.0224609375, + "learning_rate": 0.026306920902048996, + "loss": 0.7814, + "num_input_tokens_seen": 23798216, + "step": 41010 + }, + { + "epoch": 6.1088769734882336, + "grad_norm": 0.0279541015625, + "learning_rate": 0.026305639670206382, + "loss": 0.7812, + "num_input_tokens_seen": 23801064, + "step": 41015 + }, + { + "epoch": 6.109621686029192, + "grad_norm": 0.01397705078125, + "learning_rate": 0.026304358247366478, + "loss": 0.814, + "num_input_tokens_seen": 23803752, + "step": 41020 + }, + { + "epoch": 6.110366398570152, + "grad_norm": 0.0191650390625, + "learning_rate": 0.026303076633550932, + "loss": 0.8029, + "num_input_tokens_seen": 23806696, + "step": 41025 + }, + { + "epoch": 6.111111111111111, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02630179482878139, + "loss": 0.8431, + "num_input_tokens_seen": 23809480, + "step": 41030 + }, + { + "epoch": 6.11185582365207, + "grad_norm": 0.019775390625, + "learning_rate": 0.026300512833079517, + "loss": 0.8024, + "num_input_tokens_seen": 23811944, + "step": 41035 + }, + { + "epoch": 6.112600536193029, + "grad_norm": 0.02880859375, + "learning_rate": 0.02629923064646696, + "loss": 0.7979, + "num_input_tokens_seen": 23814856, + "step": 41040 + }, + { + "epoch": 6.113345248733989, + "grad_norm": 0.019287109375, + "learning_rate": 0.026297948268965383, + "loss": 0.7969, + "num_input_tokens_seen": 23817672, + "step": 41045 + }, + { + "epoch": 6.1140899612749475, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02629666570059646, + "loss": 0.7893, + "num_input_tokens_seen": 23820584, + "step": 41050 + }, + { + "epoch": 6.114834673815907, + "grad_norm": 0.053955078125, + "learning_rate": 0.02629538294138185, + "loss": 0.8061, + "num_input_tokens_seen": 23823496, + "step": 41055 + }, + { + "epoch": 6.115579386356866, + "grad_norm": 0.0303955078125, + "learning_rate": 0.02629409999134322, + "loss": 0.8023, + "num_input_tokens_seen": 23826472, + "step": 41060 + }, + { + "epoch": 6.116324098897826, + "grad_norm": 0.02001953125, + "learning_rate": 0.026292816850502252, + "loss": 0.7999, + "num_input_tokens_seen": 23829480, + "step": 41065 + }, + { + "epoch": 6.117068811438784, + "grad_norm": 0.018310546875, + "learning_rate": 0.02629153351888062, + "loss": 0.791, + "num_input_tokens_seen": 23831976, + "step": 41070 + }, + { + "epoch": 6.117813523979744, + "grad_norm": 0.0267333984375, + "learning_rate": 0.02629024999650001, + "loss": 0.8366, + "num_input_tokens_seen": 23834824, + "step": 41075 + }, + { + "epoch": 6.118558236520703, + "grad_norm": 0.01556396484375, + "learning_rate": 0.026288966283382096, + "loss": 0.7868, + "num_input_tokens_seen": 23837960, + "step": 41080 + }, + { + "epoch": 6.119302949061662, + "grad_norm": 0.02685546875, + "learning_rate": 0.026287682379548576, + "loss": 0.7982, + "num_input_tokens_seen": 23841000, + "step": 41085 + }, + { + "epoch": 6.120047661602621, + "grad_norm": 0.0198974609375, + "learning_rate": 0.026286398285021127, + "loss": 0.7857, + "num_input_tokens_seen": 23843720, + "step": 41090 + }, + { + "epoch": 6.120792374143581, + "grad_norm": 0.031982421875, + "learning_rate": 0.026285113999821454, + "loss": 0.8144, + "num_input_tokens_seen": 23846888, + "step": 41095 + }, + { + "epoch": 6.1215370866845396, + "grad_norm": 0.0218505859375, + "learning_rate": 0.026283829523971253, + "loss": 0.7944, + "num_input_tokens_seen": 23849640, + "step": 41100 + }, + { + "epoch": 6.122281799225499, + "grad_norm": 0.0189208984375, + "learning_rate": 0.026282544857492214, + "loss": 0.7914, + "num_input_tokens_seen": 23852808, + "step": 41105 + }, + { + "epoch": 6.123026511766458, + "grad_norm": 0.037841796875, + "learning_rate": 0.026281260000406047, + "loss": 0.7787, + "num_input_tokens_seen": 23855784, + "step": 41110 + }, + { + "epoch": 6.123771224307418, + "grad_norm": 0.0189208984375, + "learning_rate": 0.026279974952734465, + "loss": 0.8246, + "num_input_tokens_seen": 23858568, + "step": 41115 + }, + { + "epoch": 6.124515936848376, + "grad_norm": 0.016845703125, + "learning_rate": 0.026278689714499166, + "loss": 0.7989, + "num_input_tokens_seen": 23861224, + "step": 41120 + }, + { + "epoch": 6.125260649389336, + "grad_norm": 0.013427734375, + "learning_rate": 0.026277404285721867, + "loss": 0.8099, + "num_input_tokens_seen": 23864264, + "step": 41125 + }, + { + "epoch": 6.126005361930295, + "grad_norm": 0.0224609375, + "learning_rate": 0.02627611866642428, + "loss": 0.8114, + "num_input_tokens_seen": 23866952, + "step": 41130 + }, + { + "epoch": 6.126750074471254, + "grad_norm": 0.032958984375, + "learning_rate": 0.02627483285662814, + "loss": 0.7979, + "num_input_tokens_seen": 23870056, + "step": 41135 + }, + { + "epoch": 6.127494787012213, + "grad_norm": 0.0274658203125, + "learning_rate": 0.02627354685635515, + "loss": 0.8098, + "num_input_tokens_seen": 23873032, + "step": 41140 + }, + { + "epoch": 6.128239499553173, + "grad_norm": 0.0274658203125, + "learning_rate": 0.026272260665627045, + "loss": 0.7934, + "num_input_tokens_seen": 23875880, + "step": 41145 + }, + { + "epoch": 6.128984212094132, + "grad_norm": 0.0194091796875, + "learning_rate": 0.026270974284465554, + "loss": 0.8088, + "num_input_tokens_seen": 23878600, + "step": 41150 + }, + { + "epoch": 6.129728924635091, + "grad_norm": 0.0262451171875, + "learning_rate": 0.02626968771289241, + "loss": 0.81, + "num_input_tokens_seen": 23881544, + "step": 41155 + }, + { + "epoch": 6.13047363717605, + "grad_norm": 0.020751953125, + "learning_rate": 0.026268400950929343, + "loss": 0.8108, + "num_input_tokens_seen": 23884456, + "step": 41160 + }, + { + "epoch": 6.13121834971701, + "grad_norm": 0.01239013671875, + "learning_rate": 0.02626711399859809, + "loss": 0.7999, + "num_input_tokens_seen": 23887464, + "step": 41165 + }, + { + "epoch": 6.131963062257968, + "grad_norm": 0.0157470703125, + "learning_rate": 0.026265826855920403, + "loss": 0.797, + "num_input_tokens_seen": 23890280, + "step": 41170 + }, + { + "epoch": 6.132707774798928, + "grad_norm": 0.022216796875, + "learning_rate": 0.02626453952291802, + "loss": 0.793, + "num_input_tokens_seen": 23893768, + "step": 41175 + }, + { + "epoch": 6.133452487339887, + "grad_norm": 0.02734375, + "learning_rate": 0.026263251999612696, + "loss": 0.7975, + "num_input_tokens_seen": 23896616, + "step": 41180 + }, + { + "epoch": 6.134197199880846, + "grad_norm": 0.0247802734375, + "learning_rate": 0.02626196428602617, + "loss": 0.7967, + "num_input_tokens_seen": 23899720, + "step": 41185 + }, + { + "epoch": 6.134941912421805, + "grad_norm": 0.0213623046875, + "learning_rate": 0.026260676382180204, + "loss": 0.7757, + "num_input_tokens_seen": 23902856, + "step": 41190 + }, + { + "epoch": 6.135686624962764, + "grad_norm": 0.01519775390625, + "learning_rate": 0.02625938828809656, + "loss": 0.7943, + "num_input_tokens_seen": 23905736, + "step": 41195 + }, + { + "epoch": 6.136431337503724, + "grad_norm": 0.025390625, + "learning_rate": 0.026258100003796987, + "loss": 0.8121, + "num_input_tokens_seen": 23908680, + "step": 41200 + }, + { + "epoch": 6.137176050044682, + "grad_norm": 0.0272216796875, + "learning_rate": 0.026256811529303262, + "loss": 0.7824, + "num_input_tokens_seen": 23911368, + "step": 41205 + }, + { + "epoch": 6.137920762585642, + "grad_norm": 0.0235595703125, + "learning_rate": 0.026255522864637146, + "loss": 0.784, + "num_input_tokens_seen": 23914312, + "step": 41210 + }, + { + "epoch": 6.138665475126601, + "grad_norm": 0.026611328125, + "learning_rate": 0.026254234009820413, + "loss": 0.7605, + "num_input_tokens_seen": 23917128, + "step": 41215 + }, + { + "epoch": 6.13941018766756, + "grad_norm": 0.0322265625, + "learning_rate": 0.026252944964874837, + "loss": 0.7658, + "num_input_tokens_seen": 23919624, + "step": 41220 + }, + { + "epoch": 6.140154900208519, + "grad_norm": 0.0244140625, + "learning_rate": 0.026251655729822184, + "loss": 0.802, + "num_input_tokens_seen": 23922504, + "step": 41225 + }, + { + "epoch": 6.140899612749479, + "grad_norm": 0.025390625, + "learning_rate": 0.026250366304684244, + "loss": 0.8235, + "num_input_tokens_seen": 23925256, + "step": 41230 + }, + { + "epoch": 6.141644325290438, + "grad_norm": 0.0205078125, + "learning_rate": 0.026249076689482807, + "loss": 0.802, + "num_input_tokens_seen": 23928072, + "step": 41235 + }, + { + "epoch": 6.142389037831397, + "grad_norm": 0.0240478515625, + "learning_rate": 0.026247786884239645, + "loss": 0.79, + "num_input_tokens_seen": 23931144, + "step": 41240 + }, + { + "epoch": 6.143133750372356, + "grad_norm": 0.0203857421875, + "learning_rate": 0.026246496888976564, + "loss": 0.8375, + "num_input_tokens_seen": 23933736, + "step": 41245 + }, + { + "epoch": 6.143878462913316, + "grad_norm": 0.0223388671875, + "learning_rate": 0.026245206703715343, + "loss": 0.8046, + "num_input_tokens_seen": 23936712, + "step": 41250 + }, + { + "epoch": 6.144623175454274, + "grad_norm": 0.01513671875, + "learning_rate": 0.026243916328477782, + "loss": 0.7956, + "num_input_tokens_seen": 23939784, + "step": 41255 + }, + { + "epoch": 6.145367887995234, + "grad_norm": 0.0230712890625, + "learning_rate": 0.026242625763285687, + "loss": 0.8074, + "num_input_tokens_seen": 23942664, + "step": 41260 + }, + { + "epoch": 6.146112600536193, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02624133500816085, + "loss": 0.7898, + "num_input_tokens_seen": 23945608, + "step": 41265 + }, + { + "epoch": 6.146857313077152, + "grad_norm": 0.0277099609375, + "learning_rate": 0.02624004406312509, + "loss": 0.8363, + "num_input_tokens_seen": 23948520, + "step": 41270 + }, + { + "epoch": 6.147602025618111, + "grad_norm": 0.0208740234375, + "learning_rate": 0.02623875292820021, + "loss": 0.8222, + "num_input_tokens_seen": 23951912, + "step": 41275 + }, + { + "epoch": 6.148346738159071, + "grad_norm": 0.03369140625, + "learning_rate": 0.026237461603408015, + "loss": 0.7835, + "num_input_tokens_seen": 23954696, + "step": 41280 + }, + { + "epoch": 6.14909145070003, + "grad_norm": 0.02197265625, + "learning_rate": 0.026236170088770328, + "loss": 0.8035, + "num_input_tokens_seen": 23957448, + "step": 41285 + }, + { + "epoch": 6.149836163240989, + "grad_norm": 0.024169921875, + "learning_rate": 0.02623487838430897, + "loss": 0.7963, + "num_input_tokens_seen": 23960360, + "step": 41290 + }, + { + "epoch": 6.150580875781948, + "grad_norm": 0.0203857421875, + "learning_rate": 0.026233586490045764, + "loss": 0.8198, + "num_input_tokens_seen": 23963176, + "step": 41295 + }, + { + "epoch": 6.151325588322908, + "grad_norm": 0.0302734375, + "learning_rate": 0.026232294406002526, + "loss": 0.8223, + "num_input_tokens_seen": 23965896, + "step": 41300 + }, + { + "epoch": 6.152070300863866, + "grad_norm": 0.021484375, + "learning_rate": 0.02623100213220109, + "loss": 0.7915, + "num_input_tokens_seen": 23968840, + "step": 41305 + }, + { + "epoch": 6.152815013404826, + "grad_norm": 0.0250244140625, + "learning_rate": 0.02622970966866329, + "loss": 0.8227, + "num_input_tokens_seen": 23971784, + "step": 41310 + }, + { + "epoch": 6.153559725945785, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02622841701541096, + "loss": 0.7912, + "num_input_tokens_seen": 23974792, + "step": 41315 + }, + { + "epoch": 6.1543044384867445, + "grad_norm": 0.021728515625, + "learning_rate": 0.02622712417246594, + "loss": 0.8178, + "num_input_tokens_seen": 23977608, + "step": 41320 + }, + { + "epoch": 6.155049151027703, + "grad_norm": 0.01483154296875, + "learning_rate": 0.02622583113985006, + "loss": 0.8061, + "num_input_tokens_seen": 23980328, + "step": 41325 + }, + { + "epoch": 6.155793863568663, + "grad_norm": 0.0203857421875, + "learning_rate": 0.026224537917585174, + "loss": 0.7892, + "num_input_tokens_seen": 23982984, + "step": 41330 + }, + { + "epoch": 6.156538576109622, + "grad_norm": 0.0216064453125, + "learning_rate": 0.026223244505693126, + "loss": 0.7902, + "num_input_tokens_seen": 23986024, + "step": 41335 + }, + { + "epoch": 6.157283288650581, + "grad_norm": 0.01434326171875, + "learning_rate": 0.026221950904195775, + "loss": 0.8208, + "num_input_tokens_seen": 23988808, + "step": 41340 + }, + { + "epoch": 6.15802800119154, + "grad_norm": 0.033447265625, + "learning_rate": 0.026220657113114963, + "loss": 0.7858, + "num_input_tokens_seen": 23991528, + "step": 41345 + }, + { + "epoch": 6.1587727137325, + "grad_norm": 0.0220947265625, + "learning_rate": 0.026219363132472556, + "loss": 0.8018, + "num_input_tokens_seen": 23994440, + "step": 41350 + }, + { + "epoch": 6.159517426273458, + "grad_norm": 0.019287109375, + "learning_rate": 0.026218068962290415, + "loss": 0.7982, + "num_input_tokens_seen": 23997320, + "step": 41355 + }, + { + "epoch": 6.160262138814417, + "grad_norm": 0.03076171875, + "learning_rate": 0.026216774602590392, + "loss": 0.8069, + "num_input_tokens_seen": 24000296, + "step": 41360 + }, + { + "epoch": 6.161006851355377, + "grad_norm": 0.041015625, + "learning_rate": 0.02621548005339437, + "loss": 0.8281, + "num_input_tokens_seen": 24003144, + "step": 41365 + }, + { + "epoch": 6.161751563896336, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02621418531472421, + "loss": 0.8107, + "num_input_tokens_seen": 24006376, + "step": 41370 + }, + { + "epoch": 6.162496276437295, + "grad_norm": 0.023193359375, + "learning_rate": 0.026212890386601784, + "loss": 0.7809, + "num_input_tokens_seen": 24009032, + "step": 41375 + }, + { + "epoch": 6.163240988978254, + "grad_norm": 0.0294189453125, + "learning_rate": 0.02621159526904897, + "loss": 0.7928, + "num_input_tokens_seen": 24011944, + "step": 41380 + }, + { + "epoch": 6.163985701519214, + "grad_norm": 0.0279541015625, + "learning_rate": 0.026210299962087655, + "loss": 0.7973, + "num_input_tokens_seen": 24014760, + "step": 41385 + }, + { + "epoch": 6.164730414060172, + "grad_norm": 0.01336669921875, + "learning_rate": 0.02620900446573971, + "loss": 0.8016, + "num_input_tokens_seen": 24017864, + "step": 41390 + }, + { + "epoch": 6.165475126601132, + "grad_norm": 0.0322265625, + "learning_rate": 0.026207708780027032, + "loss": 0.8193, + "num_input_tokens_seen": 24020744, + "step": 41395 + }, + { + "epoch": 6.166219839142091, + "grad_norm": 0.013916015625, + "learning_rate": 0.026206412904971506, + "loss": 0.8006, + "num_input_tokens_seen": 24023400, + "step": 41400 + }, + { + "epoch": 6.1669645516830505, + "grad_norm": 0.01239013671875, + "learning_rate": 0.026205116840595018, + "loss": 0.8221, + "num_input_tokens_seen": 24026312, + "step": 41405 + }, + { + "epoch": 6.167709264224009, + "grad_norm": 0.03369140625, + "learning_rate": 0.026203820586919467, + "loss": 0.8132, + "num_input_tokens_seen": 24029160, + "step": 41410 + }, + { + "epoch": 6.168453976764969, + "grad_norm": 0.021484375, + "learning_rate": 0.02620252414396676, + "loss": 0.8061, + "num_input_tokens_seen": 24032392, + "step": 41415 + }, + { + "epoch": 6.169198689305928, + "grad_norm": 0.0203857421875, + "learning_rate": 0.026201227511758792, + "loss": 0.8137, + "num_input_tokens_seen": 24035784, + "step": 41420 + }, + { + "epoch": 6.169943401846887, + "grad_norm": 0.0186767578125, + "learning_rate": 0.02619993069031747, + "loss": 0.8083, + "num_input_tokens_seen": 24038696, + "step": 41425 + }, + { + "epoch": 6.170688114387846, + "grad_norm": 0.0213623046875, + "learning_rate": 0.026198633679664702, + "loss": 0.8107, + "num_input_tokens_seen": 24041672, + "step": 41430 + }, + { + "epoch": 6.171432826928806, + "grad_norm": 0.01806640625, + "learning_rate": 0.026197336479822402, + "loss": 0.8211, + "num_input_tokens_seen": 24044456, + "step": 41435 + }, + { + "epoch": 6.172177539469764, + "grad_norm": 0.01116943359375, + "learning_rate": 0.02619603909081248, + "loss": 0.8009, + "num_input_tokens_seen": 24047240, + "step": 41440 + }, + { + "epoch": 6.172922252010724, + "grad_norm": 0.020263671875, + "learning_rate": 0.026194741512656855, + "loss": 0.7924, + "num_input_tokens_seen": 24050248, + "step": 41445 + }, + { + "epoch": 6.173666964551683, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02619344374537745, + "loss": 0.7927, + "num_input_tokens_seen": 24053128, + "step": 41450 + }, + { + "epoch": 6.1744116770926425, + "grad_norm": 0.05078125, + "learning_rate": 0.026192145788996195, + "loss": 0.8469, + "num_input_tokens_seen": 24055976, + "step": 41455 + }, + { + "epoch": 6.175156389633601, + "grad_norm": 0.0140380859375, + "learning_rate": 0.02619084764353501, + "loss": 0.8059, + "num_input_tokens_seen": 24059176, + "step": 41460 + }, + { + "epoch": 6.175901102174561, + "grad_norm": 0.0218505859375, + "learning_rate": 0.026189549309015823, + "loss": 0.8202, + "num_input_tokens_seen": 24062056, + "step": 41465 + }, + { + "epoch": 6.17664581471552, + "grad_norm": 0.0255126953125, + "learning_rate": 0.026188250785460578, + "loss": 0.8253, + "num_input_tokens_seen": 24064616, + "step": 41470 + }, + { + "epoch": 6.177390527256479, + "grad_norm": 0.018798828125, + "learning_rate": 0.026186952072891203, + "loss": 0.8115, + "num_input_tokens_seen": 24067848, + "step": 41475 + }, + { + "epoch": 6.178135239797438, + "grad_norm": 0.0186767578125, + "learning_rate": 0.02618565317132964, + "loss": 0.8021, + "num_input_tokens_seen": 24070536, + "step": 41480 + }, + { + "epoch": 6.178879952338398, + "grad_norm": 0.0277099609375, + "learning_rate": 0.026184354080797844, + "loss": 0.8076, + "num_input_tokens_seen": 24073320, + "step": 41485 + }, + { + "epoch": 6.1796246648793565, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02618305480131775, + "loss": 0.792, + "num_input_tokens_seen": 24076232, + "step": 41490 + }, + { + "epoch": 6.180369377420316, + "grad_norm": 0.022216796875, + "learning_rate": 0.02618175533291131, + "loss": 0.7946, + "num_input_tokens_seen": 24079112, + "step": 41495 + }, + { + "epoch": 6.181114089961275, + "grad_norm": 0.0184326171875, + "learning_rate": 0.02618045567560048, + "loss": 0.8002, + "num_input_tokens_seen": 24082120, + "step": 41500 + }, + { + "epoch": 6.1818588025022345, + "grad_norm": 0.0203857421875, + "learning_rate": 0.026179155829407212, + "loss": 0.8028, + "num_input_tokens_seen": 24084936, + "step": 41505 + }, + { + "epoch": 6.182603515043193, + "grad_norm": 0.0147705078125, + "learning_rate": 0.02617785579435347, + "loss": 0.8036, + "num_input_tokens_seen": 24087944, + "step": 41510 + }, + { + "epoch": 6.183348227584153, + "grad_norm": 0.0196533203125, + "learning_rate": 0.026176555570461217, + "loss": 0.8086, + "num_input_tokens_seen": 24090632, + "step": 41515 + }, + { + "epoch": 6.184092940125112, + "grad_norm": 0.0272216796875, + "learning_rate": 0.026175255157752413, + "loss": 0.8122, + "num_input_tokens_seen": 24093736, + "step": 41520 + }, + { + "epoch": 6.18483765266607, + "grad_norm": 0.0205078125, + "learning_rate": 0.026173954556249032, + "loss": 0.7961, + "num_input_tokens_seen": 24096744, + "step": 41525 + }, + { + "epoch": 6.18558236520703, + "grad_norm": 0.0250244140625, + "learning_rate": 0.02617265376597305, + "loss": 0.8081, + "num_input_tokens_seen": 24099624, + "step": 41530 + }, + { + "epoch": 6.18632707774799, + "grad_norm": 0.0250244140625, + "learning_rate": 0.02617135278694643, + "loss": 0.803, + "num_input_tokens_seen": 24102600, + "step": 41535 + }, + { + "epoch": 6.1870717902889485, + "grad_norm": 0.022216796875, + "learning_rate": 0.026170051619191163, + "loss": 0.7954, + "num_input_tokens_seen": 24105640, + "step": 41540 + }, + { + "epoch": 6.187816502829907, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02616875026272923, + "loss": 0.8039, + "num_input_tokens_seen": 24108392, + "step": 41545 + }, + { + "epoch": 6.188561215370867, + "grad_norm": 0.02197265625, + "learning_rate": 0.026167448717582604, + "loss": 0.804, + "num_input_tokens_seen": 24111304, + "step": 41550 + }, + { + "epoch": 6.189305927911826, + "grad_norm": 0.01239013671875, + "learning_rate": 0.02616614698377329, + "loss": 0.81, + "num_input_tokens_seen": 24114280, + "step": 41555 + }, + { + "epoch": 6.190050640452785, + "grad_norm": 0.0147705078125, + "learning_rate": 0.02616484506132327, + "loss": 0.8119, + "num_input_tokens_seen": 24117448, + "step": 41560 + }, + { + "epoch": 6.190795352993744, + "grad_norm": 0.021484375, + "learning_rate": 0.02616354295025454, + "loss": 0.8288, + "num_input_tokens_seen": 24120040, + "step": 41565 + }, + { + "epoch": 6.191540065534704, + "grad_norm": 0.0286865234375, + "learning_rate": 0.026162240650589096, + "loss": 0.7909, + "num_input_tokens_seen": 24123336, + "step": 41570 + }, + { + "epoch": 6.1922847780756625, + "grad_norm": 0.0284423828125, + "learning_rate": 0.02616093816234894, + "loss": 0.8119, + "num_input_tokens_seen": 24125896, + "step": 41575 + }, + { + "epoch": 6.193029490616622, + "grad_norm": 0.027099609375, + "learning_rate": 0.02615963548555608, + "loss": 0.7957, + "num_input_tokens_seen": 24129032, + "step": 41580 + }, + { + "epoch": 6.193774203157581, + "grad_norm": 0.04052734375, + "learning_rate": 0.02615833262023252, + "loss": 0.794, + "num_input_tokens_seen": 24132072, + "step": 41585 + }, + { + "epoch": 6.1945189156985405, + "grad_norm": 0.0198974609375, + "learning_rate": 0.026157029566400272, + "loss": 0.8035, + "num_input_tokens_seen": 24134792, + "step": 41590 + }, + { + "epoch": 6.195263628239499, + "grad_norm": 0.0257568359375, + "learning_rate": 0.02615572632408135, + "loss": 0.8035, + "num_input_tokens_seen": 24137416, + "step": 41595 + }, + { + "epoch": 6.196008340780459, + "grad_norm": 0.0230712890625, + "learning_rate": 0.026154422893297766, + "loss": 0.7977, + "num_input_tokens_seen": 24140264, + "step": 41600 + }, + { + "epoch": 6.196753053321418, + "grad_norm": 0.024658203125, + "learning_rate": 0.026153119274071546, + "loss": 0.8064, + "num_input_tokens_seen": 24143624, + "step": 41605 + }, + { + "epoch": 6.197497765862377, + "grad_norm": 0.0245361328125, + "learning_rate": 0.02615181546642471, + "loss": 0.8113, + "num_input_tokens_seen": 24146696, + "step": 41610 + }, + { + "epoch": 6.198242478403336, + "grad_norm": 0.01318359375, + "learning_rate": 0.026150511470379285, + "loss": 0.8094, + "num_input_tokens_seen": 24149448, + "step": 41615 + }, + { + "epoch": 6.198987190944296, + "grad_norm": 0.0294189453125, + "learning_rate": 0.02614920728595731, + "loss": 0.8063, + "num_input_tokens_seen": 24152232, + "step": 41620 + }, + { + "epoch": 6.1997319034852545, + "grad_norm": 0.024169921875, + "learning_rate": 0.0261479029131808, + "loss": 0.8035, + "num_input_tokens_seen": 24154984, + "step": 41625 + }, + { + "epoch": 6.200476616026214, + "grad_norm": 0.0218505859375, + "learning_rate": 0.026146598352071802, + "loss": 0.7954, + "num_input_tokens_seen": 24157800, + "step": 41630 + }, + { + "epoch": 6.201221328567173, + "grad_norm": 0.01318359375, + "learning_rate": 0.026145293602652355, + "loss": 0.803, + "num_input_tokens_seen": 24160552, + "step": 41635 + }, + { + "epoch": 6.2019660411081325, + "grad_norm": 0.022705078125, + "learning_rate": 0.026143988664944502, + "loss": 0.8042, + "num_input_tokens_seen": 24163304, + "step": 41640 + }, + { + "epoch": 6.202710753649091, + "grad_norm": 0.0361328125, + "learning_rate": 0.026142683538970285, + "loss": 0.7987, + "num_input_tokens_seen": 24166600, + "step": 41645 + }, + { + "epoch": 6.203455466190051, + "grad_norm": 0.0230712890625, + "learning_rate": 0.026141378224751755, + "loss": 0.7954, + "num_input_tokens_seen": 24169704, + "step": 41650 + }, + { + "epoch": 6.20420017873101, + "grad_norm": 0.017578125, + "learning_rate": 0.026140072722310965, + "loss": 0.8183, + "num_input_tokens_seen": 24172392, + "step": 41655 + }, + { + "epoch": 6.204944891271969, + "grad_norm": 0.023681640625, + "learning_rate": 0.026138767031669966, + "loss": 0.8042, + "num_input_tokens_seen": 24175656, + "step": 41660 + }, + { + "epoch": 6.205689603812928, + "grad_norm": 0.0252685546875, + "learning_rate": 0.02613746115285082, + "loss": 0.8056, + "num_input_tokens_seen": 24178760, + "step": 41665 + }, + { + "epoch": 6.206434316353888, + "grad_norm": 0.02392578125, + "learning_rate": 0.026136155085875586, + "loss": 0.7913, + "num_input_tokens_seen": 24181448, + "step": 41670 + }, + { + "epoch": 6.2071790288948465, + "grad_norm": 0.0162353515625, + "learning_rate": 0.026134848830766333, + "loss": 0.8012, + "num_input_tokens_seen": 24184456, + "step": 41675 + }, + { + "epoch": 6.207923741435806, + "grad_norm": 0.0184326171875, + "learning_rate": 0.026133542387545124, + "loss": 0.8171, + "num_input_tokens_seen": 24187400, + "step": 41680 + }, + { + "epoch": 6.208668453976765, + "grad_norm": 0.0308837890625, + "learning_rate": 0.026132235756234034, + "loss": 0.8071, + "num_input_tokens_seen": 24190440, + "step": 41685 + }, + { + "epoch": 6.209413166517725, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02613092893685513, + "loss": 0.8099, + "num_input_tokens_seen": 24193160, + "step": 41690 + }, + { + "epoch": 6.210157879058683, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0261296219294305, + "loss": 0.8112, + "num_input_tokens_seen": 24196008, + "step": 41695 + }, + { + "epoch": 6.210902591599643, + "grad_norm": 0.02978515625, + "learning_rate": 0.02612831473398222, + "loss": 0.805, + "num_input_tokens_seen": 24198792, + "step": 41700 + }, + { + "epoch": 6.211647304140602, + "grad_norm": 0.01324462890625, + "learning_rate": 0.026127007350532368, + "loss": 0.8068, + "num_input_tokens_seen": 24201832, + "step": 41705 + }, + { + "epoch": 6.2123920166815605, + "grad_norm": 0.01361083984375, + "learning_rate": 0.026125699779103038, + "loss": 0.8049, + "num_input_tokens_seen": 24204648, + "step": 41710 + }, + { + "epoch": 6.21313672922252, + "grad_norm": 0.020751953125, + "learning_rate": 0.02612439201971632, + "loss": 0.8031, + "num_input_tokens_seen": 24207656, + "step": 41715 + }, + { + "epoch": 6.213881441763479, + "grad_norm": 0.0245361328125, + "learning_rate": 0.026123084072394304, + "loss": 0.803, + "num_input_tokens_seen": 24210440, + "step": 41720 + }, + { + "epoch": 6.2146261543044385, + "grad_norm": 0.0216064453125, + "learning_rate": 0.026121775937159086, + "loss": 0.805, + "num_input_tokens_seen": 24213512, + "step": 41725 + }, + { + "epoch": 6.215370866845397, + "grad_norm": 0.0213623046875, + "learning_rate": 0.026120467614032767, + "loss": 0.8019, + "num_input_tokens_seen": 24216584, + "step": 41730 + }, + { + "epoch": 6.216115579386357, + "grad_norm": 0.024658203125, + "learning_rate": 0.026119159103037453, + "loss": 0.8106, + "num_input_tokens_seen": 24219560, + "step": 41735 + }, + { + "epoch": 6.216860291927316, + "grad_norm": 0.03271484375, + "learning_rate": 0.026117850404195245, + "loss": 0.814, + "num_input_tokens_seen": 24222216, + "step": 41740 + }, + { + "epoch": 6.217605004468275, + "grad_norm": 0.01611328125, + "learning_rate": 0.026116541517528253, + "loss": 0.802, + "num_input_tokens_seen": 24225064, + "step": 41745 + }, + { + "epoch": 6.218349717009234, + "grad_norm": 0.02880859375, + "learning_rate": 0.02611523244305859, + "loss": 0.7917, + "num_input_tokens_seen": 24228232, + "step": 41750 + }, + { + "epoch": 6.219094429550194, + "grad_norm": 0.01361083984375, + "learning_rate": 0.026113923180808372, + "loss": 0.8183, + "num_input_tokens_seen": 24231208, + "step": 41755 + }, + { + "epoch": 6.2198391420911525, + "grad_norm": 0.0211181640625, + "learning_rate": 0.02611261373079972, + "loss": 0.8075, + "num_input_tokens_seen": 24233800, + "step": 41760 + }, + { + "epoch": 6.220583854632112, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02611130409305475, + "loss": 0.8031, + "num_input_tokens_seen": 24236840, + "step": 41765 + }, + { + "epoch": 6.221328567173071, + "grad_norm": 0.037841796875, + "learning_rate": 0.026109994267595596, + "loss": 0.7975, + "num_input_tokens_seen": 24239688, + "step": 41770 + }, + { + "epoch": 6.222073279714031, + "grad_norm": 0.025146484375, + "learning_rate": 0.026108684254444372, + "loss": 0.8069, + "num_input_tokens_seen": 24242696, + "step": 41775 + }, + { + "epoch": 6.222817992254989, + "grad_norm": 0.02099609375, + "learning_rate": 0.026107374053623227, + "loss": 0.7963, + "num_input_tokens_seen": 24245576, + "step": 41780 + }, + { + "epoch": 6.223562704795949, + "grad_norm": 0.01507568359375, + "learning_rate": 0.026106063665154285, + "loss": 0.7923, + "num_input_tokens_seen": 24248712, + "step": 41785 + }, + { + "epoch": 6.224307417336908, + "grad_norm": 0.01416015625, + "learning_rate": 0.026104753089059678, + "loss": 0.816, + "num_input_tokens_seen": 24251624, + "step": 41790 + }, + { + "epoch": 6.225052129877867, + "grad_norm": 0.0279541015625, + "learning_rate": 0.02610344232536156, + "loss": 0.7859, + "num_input_tokens_seen": 24254600, + "step": 41795 + }, + { + "epoch": 6.225796842418826, + "grad_norm": 0.0267333984375, + "learning_rate": 0.026102131374082068, + "loss": 0.8135, + "num_input_tokens_seen": 24257512, + "step": 41800 + }, + { + "epoch": 6.226541554959786, + "grad_norm": 0.0263671875, + "learning_rate": 0.02610082023524335, + "loss": 0.8173, + "num_input_tokens_seen": 24260296, + "step": 41805 + }, + { + "epoch": 6.2272862675007445, + "grad_norm": 0.0223388671875, + "learning_rate": 0.026099508908867555, + "loss": 0.8007, + "num_input_tokens_seen": 24263080, + "step": 41810 + }, + { + "epoch": 6.228030980041704, + "grad_norm": 0.01348876953125, + "learning_rate": 0.02609819739497684, + "loss": 0.8069, + "num_input_tokens_seen": 24265928, + "step": 41815 + }, + { + "epoch": 6.228775692582663, + "grad_norm": 0.031494140625, + "learning_rate": 0.026096885693593357, + "loss": 0.8044, + "num_input_tokens_seen": 24269128, + "step": 41820 + }, + { + "epoch": 6.229520405123623, + "grad_norm": 0.0159912109375, + "learning_rate": 0.02609557380473927, + "loss": 0.8118, + "num_input_tokens_seen": 24271912, + "step": 41825 + }, + { + "epoch": 6.230265117664581, + "grad_norm": 0.0498046875, + "learning_rate": 0.02609426172843674, + "loss": 0.8011, + "num_input_tokens_seen": 24275656, + "step": 41830 + }, + { + "epoch": 6.231009830205541, + "grad_norm": 0.0211181640625, + "learning_rate": 0.026092949464707936, + "loss": 0.807, + "num_input_tokens_seen": 24278248, + "step": 41835 + }, + { + "epoch": 6.2317545427465, + "grad_norm": 0.04638671875, + "learning_rate": 0.026091637013575024, + "loss": 0.8, + "num_input_tokens_seen": 24281288, + "step": 41840 + }, + { + "epoch": 6.232499255287459, + "grad_norm": 0.027099609375, + "learning_rate": 0.026090324375060175, + "loss": 0.7979, + "num_input_tokens_seen": 24284040, + "step": 41845 + }, + { + "epoch": 6.233243967828418, + "grad_norm": 0.0177001953125, + "learning_rate": 0.02608901154918557, + "loss": 0.8075, + "num_input_tokens_seen": 24287144, + "step": 41850 + }, + { + "epoch": 6.233988680369378, + "grad_norm": 0.0194091796875, + "learning_rate": 0.026087698535973387, + "loss": 0.8086, + "num_input_tokens_seen": 24290120, + "step": 41855 + }, + { + "epoch": 6.234733392910337, + "grad_norm": 0.03173828125, + "learning_rate": 0.026086385335445807, + "loss": 0.8069, + "num_input_tokens_seen": 24293096, + "step": 41860 + }, + { + "epoch": 6.235478105451296, + "grad_norm": 0.026123046875, + "learning_rate": 0.026085071947625008, + "loss": 0.8083, + "num_input_tokens_seen": 24295944, + "step": 41865 + }, + { + "epoch": 6.236222817992255, + "grad_norm": 0.02783203125, + "learning_rate": 0.02608375837253319, + "loss": 0.7958, + "num_input_tokens_seen": 24298952, + "step": 41870 + }, + { + "epoch": 6.236967530533214, + "grad_norm": 0.036865234375, + "learning_rate": 0.02608244461019254, + "loss": 0.8112, + "num_input_tokens_seen": 24301768, + "step": 41875 + }, + { + "epoch": 6.237712243074173, + "grad_norm": 0.021240234375, + "learning_rate": 0.02608113066062525, + "loss": 0.8102, + "num_input_tokens_seen": 24304776, + "step": 41880 + }, + { + "epoch": 6.238456955615132, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02607981652385352, + "loss": 0.7957, + "num_input_tokens_seen": 24307784, + "step": 41885 + }, + { + "epoch": 6.239201668156092, + "grad_norm": 0.0281982421875, + "learning_rate": 0.02607850219989955, + "loss": 0.7996, + "num_input_tokens_seen": 24310856, + "step": 41890 + }, + { + "epoch": 6.2399463806970505, + "grad_norm": 0.024169921875, + "learning_rate": 0.026077187688785544, + "loss": 0.7979, + "num_input_tokens_seen": 24314088, + "step": 41895 + }, + { + "epoch": 6.24069109323801, + "grad_norm": 0.01904296875, + "learning_rate": 0.026075872990533714, + "loss": 0.8006, + "num_input_tokens_seen": 24317192, + "step": 41900 + }, + { + "epoch": 6.241435805778969, + "grad_norm": 0.0189208984375, + "learning_rate": 0.026074558105166262, + "loss": 0.8026, + "num_input_tokens_seen": 24319848, + "step": 41905 + }, + { + "epoch": 6.242180518319929, + "grad_norm": 0.021728515625, + "learning_rate": 0.02607324303270541, + "loss": 0.8011, + "num_input_tokens_seen": 24322824, + "step": 41910 + }, + { + "epoch": 6.242925230860887, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02607192777317337, + "loss": 0.8037, + "num_input_tokens_seen": 24325736, + "step": 41915 + }, + { + "epoch": 6.243669943401847, + "grad_norm": 0.012451171875, + "learning_rate": 0.026070612326592368, + "loss": 0.7955, + "num_input_tokens_seen": 24328360, + "step": 41920 + }, + { + "epoch": 6.244414655942806, + "grad_norm": 0.0306396484375, + "learning_rate": 0.026069296692984616, + "loss": 0.819, + "num_input_tokens_seen": 24331848, + "step": 41925 + }, + { + "epoch": 6.245159368483765, + "grad_norm": 0.0255126953125, + "learning_rate": 0.02606798087237235, + "loss": 0.7794, + "num_input_tokens_seen": 24334664, + "step": 41930 + }, + { + "epoch": 6.245904081024724, + "grad_norm": 0.01953125, + "learning_rate": 0.02606666486477779, + "loss": 0.7984, + "num_input_tokens_seen": 24337640, + "step": 41935 + }, + { + "epoch": 6.246648793565684, + "grad_norm": 0.0225830078125, + "learning_rate": 0.026065348670223188, + "loss": 0.8048, + "num_input_tokens_seen": 24340712, + "step": 41940 + }, + { + "epoch": 6.247393506106643, + "grad_norm": 0.0179443359375, + "learning_rate": 0.026064032288730755, + "loss": 0.8164, + "num_input_tokens_seen": 24343560, + "step": 41945 + }, + { + "epoch": 6.248138218647602, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02606271572032275, + "loss": 0.8009, + "num_input_tokens_seen": 24346568, + "step": 41950 + }, + { + "epoch": 6.248882931188561, + "grad_norm": 0.01336669921875, + "learning_rate": 0.026061398965021396, + "loss": 0.8083, + "num_input_tokens_seen": 24349256, + "step": 41955 + }, + { + "epoch": 6.249627643729521, + "grad_norm": 0.01202392578125, + "learning_rate": 0.026060082022848957, + "loss": 0.8313, + "num_input_tokens_seen": 24352040, + "step": 41960 + }, + { + "epoch": 6.250372356270479, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02605876489382767, + "loss": 0.8106, + "num_input_tokens_seen": 24355176, + "step": 41965 + }, + { + "epoch": 6.251117068811439, + "grad_norm": 0.0400390625, + "learning_rate": 0.02605744757797979, + "loss": 0.8198, + "num_input_tokens_seen": 24358056, + "step": 41970 + }, + { + "epoch": 6.251861781352398, + "grad_norm": 0.023681640625, + "learning_rate": 0.02605613007532757, + "loss": 0.8085, + "num_input_tokens_seen": 24361192, + "step": 41975 + }, + { + "epoch": 6.252606493893357, + "grad_norm": 0.0233154296875, + "learning_rate": 0.026054812385893272, + "loss": 0.8168, + "num_input_tokens_seen": 24364168, + "step": 41980 + }, + { + "epoch": 6.253351206434316, + "grad_norm": 0.03271484375, + "learning_rate": 0.026053494509699154, + "loss": 0.8042, + "num_input_tokens_seen": 24367144, + "step": 41985 + }, + { + "epoch": 6.254095918975276, + "grad_norm": 0.0179443359375, + "learning_rate": 0.026052176446767477, + "loss": 0.7915, + "num_input_tokens_seen": 24369992, + "step": 41990 + }, + { + "epoch": 6.254840631516235, + "grad_norm": 0.0247802734375, + "learning_rate": 0.026050858197120514, + "loss": 0.7916, + "num_input_tokens_seen": 24373480, + "step": 41995 + }, + { + "epoch": 6.255585344057194, + "grad_norm": 0.01953125, + "learning_rate": 0.02604953976078054, + "loss": 0.7967, + "num_input_tokens_seen": 24376520, + "step": 42000 + }, + { + "epoch": 6.256330056598153, + "grad_norm": 0.01904296875, + "learning_rate": 0.026048221137769813, + "loss": 0.8051, + "num_input_tokens_seen": 24379752, + "step": 42005 + }, + { + "epoch": 6.257074769139113, + "grad_norm": 0.014892578125, + "learning_rate": 0.026046902328110622, + "loss": 0.7994, + "num_input_tokens_seen": 24382728, + "step": 42010 + }, + { + "epoch": 6.257819481680071, + "grad_norm": 0.01312255859375, + "learning_rate": 0.026045583331825245, + "loss": 0.801, + "num_input_tokens_seen": 24385704, + "step": 42015 + }, + { + "epoch": 6.258564194221031, + "grad_norm": 0.012451171875, + "learning_rate": 0.026044264148935962, + "loss": 0.7875, + "num_input_tokens_seen": 24388232, + "step": 42020 + }, + { + "epoch": 6.25930890676199, + "grad_norm": 0.01153564453125, + "learning_rate": 0.026042944779465064, + "loss": 0.7961, + "num_input_tokens_seen": 24391112, + "step": 42025 + }, + { + "epoch": 6.2600536193029495, + "grad_norm": 0.02001953125, + "learning_rate": 0.026041625223434832, + "loss": 0.7959, + "num_input_tokens_seen": 24393928, + "step": 42030 + }, + { + "epoch": 6.260798331843908, + "grad_norm": 0.0218505859375, + "learning_rate": 0.026040305480867568, + "loss": 0.7959, + "num_input_tokens_seen": 24397000, + "step": 42035 + }, + { + "epoch": 6.261543044384867, + "grad_norm": 0.0206298828125, + "learning_rate": 0.026038985551785566, + "loss": 0.788, + "num_input_tokens_seen": 24399880, + "step": 42040 + }, + { + "epoch": 6.262287756925827, + "grad_norm": 0.02294921875, + "learning_rate": 0.026037665436211117, + "loss": 0.8134, + "num_input_tokens_seen": 24402664, + "step": 42045 + }, + { + "epoch": 6.263032469466786, + "grad_norm": 0.0142822265625, + "learning_rate": 0.026036345134166528, + "loss": 0.8099, + "num_input_tokens_seen": 24405640, + "step": 42050 + }, + { + "epoch": 6.263777182007745, + "grad_norm": 0.0238037109375, + "learning_rate": 0.026035024645674112, + "loss": 0.8047, + "num_input_tokens_seen": 24409128, + "step": 42055 + }, + { + "epoch": 6.264521894548704, + "grad_norm": 0.0201416015625, + "learning_rate": 0.026033703970756165, + "loss": 0.8049, + "num_input_tokens_seen": 24411816, + "step": 42060 + }, + { + "epoch": 6.265266607089663, + "grad_norm": 0.02978515625, + "learning_rate": 0.026032383109435005, + "loss": 0.7939, + "num_input_tokens_seen": 24414536, + "step": 42065 + }, + { + "epoch": 6.266011319630622, + "grad_norm": 0.01409912109375, + "learning_rate": 0.026031062061732942, + "loss": 0.8128, + "num_input_tokens_seen": 24417192, + "step": 42070 + }, + { + "epoch": 6.266756032171582, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0260297408276723, + "loss": 0.808, + "num_input_tokens_seen": 24419816, + "step": 42075 + }, + { + "epoch": 6.267500744712541, + "grad_norm": 0.017578125, + "learning_rate": 0.026028419407275398, + "loss": 0.7962, + "num_input_tokens_seen": 24422728, + "step": 42080 + }, + { + "epoch": 6.2682454572535, + "grad_norm": 0.027099609375, + "learning_rate": 0.026027097800564558, + "loss": 0.7897, + "num_input_tokens_seen": 24425448, + "step": 42085 + }, + { + "epoch": 6.268990169794459, + "grad_norm": 0.033203125, + "learning_rate": 0.026025776007562108, + "loss": 0.8239, + "num_input_tokens_seen": 24428488, + "step": 42090 + }, + { + "epoch": 6.269734882335419, + "grad_norm": 0.0240478515625, + "learning_rate": 0.02602445402829038, + "loss": 0.7886, + "num_input_tokens_seen": 24431720, + "step": 42095 + }, + { + "epoch": 6.270479594876377, + "grad_norm": 0.02490234375, + "learning_rate": 0.026023131862771703, + "loss": 0.8096, + "num_input_tokens_seen": 24434440, + "step": 42100 + }, + { + "epoch": 6.271224307417337, + "grad_norm": 0.0213623046875, + "learning_rate": 0.026021809511028417, + "loss": 0.7938, + "num_input_tokens_seen": 24437352, + "step": 42105 + }, + { + "epoch": 6.271969019958296, + "grad_norm": 0.024658203125, + "learning_rate": 0.026020486973082863, + "loss": 0.8102, + "num_input_tokens_seen": 24440136, + "step": 42110 + }, + { + "epoch": 6.2727137324992555, + "grad_norm": 0.02197265625, + "learning_rate": 0.02601916424895738, + "loss": 0.806, + "num_input_tokens_seen": 24443144, + "step": 42115 + }, + { + "epoch": 6.273458445040214, + "grad_norm": 0.0186767578125, + "learning_rate": 0.02601784133867432, + "loss": 0.7971, + "num_input_tokens_seen": 24445896, + "step": 42120 + }, + { + "epoch": 6.274203157581174, + "grad_norm": 0.01904296875, + "learning_rate": 0.026016518242256023, + "loss": 0.7957, + "num_input_tokens_seen": 24449160, + "step": 42125 + }, + { + "epoch": 6.274947870122133, + "grad_norm": 0.0186767578125, + "learning_rate": 0.026015194959724854, + "loss": 0.8054, + "num_input_tokens_seen": 24452168, + "step": 42130 + }, + { + "epoch": 6.275692582663092, + "grad_norm": 0.0185546875, + "learning_rate": 0.026013871491103156, + "loss": 0.7977, + "num_input_tokens_seen": 24455048, + "step": 42135 + }, + { + "epoch": 6.276437295204051, + "grad_norm": 0.03271484375, + "learning_rate": 0.026012547836413297, + "loss": 0.8195, + "num_input_tokens_seen": 24457800, + "step": 42140 + }, + { + "epoch": 6.277182007745011, + "grad_norm": 0.0172119140625, + "learning_rate": 0.02601122399567763, + "loss": 0.8062, + "num_input_tokens_seen": 24460520, + "step": 42145 + }, + { + "epoch": 6.277926720285969, + "grad_norm": 0.02294921875, + "learning_rate": 0.026009899968918528, + "loss": 0.7955, + "num_input_tokens_seen": 24463208, + "step": 42150 + }, + { + "epoch": 6.278671432826929, + "grad_norm": 0.022705078125, + "learning_rate": 0.026008575756158354, + "loss": 0.8198, + "num_input_tokens_seen": 24466184, + "step": 42155 + }, + { + "epoch": 6.279416145367888, + "grad_norm": 0.0205078125, + "learning_rate": 0.026007251357419485, + "loss": 0.8194, + "num_input_tokens_seen": 24469256, + "step": 42160 + }, + { + "epoch": 6.2801608579088475, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02600592677272429, + "loss": 0.8208, + "num_input_tokens_seen": 24472104, + "step": 42165 + }, + { + "epoch": 6.280905570449806, + "grad_norm": 0.0198974609375, + "learning_rate": 0.026004602002095145, + "loss": 0.7768, + "num_input_tokens_seen": 24474728, + "step": 42170 + }, + { + "epoch": 6.281650282990766, + "grad_norm": 0.01165771484375, + "learning_rate": 0.026003277045554437, + "loss": 0.7992, + "num_input_tokens_seen": 24477480, + "step": 42175 + }, + { + "epoch": 6.282394995531725, + "grad_norm": 0.019287109375, + "learning_rate": 0.02600195190312455, + "loss": 0.7891, + "num_input_tokens_seen": 24480424, + "step": 42180 + }, + { + "epoch": 6.283139708072684, + "grad_norm": 0.018798828125, + "learning_rate": 0.02600062657482786, + "loss": 0.8211, + "num_input_tokens_seen": 24483560, + "step": 42185 + }, + { + "epoch": 6.283884420613643, + "grad_norm": 0.037353515625, + "learning_rate": 0.025999301060686767, + "loss": 0.8263, + "num_input_tokens_seen": 24486504, + "step": 42190 + }, + { + "epoch": 6.284629133154603, + "grad_norm": 0.036376953125, + "learning_rate": 0.02599797536072367, + "loss": 0.8033, + "num_input_tokens_seen": 24489544, + "step": 42195 + }, + { + "epoch": 6.2853738456955615, + "grad_norm": 0.018798828125, + "learning_rate": 0.025996649474960946, + "loss": 0.806, + "num_input_tokens_seen": 24492456, + "step": 42200 + }, + { + "epoch": 6.286118558236521, + "grad_norm": 0.017822265625, + "learning_rate": 0.02599532340342101, + "loss": 0.8144, + "num_input_tokens_seen": 24495368, + "step": 42205 + }, + { + "epoch": 6.28686327077748, + "grad_norm": 0.02001953125, + "learning_rate": 0.025993997146126258, + "loss": 0.7992, + "num_input_tokens_seen": 24498344, + "step": 42210 + }, + { + "epoch": 6.2876079833184395, + "grad_norm": 0.01904296875, + "learning_rate": 0.0259926707030991, + "loss": 0.8134, + "num_input_tokens_seen": 24501480, + "step": 42215 + }, + { + "epoch": 6.288352695859398, + "grad_norm": 0.02001953125, + "learning_rate": 0.025991344074361943, + "loss": 0.8129, + "num_input_tokens_seen": 24504456, + "step": 42220 + }, + { + "epoch": 6.289097408400357, + "grad_norm": 0.018310546875, + "learning_rate": 0.0259900172599372, + "loss": 0.7952, + "num_input_tokens_seen": 24507272, + "step": 42225 + }, + { + "epoch": 6.289842120941317, + "grad_norm": 0.01275634765625, + "learning_rate": 0.02598869025984728, + "loss": 0.8111, + "num_input_tokens_seen": 24510216, + "step": 42230 + }, + { + "epoch": 6.290586833482275, + "grad_norm": 0.021728515625, + "learning_rate": 0.025987363074114613, + "loss": 0.8186, + "num_input_tokens_seen": 24512872, + "step": 42235 + }, + { + "epoch": 6.291331546023235, + "grad_norm": 0.0245361328125, + "learning_rate": 0.025986035702761607, + "loss": 0.7761, + "num_input_tokens_seen": 24515592, + "step": 42240 + }, + { + "epoch": 6.292076258564194, + "grad_norm": 0.0203857421875, + "learning_rate": 0.025984708145810706, + "loss": 0.7937, + "num_input_tokens_seen": 24518504, + "step": 42245 + }, + { + "epoch": 6.2928209711051535, + "grad_norm": 0.02587890625, + "learning_rate": 0.025983380403284313, + "loss": 0.8015, + "num_input_tokens_seen": 24521480, + "step": 42250 + }, + { + "epoch": 6.293565683646112, + "grad_norm": 0.03564453125, + "learning_rate": 0.025982052475204877, + "loss": 0.8083, + "num_input_tokens_seen": 24524328, + "step": 42255 + }, + { + "epoch": 6.294310396187072, + "grad_norm": 0.01953125, + "learning_rate": 0.02598072436159482, + "loss": 0.8174, + "num_input_tokens_seen": 24527144, + "step": 42260 + }, + { + "epoch": 6.295055108728031, + "grad_norm": 0.0244140625, + "learning_rate": 0.02597939606247659, + "loss": 0.8053, + "num_input_tokens_seen": 24530344, + "step": 42265 + }, + { + "epoch": 6.29579982126899, + "grad_norm": 0.03125, + "learning_rate": 0.025978067577872622, + "loss": 0.8132, + "num_input_tokens_seen": 24533160, + "step": 42270 + }, + { + "epoch": 6.296544533809949, + "grad_norm": 0.0220947265625, + "learning_rate": 0.025976738907805365, + "loss": 0.794, + "num_input_tokens_seen": 24536328, + "step": 42275 + }, + { + "epoch": 6.297289246350909, + "grad_norm": 0.0240478515625, + "learning_rate": 0.025975410052297254, + "loss": 0.8004, + "num_input_tokens_seen": 24539144, + "step": 42280 + }, + { + "epoch": 6.2980339588918675, + "grad_norm": 0.0224609375, + "learning_rate": 0.025974081011370744, + "loss": 0.8079, + "num_input_tokens_seen": 24541928, + "step": 42285 + }, + { + "epoch": 6.298778671432827, + "grad_norm": 0.01953125, + "learning_rate": 0.02597275178504829, + "loss": 0.7966, + "num_input_tokens_seen": 24544744, + "step": 42290 + }, + { + "epoch": 6.299523383973786, + "grad_norm": 0.022216796875, + "learning_rate": 0.02597142237335235, + "loss": 0.8024, + "num_input_tokens_seen": 24547592, + "step": 42295 + }, + { + "epoch": 6.3002680965147455, + "grad_norm": 0.0257568359375, + "learning_rate": 0.025970092776305374, + "loss": 0.809, + "num_input_tokens_seen": 24550440, + "step": 42300 + }, + { + "epoch": 6.301012809055704, + "grad_norm": 0.0257568359375, + "learning_rate": 0.025968762993929837, + "loss": 0.7988, + "num_input_tokens_seen": 24553544, + "step": 42305 + }, + { + "epoch": 6.301757521596664, + "grad_norm": 0.0191650390625, + "learning_rate": 0.025967433026248196, + "loss": 0.7984, + "num_input_tokens_seen": 24556360, + "step": 42310 + }, + { + "epoch": 6.302502234137623, + "grad_norm": 0.01287841796875, + "learning_rate": 0.025966102873282916, + "loss": 0.8029, + "num_input_tokens_seen": 24559272, + "step": 42315 + }, + { + "epoch": 6.303246946678582, + "grad_norm": 0.01953125, + "learning_rate": 0.025964772535056475, + "loss": 0.8008, + "num_input_tokens_seen": 24562216, + "step": 42320 + }, + { + "epoch": 6.303991659219541, + "grad_norm": 0.0296630859375, + "learning_rate": 0.025963442011591353, + "loss": 0.7865, + "num_input_tokens_seen": 24565288, + "step": 42325 + }, + { + "epoch": 6.304736371760501, + "grad_norm": 0.0262451171875, + "learning_rate": 0.025962111302910013, + "loss": 0.797, + "num_input_tokens_seen": 24568296, + "step": 42330 + }, + { + "epoch": 6.3054810843014595, + "grad_norm": 0.0137939453125, + "learning_rate": 0.025960780409034948, + "loss": 0.8151, + "num_input_tokens_seen": 24571176, + "step": 42335 + }, + { + "epoch": 6.306225796842419, + "grad_norm": 0.02685546875, + "learning_rate": 0.025959449329988633, + "loss": 0.8172, + "num_input_tokens_seen": 24574376, + "step": 42340 + }, + { + "epoch": 6.306970509383378, + "grad_norm": 0.0272216796875, + "learning_rate": 0.025958118065793567, + "loss": 0.8093, + "num_input_tokens_seen": 24577256, + "step": 42345 + }, + { + "epoch": 6.3077152219243375, + "grad_norm": 0.0133056640625, + "learning_rate": 0.02595678661647223, + "loss": 0.778, + "num_input_tokens_seen": 24580136, + "step": 42350 + }, + { + "epoch": 6.308459934465296, + "grad_norm": 0.01611328125, + "learning_rate": 0.025955454982047117, + "loss": 0.8168, + "num_input_tokens_seen": 24583272, + "step": 42355 + }, + { + "epoch": 6.309204647006256, + "grad_norm": 0.0272216796875, + "learning_rate": 0.02595412316254073, + "loss": 0.8125, + "num_input_tokens_seen": 24586376, + "step": 42360 + }, + { + "epoch": 6.309949359547215, + "grad_norm": 0.021484375, + "learning_rate": 0.025952791157975566, + "loss": 0.788, + "num_input_tokens_seen": 24589320, + "step": 42365 + }, + { + "epoch": 6.310694072088174, + "grad_norm": 0.0228271484375, + "learning_rate": 0.025951458968374124, + "loss": 0.8012, + "num_input_tokens_seen": 24592072, + "step": 42370 + }, + { + "epoch": 6.311438784629133, + "grad_norm": 0.026123046875, + "learning_rate": 0.025950126593758914, + "loss": 0.8092, + "num_input_tokens_seen": 24594792, + "step": 42375 + }, + { + "epoch": 6.312183497170093, + "grad_norm": 0.02197265625, + "learning_rate": 0.02594879403415245, + "loss": 0.807, + "num_input_tokens_seen": 24597576, + "step": 42380 + }, + { + "epoch": 6.3129282097110515, + "grad_norm": 0.0224609375, + "learning_rate": 0.025947461289577232, + "loss": 0.8058, + "num_input_tokens_seen": 24600840, + "step": 42385 + }, + { + "epoch": 6.31367292225201, + "grad_norm": 0.0213623046875, + "learning_rate": 0.025946128360055783, + "loss": 0.8014, + "num_input_tokens_seen": 24603784, + "step": 42390 + }, + { + "epoch": 6.31441763479297, + "grad_norm": 0.01483154296875, + "learning_rate": 0.025944795245610622, + "loss": 0.8072, + "num_input_tokens_seen": 24606952, + "step": 42395 + }, + { + "epoch": 6.31516234733393, + "grad_norm": 0.03515625, + "learning_rate": 0.02594346194626427, + "loss": 0.8071, + "num_input_tokens_seen": 24609736, + "step": 42400 + }, + { + "epoch": 6.315907059874888, + "grad_norm": 0.021728515625, + "learning_rate": 0.025942128462039252, + "loss": 0.8016, + "num_input_tokens_seen": 24612648, + "step": 42405 + }, + { + "epoch": 6.316651772415847, + "grad_norm": 0.023681640625, + "learning_rate": 0.02594079479295809, + "loss": 0.8102, + "num_input_tokens_seen": 24615304, + "step": 42410 + }, + { + "epoch": 6.317396484956807, + "grad_norm": 0.0303955078125, + "learning_rate": 0.02593946093904332, + "loss": 0.8047, + "num_input_tokens_seen": 24618408, + "step": 42415 + }, + { + "epoch": 6.3181411974977655, + "grad_norm": 0.0203857421875, + "learning_rate": 0.025938126900317483, + "loss": 0.7959, + "num_input_tokens_seen": 24621672, + "step": 42420 + }, + { + "epoch": 6.318885910038725, + "grad_norm": 0.021484375, + "learning_rate": 0.025936792676803103, + "loss": 0.7971, + "num_input_tokens_seen": 24624488, + "step": 42425 + }, + { + "epoch": 6.319630622579684, + "grad_norm": 0.030517578125, + "learning_rate": 0.025935458268522726, + "loss": 0.8113, + "num_input_tokens_seen": 24627528, + "step": 42430 + }, + { + "epoch": 6.3203753351206435, + "grad_norm": 0.022705078125, + "learning_rate": 0.025934123675498896, + "loss": 0.8021, + "num_input_tokens_seen": 24630216, + "step": 42435 + }, + { + "epoch": 6.321120047661602, + "grad_norm": 0.0145263671875, + "learning_rate": 0.025932788897754164, + "loss": 0.7986, + "num_input_tokens_seen": 24633320, + "step": 42440 + }, + { + "epoch": 6.321864760202562, + "grad_norm": 0.013427734375, + "learning_rate": 0.025931453935311072, + "loss": 0.8032, + "num_input_tokens_seen": 24636040, + "step": 42445 + }, + { + "epoch": 6.322609472743521, + "grad_norm": 0.02490234375, + "learning_rate": 0.025930118788192173, + "loss": 0.8154, + "num_input_tokens_seen": 24639272, + "step": 42450 + }, + { + "epoch": 6.32335418528448, + "grad_norm": 0.020751953125, + "learning_rate": 0.025928783456420027, + "loss": 0.7936, + "num_input_tokens_seen": 24642312, + "step": 42455 + }, + { + "epoch": 6.324098897825439, + "grad_norm": 0.02099609375, + "learning_rate": 0.025927447940017195, + "loss": 0.8031, + "num_input_tokens_seen": 24645448, + "step": 42460 + }, + { + "epoch": 6.324843610366399, + "grad_norm": 0.0213623046875, + "learning_rate": 0.025926112239006236, + "loss": 0.8009, + "num_input_tokens_seen": 24648264, + "step": 42465 + }, + { + "epoch": 6.3255883229073575, + "grad_norm": 0.0198974609375, + "learning_rate": 0.025924776353409715, + "loss": 0.8114, + "num_input_tokens_seen": 24651144, + "step": 42470 + }, + { + "epoch": 6.326333035448317, + "grad_norm": 0.01953125, + "learning_rate": 0.0259234402832502, + "loss": 0.7782, + "num_input_tokens_seen": 24653928, + "step": 42475 + }, + { + "epoch": 6.327077747989276, + "grad_norm": 0.0247802734375, + "learning_rate": 0.025922104028550264, + "loss": 0.7974, + "num_input_tokens_seen": 24656840, + "step": 42480 + }, + { + "epoch": 6.327822460530236, + "grad_norm": 0.0281982421875, + "learning_rate": 0.025920767589332483, + "loss": 0.8176, + "num_input_tokens_seen": 24659656, + "step": 42485 + }, + { + "epoch": 6.328567173071194, + "grad_norm": 0.02734375, + "learning_rate": 0.02591943096561943, + "loss": 0.79, + "num_input_tokens_seen": 24662440, + "step": 42490 + }, + { + "epoch": 6.329311885612154, + "grad_norm": 0.0272216796875, + "learning_rate": 0.025918094157433694, + "loss": 0.792, + "num_input_tokens_seen": 24665224, + "step": 42495 + }, + { + "epoch": 6.330056598153113, + "grad_norm": 0.020263671875, + "learning_rate": 0.025916757164797844, + "loss": 0.8031, + "num_input_tokens_seen": 24668168, + "step": 42500 + }, + { + "epoch": 6.330801310694072, + "grad_norm": 0.031005859375, + "learning_rate": 0.025915419987734487, + "loss": 0.8032, + "num_input_tokens_seen": 24670760, + "step": 42505 + }, + { + "epoch": 6.331546023235031, + "grad_norm": 0.019287109375, + "learning_rate": 0.0259140826262662, + "loss": 0.8123, + "num_input_tokens_seen": 24673832, + "step": 42510 + }, + { + "epoch": 6.332290735775991, + "grad_norm": 0.019287109375, + "learning_rate": 0.025912745080415574, + "loss": 0.7929, + "num_input_tokens_seen": 24676776, + "step": 42515 + }, + { + "epoch": 6.3330354483169495, + "grad_norm": 0.02294921875, + "learning_rate": 0.025911407350205216, + "loss": 0.785, + "num_input_tokens_seen": 24679688, + "step": 42520 + }, + { + "epoch": 6.333780160857909, + "grad_norm": 0.0186767578125, + "learning_rate": 0.02591006943565772, + "loss": 0.7896, + "num_input_tokens_seen": 24682408, + "step": 42525 + }, + { + "epoch": 6.334524873398868, + "grad_norm": 0.0250244140625, + "learning_rate": 0.025908731336795693, + "loss": 0.7938, + "num_input_tokens_seen": 24685256, + "step": 42530 + }, + { + "epoch": 6.335269585939828, + "grad_norm": 0.013671875, + "learning_rate": 0.02590739305364173, + "loss": 0.791, + "num_input_tokens_seen": 24688008, + "step": 42535 + }, + { + "epoch": 6.336014298480786, + "grad_norm": 0.01904296875, + "learning_rate": 0.025906054586218457, + "loss": 0.7865, + "num_input_tokens_seen": 24690824, + "step": 42540 + }, + { + "epoch": 6.336759011021746, + "grad_norm": 0.0281982421875, + "learning_rate": 0.025904715934548468, + "loss": 0.7824, + "num_input_tokens_seen": 24693832, + "step": 42545 + }, + { + "epoch": 6.337503723562705, + "grad_norm": 0.0189208984375, + "learning_rate": 0.025903377098654385, + "loss": 0.7751, + "num_input_tokens_seen": 24696872, + "step": 42550 + }, + { + "epoch": 6.338248436103664, + "grad_norm": 0.0203857421875, + "learning_rate": 0.025902038078558832, + "loss": 0.7723, + "num_input_tokens_seen": 24699784, + "step": 42555 + }, + { + "epoch": 6.338993148644623, + "grad_norm": 0.02001953125, + "learning_rate": 0.02590069887428443, + "loss": 0.7799, + "num_input_tokens_seen": 24702920, + "step": 42560 + }, + { + "epoch": 6.339737861185583, + "grad_norm": 0.0291748046875, + "learning_rate": 0.02589935948585379, + "loss": 0.7915, + "num_input_tokens_seen": 24705928, + "step": 42565 + }, + { + "epoch": 6.340482573726542, + "grad_norm": 0.020263671875, + "learning_rate": 0.025898019913289556, + "loss": 0.7623, + "num_input_tokens_seen": 24708808, + "step": 42570 + }, + { + "epoch": 6.3412272862675, + "grad_norm": 0.0242919921875, + "learning_rate": 0.02589668015661435, + "loss": 0.7848, + "num_input_tokens_seen": 24711624, + "step": 42575 + }, + { + "epoch": 6.34197199880846, + "grad_norm": 0.037353515625, + "learning_rate": 0.02589534021585081, + "loss": 0.8208, + "num_input_tokens_seen": 24714728, + "step": 42580 + }, + { + "epoch": 6.342716711349419, + "grad_norm": 0.02001953125, + "learning_rate": 0.025894000091021566, + "loss": 0.7905, + "num_input_tokens_seen": 24717384, + "step": 42585 + }, + { + "epoch": 6.343461423890378, + "grad_norm": 0.0419921875, + "learning_rate": 0.025892659782149264, + "loss": 0.8197, + "num_input_tokens_seen": 24720296, + "step": 42590 + }, + { + "epoch": 6.344206136431337, + "grad_norm": 0.0240478515625, + "learning_rate": 0.025891319289256545, + "loss": 0.8021, + "num_input_tokens_seen": 24723272, + "step": 42595 + }, + { + "epoch": 6.344950848972297, + "grad_norm": 0.013427734375, + "learning_rate": 0.025889978612366058, + "loss": 0.7931, + "num_input_tokens_seen": 24726152, + "step": 42600 + }, + { + "epoch": 6.3456955615132555, + "grad_norm": 0.031494140625, + "learning_rate": 0.025888637751500448, + "loss": 0.8213, + "num_input_tokens_seen": 24729000, + "step": 42605 + }, + { + "epoch": 6.346440274054215, + "grad_norm": 0.031982421875, + "learning_rate": 0.025887296706682376, + "loss": 0.8143, + "num_input_tokens_seen": 24731816, + "step": 42610 + }, + { + "epoch": 6.347184986595174, + "grad_norm": 0.0201416015625, + "learning_rate": 0.025885955477934486, + "loss": 0.8078, + "num_input_tokens_seen": 24734568, + "step": 42615 + }, + { + "epoch": 6.347929699136134, + "grad_norm": 0.0260009765625, + "learning_rate": 0.025884614065279445, + "loss": 0.7744, + "num_input_tokens_seen": 24737672, + "step": 42620 + }, + { + "epoch": 6.348674411677092, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02588327246873991, + "loss": 0.779, + "num_input_tokens_seen": 24740424, + "step": 42625 + }, + { + "epoch": 6.349419124218052, + "grad_norm": 0.0228271484375, + "learning_rate": 0.02588193068833855, + "loss": 0.7782, + "num_input_tokens_seen": 24743240, + "step": 42630 + }, + { + "epoch": 6.350163836759011, + "grad_norm": 0.021240234375, + "learning_rate": 0.025880588724098024, + "loss": 0.7779, + "num_input_tokens_seen": 24746120, + "step": 42635 + }, + { + "epoch": 6.35090854929997, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02587924657604102, + "loss": 0.7881, + "num_input_tokens_seen": 24749160, + "step": 42640 + }, + { + "epoch": 6.351653261840929, + "grad_norm": 0.0235595703125, + "learning_rate": 0.025877904244190196, + "loss": 0.7917, + "num_input_tokens_seen": 24752008, + "step": 42645 + }, + { + "epoch": 6.352397974381889, + "grad_norm": 0.024658203125, + "learning_rate": 0.025876561728568236, + "loss": 0.7805, + "num_input_tokens_seen": 24754824, + "step": 42650 + }, + { + "epoch": 6.353142686922848, + "grad_norm": 0.02392578125, + "learning_rate": 0.02587521902919782, + "loss": 0.7812, + "num_input_tokens_seen": 24757576, + "step": 42655 + }, + { + "epoch": 6.353887399463807, + "grad_norm": 0.023193359375, + "learning_rate": 0.025873876146101638, + "loss": 0.8076, + "num_input_tokens_seen": 24760168, + "step": 42660 + }, + { + "epoch": 6.354632112004766, + "grad_norm": 0.0281982421875, + "learning_rate": 0.025872533079302363, + "loss": 0.7782, + "num_input_tokens_seen": 24763240, + "step": 42665 + }, + { + "epoch": 6.355376824545726, + "grad_norm": 0.033203125, + "learning_rate": 0.02587118982882269, + "loss": 0.8324, + "num_input_tokens_seen": 24765960, + "step": 42670 + }, + { + "epoch": 6.356121537086684, + "grad_norm": 0.035888671875, + "learning_rate": 0.02586984639468532, + "loss": 0.8179, + "num_input_tokens_seen": 24768808, + "step": 42675 + }, + { + "epoch": 6.356866249627644, + "grad_norm": 0.0230712890625, + "learning_rate": 0.025868502776912942, + "loss": 0.7999, + "num_input_tokens_seen": 24771912, + "step": 42680 + }, + { + "epoch": 6.357610962168603, + "grad_norm": 0.021728515625, + "learning_rate": 0.025867158975528257, + "loss": 0.8019, + "num_input_tokens_seen": 24774952, + "step": 42685 + }, + { + "epoch": 6.358355674709562, + "grad_norm": 0.027587890625, + "learning_rate": 0.02586581499055396, + "loss": 0.794, + "num_input_tokens_seen": 24777832, + "step": 42690 + }, + { + "epoch": 6.359100387250521, + "grad_norm": 0.01275634765625, + "learning_rate": 0.025864470822012764, + "loss": 0.7984, + "num_input_tokens_seen": 24781160, + "step": 42695 + }, + { + "epoch": 6.359845099791481, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02586312646992738, + "loss": 0.8114, + "num_input_tokens_seen": 24784104, + "step": 42700 + }, + { + "epoch": 6.36058981233244, + "grad_norm": 0.0169677734375, + "learning_rate": 0.02586178193432051, + "loss": 0.7814, + "num_input_tokens_seen": 24786632, + "step": 42705 + }, + { + "epoch": 6.361334524873399, + "grad_norm": 0.01904296875, + "learning_rate": 0.025860437215214877, + "loss": 0.8152, + "num_input_tokens_seen": 24789544, + "step": 42710 + }, + { + "epoch": 6.362079237414358, + "grad_norm": 0.0269775390625, + "learning_rate": 0.025859092312633194, + "loss": 0.8113, + "num_input_tokens_seen": 24792488, + "step": 42715 + }, + { + "epoch": 6.362823949955318, + "grad_norm": 0.0308837890625, + "learning_rate": 0.025857747226598185, + "loss": 0.7997, + "num_input_tokens_seen": 24795272, + "step": 42720 + }, + { + "epoch": 6.363568662496276, + "grad_norm": 0.021728515625, + "learning_rate": 0.025856401957132572, + "loss": 0.8266, + "num_input_tokens_seen": 24798152, + "step": 42725 + }, + { + "epoch": 6.364313375037236, + "grad_norm": 0.027587890625, + "learning_rate": 0.025855056504259077, + "loss": 0.795, + "num_input_tokens_seen": 24801000, + "step": 42730 + }, + { + "epoch": 6.365058087578195, + "grad_norm": 0.0146484375, + "learning_rate": 0.02585371086800044, + "loss": 0.8349, + "num_input_tokens_seen": 24803816, + "step": 42735 + }, + { + "epoch": 6.365802800119154, + "grad_norm": 0.021240234375, + "learning_rate": 0.02585236504837939, + "loss": 0.7767, + "num_input_tokens_seen": 24806856, + "step": 42740 + }, + { + "epoch": 6.366547512660113, + "grad_norm": 0.028076171875, + "learning_rate": 0.025851019045418658, + "loss": 0.8055, + "num_input_tokens_seen": 24809608, + "step": 42745 + }, + { + "epoch": 6.367292225201073, + "grad_norm": 0.024169921875, + "learning_rate": 0.025849672859140988, + "loss": 0.8231, + "num_input_tokens_seen": 24812584, + "step": 42750 + }, + { + "epoch": 6.368036937742032, + "grad_norm": 0.0224609375, + "learning_rate": 0.025848326489569123, + "loss": 0.811, + "num_input_tokens_seen": 24815272, + "step": 42755 + }, + { + "epoch": 6.36878165028299, + "grad_norm": 0.0196533203125, + "learning_rate": 0.025846979936725807, + "loss": 0.7981, + "num_input_tokens_seen": 24818376, + "step": 42760 + }, + { + "epoch": 6.36952636282395, + "grad_norm": 0.0267333984375, + "learning_rate": 0.02584563320063379, + "loss": 0.7766, + "num_input_tokens_seen": 24821160, + "step": 42765 + }, + { + "epoch": 6.370271075364909, + "grad_norm": 0.0179443359375, + "learning_rate": 0.025844286281315822, + "loss": 0.8255, + "num_input_tokens_seen": 24824008, + "step": 42770 + }, + { + "epoch": 6.371015787905868, + "grad_norm": 0.0228271484375, + "learning_rate": 0.025842939178794665, + "loss": 0.8257, + "num_input_tokens_seen": 24826696, + "step": 42775 + }, + { + "epoch": 6.371760500446827, + "grad_norm": 0.0189208984375, + "learning_rate": 0.025841591893093065, + "loss": 0.804, + "num_input_tokens_seen": 24829384, + "step": 42780 + }, + { + "epoch": 6.372505212987787, + "grad_norm": 0.0245361328125, + "learning_rate": 0.02584024442423379, + "loss": 0.7952, + "num_input_tokens_seen": 24832168, + "step": 42785 + }, + { + "epoch": 6.373249925528746, + "grad_norm": 0.0203857421875, + "learning_rate": 0.025838896772239606, + "loss": 0.8025, + "num_input_tokens_seen": 24835208, + "step": 42790 + }, + { + "epoch": 6.373994638069705, + "grad_norm": 0.01263427734375, + "learning_rate": 0.025837548937133272, + "loss": 0.7978, + "num_input_tokens_seen": 24837896, + "step": 42795 + }, + { + "epoch": 6.374739350610664, + "grad_norm": 0.017333984375, + "learning_rate": 0.025836200918937565, + "loss": 0.8249, + "num_input_tokens_seen": 24840936, + "step": 42800 + }, + { + "epoch": 6.375484063151624, + "grad_norm": 0.0191650390625, + "learning_rate": 0.025834852717675258, + "loss": 0.8152, + "num_input_tokens_seen": 24843816, + "step": 42805 + }, + { + "epoch": 6.376228775692582, + "grad_norm": 0.0255126953125, + "learning_rate": 0.025833504333369128, + "loss": 0.8065, + "num_input_tokens_seen": 24846440, + "step": 42810 + }, + { + "epoch": 6.376973488233542, + "grad_norm": 0.0322265625, + "learning_rate": 0.02583215576604195, + "loss": 0.7968, + "num_input_tokens_seen": 24849480, + "step": 42815 + }, + { + "epoch": 6.377718200774501, + "grad_norm": 0.0152587890625, + "learning_rate": 0.025830807015716512, + "loss": 0.8107, + "num_input_tokens_seen": 24852360, + "step": 42820 + }, + { + "epoch": 6.3784629133154604, + "grad_norm": 0.02294921875, + "learning_rate": 0.025829458082415595, + "loss": 0.7891, + "num_input_tokens_seen": 24855464, + "step": 42825 + }, + { + "epoch": 6.379207625856419, + "grad_norm": 0.018310546875, + "learning_rate": 0.025828108966161993, + "loss": 0.7997, + "num_input_tokens_seen": 24858344, + "step": 42830 + }, + { + "epoch": 6.379952338397379, + "grad_norm": 0.028076171875, + "learning_rate": 0.025826759666978494, + "loss": 0.8336, + "num_input_tokens_seen": 24861352, + "step": 42835 + }, + { + "epoch": 6.380697050938338, + "grad_norm": 0.0286865234375, + "learning_rate": 0.025825410184887893, + "loss": 0.8049, + "num_input_tokens_seen": 24864264, + "step": 42840 + }, + { + "epoch": 6.381441763479297, + "grad_norm": 0.0233154296875, + "learning_rate": 0.02582406051991299, + "loss": 0.7916, + "num_input_tokens_seen": 24867656, + "step": 42845 + }, + { + "epoch": 6.382186476020256, + "grad_norm": 0.0203857421875, + "learning_rate": 0.025822710672076588, + "loss": 0.777, + "num_input_tokens_seen": 24870344, + "step": 42850 + }, + { + "epoch": 6.382931188561216, + "grad_norm": 0.02734375, + "learning_rate": 0.025821360641401486, + "loss": 0.8088, + "num_input_tokens_seen": 24873224, + "step": 42855 + }, + { + "epoch": 6.383675901102174, + "grad_norm": 0.021484375, + "learning_rate": 0.025820010427910496, + "loss": 0.7873, + "num_input_tokens_seen": 24875976, + "step": 42860 + }, + { + "epoch": 6.384420613643134, + "grad_norm": 0.01373291015625, + "learning_rate": 0.025818660031626424, + "loss": 0.7951, + "num_input_tokens_seen": 24879144, + "step": 42865 + }, + { + "epoch": 6.385165326184093, + "grad_norm": 0.01611328125, + "learning_rate": 0.025817309452572088, + "loss": 0.8028, + "num_input_tokens_seen": 24881960, + "step": 42870 + }, + { + "epoch": 6.3859100387250525, + "grad_norm": 0.0260009765625, + "learning_rate": 0.025815958690770302, + "loss": 0.7812, + "num_input_tokens_seen": 24884840, + "step": 42875 + }, + { + "epoch": 6.386654751266011, + "grad_norm": 0.01336669921875, + "learning_rate": 0.02581460774624389, + "loss": 0.8144, + "num_input_tokens_seen": 24887464, + "step": 42880 + }, + { + "epoch": 6.387399463806971, + "grad_norm": 0.02490234375, + "learning_rate": 0.025813256619015672, + "loss": 0.8237, + "num_input_tokens_seen": 24890408, + "step": 42885 + }, + { + "epoch": 6.38814417634793, + "grad_norm": 0.0113525390625, + "learning_rate": 0.02581190530910847, + "loss": 0.811, + "num_input_tokens_seen": 24893608, + "step": 42890 + }, + { + "epoch": 6.388888888888889, + "grad_norm": 0.0245361328125, + "learning_rate": 0.025810553816545116, + "loss": 0.8092, + "num_input_tokens_seen": 24896136, + "step": 42895 + }, + { + "epoch": 6.389633601429848, + "grad_norm": 0.0177001953125, + "learning_rate": 0.025809202141348447, + "loss": 0.8109, + "num_input_tokens_seen": 24899208, + "step": 42900 + }, + { + "epoch": 6.390378313970807, + "grad_norm": 0.01446533203125, + "learning_rate": 0.02580785028354129, + "loss": 0.7857, + "num_input_tokens_seen": 24902120, + "step": 42905 + }, + { + "epoch": 6.3911230265117664, + "grad_norm": 0.027587890625, + "learning_rate": 0.02580649824314649, + "loss": 0.788, + "num_input_tokens_seen": 24904776, + "step": 42910 + }, + { + "epoch": 6.391867739052726, + "grad_norm": 0.0306396484375, + "learning_rate": 0.025805146020186884, + "loss": 0.7942, + "num_input_tokens_seen": 24907688, + "step": 42915 + }, + { + "epoch": 6.392612451593685, + "grad_norm": 0.013427734375, + "learning_rate": 0.025803793614685315, + "loss": 0.8055, + "num_input_tokens_seen": 24910792, + "step": 42920 + }, + { + "epoch": 6.393357164134644, + "grad_norm": 0.02001953125, + "learning_rate": 0.025802441026664638, + "loss": 0.794, + "num_input_tokens_seen": 24913736, + "step": 42925 + }, + { + "epoch": 6.394101876675603, + "grad_norm": 0.0228271484375, + "learning_rate": 0.025801088256147697, + "loss": 0.8318, + "num_input_tokens_seen": 24916712, + "step": 42930 + }, + { + "epoch": 6.394846589216562, + "grad_norm": 0.0291748046875, + "learning_rate": 0.025799735303157344, + "loss": 0.8229, + "num_input_tokens_seen": 24919432, + "step": 42935 + }, + { + "epoch": 6.395591301757522, + "grad_norm": 0.024169921875, + "learning_rate": 0.025798382167716447, + "loss": 0.7874, + "num_input_tokens_seen": 24922184, + "step": 42940 + }, + { + "epoch": 6.39633601429848, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02579702884984785, + "loss": 0.8091, + "num_input_tokens_seen": 24925000, + "step": 42945 + }, + { + "epoch": 6.39708072683944, + "grad_norm": 0.03369140625, + "learning_rate": 0.02579567534957443, + "loss": 0.7906, + "num_input_tokens_seen": 24928232, + "step": 42950 + }, + { + "epoch": 6.397825439380399, + "grad_norm": 0.024169921875, + "learning_rate": 0.025794321666919046, + "loss": 0.8083, + "num_input_tokens_seen": 24930952, + "step": 42955 + }, + { + "epoch": 6.3985701519213585, + "grad_norm": 0.02392578125, + "learning_rate": 0.025792967801904567, + "loss": 0.7935, + "num_input_tokens_seen": 24933864, + "step": 42960 + }, + { + "epoch": 6.399314864462317, + "grad_norm": 0.0224609375, + "learning_rate": 0.02579161375455387, + "loss": 0.8229, + "num_input_tokens_seen": 24936648, + "step": 42965 + }, + { + "epoch": 6.400059577003277, + "grad_norm": 0.014404296875, + "learning_rate": 0.025790259524889822, + "loss": 0.8018, + "num_input_tokens_seen": 24939688, + "step": 42970 + }, + { + "epoch": 6.400804289544236, + "grad_norm": 0.022216796875, + "learning_rate": 0.02578890511293531, + "loss": 0.7941, + "num_input_tokens_seen": 24942376, + "step": 42975 + }, + { + "epoch": 6.401549002085195, + "grad_norm": 0.0194091796875, + "learning_rate": 0.025787550518713207, + "loss": 0.7962, + "num_input_tokens_seen": 24945000, + "step": 42980 + }, + { + "epoch": 6.402293714626154, + "grad_norm": 0.034912109375, + "learning_rate": 0.025786195742246405, + "loss": 0.8222, + "num_input_tokens_seen": 24947848, + "step": 42985 + }, + { + "epoch": 6.403038427167114, + "grad_norm": 0.0306396484375, + "learning_rate": 0.02578484078355779, + "loss": 0.7878, + "num_input_tokens_seen": 24950696, + "step": 42990 + }, + { + "epoch": 6.4037831397080724, + "grad_norm": 0.015869140625, + "learning_rate": 0.02578348564267025, + "loss": 0.7837, + "num_input_tokens_seen": 24953736, + "step": 42995 + }, + { + "epoch": 6.404527852249032, + "grad_norm": 0.01141357421875, + "learning_rate": 0.025782130319606678, + "loss": 0.7848, + "num_input_tokens_seen": 24956616, + "step": 43000 + }, + { + "epoch": 6.405272564789991, + "grad_norm": 0.0263671875, + "learning_rate": 0.02578077481438997, + "loss": 0.7971, + "num_input_tokens_seen": 24959656, + "step": 43005 + }, + { + "epoch": 6.4060172773309505, + "grad_norm": 0.0262451171875, + "learning_rate": 0.025779419127043037, + "loss": 0.7945, + "num_input_tokens_seen": 24962504, + "step": 43010 + }, + { + "epoch": 6.406761989871909, + "grad_norm": 0.0228271484375, + "learning_rate": 0.025778063257588767, + "loss": 0.8008, + "num_input_tokens_seen": 24965288, + "step": 43015 + }, + { + "epoch": 6.407506702412869, + "grad_norm": 0.010498046875, + "learning_rate": 0.025776707206050072, + "loss": 0.7808, + "num_input_tokens_seen": 24968168, + "step": 43020 + }, + { + "epoch": 6.408251414953828, + "grad_norm": 0.01531982421875, + "learning_rate": 0.025775350972449866, + "loss": 0.817, + "num_input_tokens_seen": 24970856, + "step": 43025 + }, + { + "epoch": 6.408996127494787, + "grad_norm": 0.02490234375, + "learning_rate": 0.025773994556811057, + "loss": 0.7884, + "num_input_tokens_seen": 24973736, + "step": 43030 + }, + { + "epoch": 6.409740840035746, + "grad_norm": 0.03271484375, + "learning_rate": 0.02577263795915656, + "loss": 0.8161, + "num_input_tokens_seen": 24976648, + "step": 43035 + }, + { + "epoch": 6.410485552576706, + "grad_norm": 0.0272216796875, + "learning_rate": 0.025771281179509293, + "loss": 0.8438, + "num_input_tokens_seen": 24979496, + "step": 43040 + }, + { + "epoch": 6.4112302651176645, + "grad_norm": 0.0198974609375, + "learning_rate": 0.025769924217892176, + "loss": 0.8094, + "num_input_tokens_seen": 24982248, + "step": 43045 + }, + { + "epoch": 6.411974977658624, + "grad_norm": 0.0211181640625, + "learning_rate": 0.025768567074328133, + "loss": 0.7957, + "num_input_tokens_seen": 24984936, + "step": 43050 + }, + { + "epoch": 6.412719690199583, + "grad_norm": 0.02001953125, + "learning_rate": 0.025767209748840104, + "loss": 0.7855, + "num_input_tokens_seen": 24988040, + "step": 43055 + }, + { + "epoch": 6.4134644027405425, + "grad_norm": 0.024169921875, + "learning_rate": 0.025765852241450997, + "loss": 0.8217, + "num_input_tokens_seen": 24990824, + "step": 43060 + }, + { + "epoch": 6.414209115281501, + "grad_norm": 0.013671875, + "learning_rate": 0.025764494552183763, + "loss": 0.8096, + "num_input_tokens_seen": 24993736, + "step": 43065 + }, + { + "epoch": 6.414953827822461, + "grad_norm": 0.0250244140625, + "learning_rate": 0.025763136681061335, + "loss": 0.8141, + "num_input_tokens_seen": 24996520, + "step": 43070 + }, + { + "epoch": 6.41569854036342, + "grad_norm": 0.0205078125, + "learning_rate": 0.025761778628106656, + "loss": 0.7796, + "num_input_tokens_seen": 24999240, + "step": 43075 + }, + { + "epoch": 6.416443252904379, + "grad_norm": 0.0289306640625, + "learning_rate": 0.02576042039334266, + "loss": 0.8117, + "num_input_tokens_seen": 25002216, + "step": 43080 + }, + { + "epoch": 6.417187965445338, + "grad_norm": 0.01904296875, + "learning_rate": 0.0257590619767923, + "loss": 0.8071, + "num_input_tokens_seen": 25004872, + "step": 43085 + }, + { + "epoch": 6.417932677986297, + "grad_norm": 0.0205078125, + "learning_rate": 0.025757703378478523, + "loss": 0.8, + "num_input_tokens_seen": 25008136, + "step": 43090 + }, + { + "epoch": 6.4186773905272565, + "grad_norm": 0.0191650390625, + "learning_rate": 0.025756344598424276, + "loss": 0.7914, + "num_input_tokens_seen": 25011048, + "step": 43095 + }, + { + "epoch": 6.419422103068215, + "grad_norm": 0.0181884765625, + "learning_rate": 0.025754985636652523, + "loss": 0.7896, + "num_input_tokens_seen": 25013896, + "step": 43100 + }, + { + "epoch": 6.420166815609175, + "grad_norm": 0.01422119140625, + "learning_rate": 0.025753626493186223, + "loss": 0.7801, + "num_input_tokens_seen": 25016648, + "step": 43105 + }, + { + "epoch": 6.420911528150134, + "grad_norm": 0.0257568359375, + "learning_rate": 0.025752267168048324, + "loss": 0.8215, + "num_input_tokens_seen": 25019880, + "step": 43110 + }, + { + "epoch": 6.421656240691093, + "grad_norm": 0.02978515625, + "learning_rate": 0.025750907661261806, + "loss": 0.8243, + "num_input_tokens_seen": 25022760, + "step": 43115 + }, + { + "epoch": 6.422400953232052, + "grad_norm": 0.0150146484375, + "learning_rate": 0.02574954797284963, + "loss": 0.8095, + "num_input_tokens_seen": 25025608, + "step": 43120 + }, + { + "epoch": 6.423145665773012, + "grad_norm": 0.032958984375, + "learning_rate": 0.025748188102834763, + "loss": 0.7996, + "num_input_tokens_seen": 25028648, + "step": 43125 + }, + { + "epoch": 6.4238903783139705, + "grad_norm": 0.0164794921875, + "learning_rate": 0.025746828051240185, + "loss": 0.7983, + "num_input_tokens_seen": 25031784, + "step": 43130 + }, + { + "epoch": 6.42463509085493, + "grad_norm": 0.033447265625, + "learning_rate": 0.02574546781808886, + "loss": 0.7977, + "num_input_tokens_seen": 25034664, + "step": 43135 + }, + { + "epoch": 6.425379803395889, + "grad_norm": 0.0223388671875, + "learning_rate": 0.025744107403403788, + "loss": 0.8037, + "num_input_tokens_seen": 25037576, + "step": 43140 + }, + { + "epoch": 6.4261245159368485, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02574274680720794, + "loss": 0.7838, + "num_input_tokens_seen": 25040296, + "step": 43145 + }, + { + "epoch": 6.426869228477807, + "grad_norm": 0.0196533203125, + "learning_rate": 0.0257413860295243, + "loss": 0.7976, + "num_input_tokens_seen": 25043016, + "step": 43150 + }, + { + "epoch": 6.427613941018767, + "grad_norm": 0.0186767578125, + "learning_rate": 0.025740025070375865, + "loss": 0.7942, + "num_input_tokens_seen": 25046184, + "step": 43155 + }, + { + "epoch": 6.428358653559726, + "grad_norm": 0.0205078125, + "learning_rate": 0.02573866392978562, + "loss": 0.7962, + "num_input_tokens_seen": 25049064, + "step": 43160 + }, + { + "epoch": 6.429103366100685, + "grad_norm": 0.01953125, + "learning_rate": 0.025737302607776555, + "loss": 0.8163, + "num_input_tokens_seen": 25051848, + "step": 43165 + }, + { + "epoch": 6.429848078641644, + "grad_norm": 0.0250244140625, + "learning_rate": 0.02573594110437168, + "loss": 0.8178, + "num_input_tokens_seen": 25054696, + "step": 43170 + }, + { + "epoch": 6.430592791182604, + "grad_norm": 0.0208740234375, + "learning_rate": 0.025734579419593995, + "loss": 0.8164, + "num_input_tokens_seen": 25057544, + "step": 43175 + }, + { + "epoch": 6.4313375037235625, + "grad_norm": 0.0257568359375, + "learning_rate": 0.025733217553466498, + "loss": 0.8021, + "num_input_tokens_seen": 25060648, + "step": 43180 + }, + { + "epoch": 6.432082216264522, + "grad_norm": 0.021240234375, + "learning_rate": 0.025731855506012202, + "loss": 0.8263, + "num_input_tokens_seen": 25063432, + "step": 43185 + }, + { + "epoch": 6.432826928805481, + "grad_norm": 0.0184326171875, + "learning_rate": 0.025730493277254112, + "loss": 0.7976, + "num_input_tokens_seen": 25066056, + "step": 43190 + }, + { + "epoch": 6.4335716413464406, + "grad_norm": 0.02001953125, + "learning_rate": 0.025729130867215243, + "loss": 0.7892, + "num_input_tokens_seen": 25069384, + "step": 43195 + }, + { + "epoch": 6.434316353887399, + "grad_norm": 0.021728515625, + "learning_rate": 0.025727768275918612, + "loss": 0.8266, + "num_input_tokens_seen": 25072168, + "step": 43200 + }, + { + "epoch": 6.435061066428359, + "grad_norm": 0.016845703125, + "learning_rate": 0.025726405503387246, + "loss": 0.7686, + "num_input_tokens_seen": 25075016, + "step": 43205 + }, + { + "epoch": 6.435805778969318, + "grad_norm": 0.01336669921875, + "learning_rate": 0.025725042549644157, + "loss": 0.7965, + "num_input_tokens_seen": 25077768, + "step": 43210 + }, + { + "epoch": 6.436550491510277, + "grad_norm": 0.0174560546875, + "learning_rate": 0.025723679414712375, + "loss": 0.814, + "num_input_tokens_seen": 25080360, + "step": 43215 + }, + { + "epoch": 6.437295204051236, + "grad_norm": 0.02294921875, + "learning_rate": 0.025722316098614924, + "loss": 0.8414, + "num_input_tokens_seen": 25083496, + "step": 43220 + }, + { + "epoch": 6.438039916592196, + "grad_norm": 0.0230712890625, + "learning_rate": 0.025720952601374844, + "loss": 0.8293, + "num_input_tokens_seen": 25086536, + "step": 43225 + }, + { + "epoch": 6.4387846291331545, + "grad_norm": 0.0269775390625, + "learning_rate": 0.025719588923015166, + "loss": 0.8057, + "num_input_tokens_seen": 25089448, + "step": 43230 + }, + { + "epoch": 6.439529341674114, + "grad_norm": 0.0120849609375, + "learning_rate": 0.025718225063558926, + "loss": 0.7921, + "num_input_tokens_seen": 25092296, + "step": 43235 + }, + { + "epoch": 6.440274054215073, + "grad_norm": 0.0181884765625, + "learning_rate": 0.025716861023029166, + "loss": 0.8044, + "num_input_tokens_seen": 25095176, + "step": 43240 + }, + { + "epoch": 6.441018766756033, + "grad_norm": 0.0291748046875, + "learning_rate": 0.025715496801448936, + "loss": 0.8142, + "num_input_tokens_seen": 25097992, + "step": 43245 + }, + { + "epoch": 6.441763479296991, + "grad_norm": 0.0289306640625, + "learning_rate": 0.025714132398841277, + "loss": 0.8152, + "num_input_tokens_seen": 25100776, + "step": 43250 + }, + { + "epoch": 6.44250819183795, + "grad_norm": 0.018310546875, + "learning_rate": 0.02571276781522924, + "loss": 0.8137, + "num_input_tokens_seen": 25103880, + "step": 43255 + }, + { + "epoch": 6.44325290437891, + "grad_norm": 0.018310546875, + "learning_rate": 0.025711403050635878, + "loss": 0.7932, + "num_input_tokens_seen": 25106760, + "step": 43260 + }, + { + "epoch": 6.443997616919869, + "grad_norm": 0.0174560546875, + "learning_rate": 0.025710038105084248, + "loss": 0.7974, + "num_input_tokens_seen": 25109672, + "step": 43265 + }, + { + "epoch": 6.444742329460828, + "grad_norm": 0.01123046875, + "learning_rate": 0.025708672978597406, + "loss": 0.7964, + "num_input_tokens_seen": 25112424, + "step": 43270 + }, + { + "epoch": 6.445487042001787, + "grad_norm": 0.0196533203125, + "learning_rate": 0.025707307671198425, + "loss": 0.8092, + "num_input_tokens_seen": 25115112, + "step": 43275 + }, + { + "epoch": 6.4462317545427466, + "grad_norm": 0.01220703125, + "learning_rate": 0.025705942182910356, + "loss": 0.8064, + "num_input_tokens_seen": 25118056, + "step": 43280 + }, + { + "epoch": 6.446976467083705, + "grad_norm": 0.020263671875, + "learning_rate": 0.02570457651375628, + "loss": 0.8049, + "num_input_tokens_seen": 25121064, + "step": 43285 + }, + { + "epoch": 6.447721179624665, + "grad_norm": 0.02783203125, + "learning_rate": 0.025703210663759263, + "loss": 0.8093, + "num_input_tokens_seen": 25124040, + "step": 43290 + }, + { + "epoch": 6.448465892165624, + "grad_norm": 0.0250244140625, + "learning_rate": 0.02570184463294238, + "loss": 0.8055, + "num_input_tokens_seen": 25126856, + "step": 43295 + }, + { + "epoch": 6.449210604706583, + "grad_norm": 0.0201416015625, + "learning_rate": 0.025700478421328707, + "loss": 0.7999, + "num_input_tokens_seen": 25129704, + "step": 43300 + }, + { + "epoch": 6.449955317247542, + "grad_norm": 0.0225830078125, + "learning_rate": 0.025699112028941328, + "loss": 0.8002, + "num_input_tokens_seen": 25132520, + "step": 43305 + }, + { + "epoch": 6.450700029788502, + "grad_norm": 0.0198974609375, + "learning_rate": 0.025697745455803327, + "loss": 0.7882, + "num_input_tokens_seen": 25135272, + "step": 43310 + }, + { + "epoch": 6.4514447423294605, + "grad_norm": 0.0272216796875, + "learning_rate": 0.02569637870193778, + "loss": 0.7997, + "num_input_tokens_seen": 25138440, + "step": 43315 + }, + { + "epoch": 6.45218945487042, + "grad_norm": 0.0179443359375, + "learning_rate": 0.025695011767367798, + "loss": 0.8191, + "num_input_tokens_seen": 25141576, + "step": 43320 + }, + { + "epoch": 6.452934167411379, + "grad_norm": 0.01409912109375, + "learning_rate": 0.025693644652116456, + "loss": 0.8081, + "num_input_tokens_seen": 25144264, + "step": 43325 + }, + { + "epoch": 6.453678879952339, + "grad_norm": 0.024658203125, + "learning_rate": 0.025692277356206855, + "loss": 0.799, + "num_input_tokens_seen": 25147400, + "step": 43330 + }, + { + "epoch": 6.454423592493297, + "grad_norm": 0.01177978515625, + "learning_rate": 0.0256909098796621, + "loss": 0.792, + "num_input_tokens_seen": 25150472, + "step": 43335 + }, + { + "epoch": 6.455168305034257, + "grad_norm": 0.0169677734375, + "learning_rate": 0.025689542222505288, + "loss": 0.7769, + "num_input_tokens_seen": 25153192, + "step": 43340 + }, + { + "epoch": 6.455913017575216, + "grad_norm": 0.0274658203125, + "learning_rate": 0.02568817438475952, + "loss": 0.8035, + "num_input_tokens_seen": 25156200, + "step": 43345 + }, + { + "epoch": 6.456657730116175, + "grad_norm": 0.021240234375, + "learning_rate": 0.025686806366447913, + "loss": 0.7837, + "num_input_tokens_seen": 25158984, + "step": 43350 + }, + { + "epoch": 6.457402442657134, + "grad_norm": 0.01031494140625, + "learning_rate": 0.02568543816759357, + "loss": 0.7882, + "num_input_tokens_seen": 25161640, + "step": 43355 + }, + { + "epoch": 6.458147155198094, + "grad_norm": 0.0164794921875, + "learning_rate": 0.02568406978821961, + "loss": 0.7791, + "num_input_tokens_seen": 25164360, + "step": 43360 + }, + { + "epoch": 6.4588918677390526, + "grad_norm": 0.01141357421875, + "learning_rate": 0.025682701228349154, + "loss": 0.8035, + "num_input_tokens_seen": 25167112, + "step": 43365 + }, + { + "epoch": 6.459636580280012, + "grad_norm": 0.0172119140625, + "learning_rate": 0.025681332488005318, + "loss": 0.8018, + "num_input_tokens_seen": 25169928, + "step": 43370 + }, + { + "epoch": 6.460381292820971, + "grad_norm": 0.0126953125, + "learning_rate": 0.025679963567211223, + "loss": 0.7796, + "num_input_tokens_seen": 25172712, + "step": 43375 + }, + { + "epoch": 6.461126005361931, + "grad_norm": 0.0166015625, + "learning_rate": 0.025678594465989996, + "loss": 0.7779, + "num_input_tokens_seen": 25175592, + "step": 43380 + }, + { + "epoch": 6.461870717902889, + "grad_norm": 0.031982421875, + "learning_rate": 0.02567722518436477, + "loss": 0.8013, + "num_input_tokens_seen": 25178376, + "step": 43385 + }, + { + "epoch": 6.462615430443849, + "grad_norm": 0.0224609375, + "learning_rate": 0.025675855722358677, + "loss": 0.778, + "num_input_tokens_seen": 25181064, + "step": 43390 + }, + { + "epoch": 6.463360142984808, + "grad_norm": 0.0274658203125, + "learning_rate": 0.025674486079994854, + "loss": 0.815, + "num_input_tokens_seen": 25184104, + "step": 43395 + }, + { + "epoch": 6.464104855525767, + "grad_norm": 0.012451171875, + "learning_rate": 0.025673116257296437, + "loss": 0.7917, + "num_input_tokens_seen": 25186920, + "step": 43400 + }, + { + "epoch": 6.464849568066726, + "grad_norm": 0.0125732421875, + "learning_rate": 0.02567174625428657, + "loss": 0.7978, + "num_input_tokens_seen": 25189928, + "step": 43405 + }, + { + "epoch": 6.465594280607686, + "grad_norm": 0.0123291015625, + "learning_rate": 0.02567037607098839, + "loss": 0.8138, + "num_input_tokens_seen": 25192872, + "step": 43410 + }, + { + "epoch": 6.466338993148645, + "grad_norm": 0.0302734375, + "learning_rate": 0.025669005707425054, + "loss": 0.8274, + "num_input_tokens_seen": 25195496, + "step": 43415 + }, + { + "epoch": 6.467083705689604, + "grad_norm": 0.0234375, + "learning_rate": 0.025667635163619713, + "loss": 0.7846, + "num_input_tokens_seen": 25198088, + "step": 43420 + }, + { + "epoch": 6.467828418230563, + "grad_norm": 0.02099609375, + "learning_rate": 0.025666264439595517, + "loss": 0.7917, + "num_input_tokens_seen": 25201224, + "step": 43425 + }, + { + "epoch": 6.468573130771523, + "grad_norm": 0.01141357421875, + "learning_rate": 0.02566489353537562, + "loss": 0.7979, + "num_input_tokens_seen": 25204488, + "step": 43430 + }, + { + "epoch": 6.469317843312481, + "grad_norm": 0.0159912109375, + "learning_rate": 0.025663522450983193, + "loss": 0.7848, + "num_input_tokens_seen": 25207752, + "step": 43435 + }, + { + "epoch": 6.47006255585344, + "grad_norm": 0.0184326171875, + "learning_rate": 0.025662151186441388, + "loss": 0.7878, + "num_input_tokens_seen": 25210664, + "step": 43440 + }, + { + "epoch": 6.4708072683944, + "grad_norm": 0.02099609375, + "learning_rate": 0.025660779741773374, + "loss": 0.8404, + "num_input_tokens_seen": 25213512, + "step": 43445 + }, + { + "epoch": 6.4715519809353586, + "grad_norm": 0.017578125, + "learning_rate": 0.025659408117002323, + "loss": 0.8172, + "num_input_tokens_seen": 25216456, + "step": 43450 + }, + { + "epoch": 6.472296693476318, + "grad_norm": 0.017333984375, + "learning_rate": 0.025658036312151408, + "loss": 0.8187, + "num_input_tokens_seen": 25219144, + "step": 43455 + }, + { + "epoch": 6.473041406017277, + "grad_norm": 0.0206298828125, + "learning_rate": 0.025656664327243797, + "loss": 0.8206, + "num_input_tokens_seen": 25222024, + "step": 43460 + }, + { + "epoch": 6.473786118558237, + "grad_norm": 0.017333984375, + "learning_rate": 0.025655292162302674, + "loss": 0.806, + "num_input_tokens_seen": 25224616, + "step": 43465 + }, + { + "epoch": 6.474530831099195, + "grad_norm": 0.0167236328125, + "learning_rate": 0.025653919817351226, + "loss": 0.7785, + "num_input_tokens_seen": 25227400, + "step": 43470 + }, + { + "epoch": 6.475275543640155, + "grad_norm": 0.0274658203125, + "learning_rate": 0.02565254729241262, + "loss": 0.8017, + "num_input_tokens_seen": 25230088, + "step": 43475 + }, + { + "epoch": 6.476020256181114, + "grad_norm": 0.011962890625, + "learning_rate": 0.02565117458751006, + "loss": 0.7848, + "num_input_tokens_seen": 25233032, + "step": 43480 + }, + { + "epoch": 6.476764968722073, + "grad_norm": 0.01806640625, + "learning_rate": 0.025649801702666734, + "loss": 0.7892, + "num_input_tokens_seen": 25235848, + "step": 43485 + }, + { + "epoch": 6.477509681263032, + "grad_norm": 0.0174560546875, + "learning_rate": 0.02564842863790583, + "loss": 0.7942, + "num_input_tokens_seen": 25239080, + "step": 43490 + }, + { + "epoch": 6.478254393803992, + "grad_norm": 0.0257568359375, + "learning_rate": 0.025647055393250545, + "loss": 0.8076, + "num_input_tokens_seen": 25242120, + "step": 43495 + }, + { + "epoch": 6.478999106344951, + "grad_norm": 0.02685546875, + "learning_rate": 0.02564568196872408, + "loss": 0.7852, + "num_input_tokens_seen": 25245032, + "step": 43500 + }, + { + "epoch": 6.47974381888591, + "grad_norm": 0.021484375, + "learning_rate": 0.025644308364349642, + "loss": 0.7911, + "num_input_tokens_seen": 25247688, + "step": 43505 + }, + { + "epoch": 6.480488531426869, + "grad_norm": 0.0230712890625, + "learning_rate": 0.025642934580150427, + "loss": 0.7923, + "num_input_tokens_seen": 25250728, + "step": 43510 + }, + { + "epoch": 6.481233243967829, + "grad_norm": 0.0250244140625, + "learning_rate": 0.025641560616149653, + "loss": 0.7986, + "num_input_tokens_seen": 25253544, + "step": 43515 + }, + { + "epoch": 6.481977956508787, + "grad_norm": 0.0322265625, + "learning_rate": 0.025640186472370527, + "loss": 0.8281, + "num_input_tokens_seen": 25256424, + "step": 43520 + }, + { + "epoch": 6.482722669049747, + "grad_norm": 0.01092529296875, + "learning_rate": 0.025638812148836263, + "loss": 0.8064, + "num_input_tokens_seen": 25259432, + "step": 43525 + }, + { + "epoch": 6.483467381590706, + "grad_norm": 0.0289306640625, + "learning_rate": 0.025637437645570083, + "loss": 0.807, + "num_input_tokens_seen": 25262152, + "step": 43530 + }, + { + "epoch": 6.484212094131665, + "grad_norm": 0.0186767578125, + "learning_rate": 0.025636062962595203, + "loss": 0.7716, + "num_input_tokens_seen": 25264936, + "step": 43535 + }, + { + "epoch": 6.484956806672624, + "grad_norm": 0.01373291015625, + "learning_rate": 0.02563468809993485, + "loss": 0.7905, + "num_input_tokens_seen": 25267656, + "step": 43540 + }, + { + "epoch": 6.485701519213584, + "grad_norm": 0.0257568359375, + "learning_rate": 0.025633313057612253, + "loss": 0.7989, + "num_input_tokens_seen": 25270536, + "step": 43545 + }, + { + "epoch": 6.486446231754543, + "grad_norm": 0.0260009765625, + "learning_rate": 0.02563193783565064, + "loss": 0.789, + "num_input_tokens_seen": 25273448, + "step": 43550 + }, + { + "epoch": 6.487190944295502, + "grad_norm": 0.02734375, + "learning_rate": 0.025630562434073238, + "loss": 0.8368, + "num_input_tokens_seen": 25276200, + "step": 43555 + }, + { + "epoch": 6.487935656836461, + "grad_norm": 0.0186767578125, + "learning_rate": 0.02562918685290329, + "loss": 0.8144, + "num_input_tokens_seen": 25279048, + "step": 43560 + }, + { + "epoch": 6.488680369377421, + "grad_norm": 0.025146484375, + "learning_rate": 0.025627811092164034, + "loss": 0.8167, + "num_input_tokens_seen": 25281832, + "step": 43565 + }, + { + "epoch": 6.489425081918379, + "grad_norm": 0.01806640625, + "learning_rate": 0.02562643515187871, + "loss": 0.8034, + "num_input_tokens_seen": 25284872, + "step": 43570 + }, + { + "epoch": 6.490169794459339, + "grad_norm": 0.017822265625, + "learning_rate": 0.025625059032070567, + "loss": 0.8124, + "num_input_tokens_seen": 25288072, + "step": 43575 + }, + { + "epoch": 6.490914507000298, + "grad_norm": 0.0196533203125, + "learning_rate": 0.025623682732762845, + "loss": 0.7895, + "num_input_tokens_seen": 25290984, + "step": 43580 + }, + { + "epoch": 6.4916592195412575, + "grad_norm": 0.0142822265625, + "learning_rate": 0.025622306253978804, + "loss": 0.7978, + "num_input_tokens_seen": 25293704, + "step": 43585 + }, + { + "epoch": 6.492403932082216, + "grad_norm": 0.02197265625, + "learning_rate": 0.0256209295957417, + "loss": 0.8107, + "num_input_tokens_seen": 25296904, + "step": 43590 + }, + { + "epoch": 6.493148644623176, + "grad_norm": 0.0203857421875, + "learning_rate": 0.025619552758074777, + "loss": 0.8126, + "num_input_tokens_seen": 25299816, + "step": 43595 + }, + { + "epoch": 6.493893357164135, + "grad_norm": 0.0174560546875, + "learning_rate": 0.025618175741001305, + "loss": 0.7803, + "num_input_tokens_seen": 25302504, + "step": 43600 + }, + { + "epoch": 6.494638069705093, + "grad_norm": 0.0299072265625, + "learning_rate": 0.02561679854454455, + "loss": 0.7987, + "num_input_tokens_seen": 25305384, + "step": 43605 + }, + { + "epoch": 6.495382782246053, + "grad_norm": 0.02001953125, + "learning_rate": 0.02561542116872777, + "loss": 0.7871, + "num_input_tokens_seen": 25308488, + "step": 43610 + }, + { + "epoch": 6.496127494787013, + "grad_norm": 0.0311279296875, + "learning_rate": 0.02561404361357424, + "loss": 0.8244, + "num_input_tokens_seen": 25311112, + "step": 43615 + }, + { + "epoch": 6.496872207327971, + "grad_norm": 0.01263427734375, + "learning_rate": 0.025612665879107228, + "loss": 0.7868, + "num_input_tokens_seen": 25313832, + "step": 43620 + }, + { + "epoch": 6.49761691986893, + "grad_norm": 0.0133056640625, + "learning_rate": 0.025611287965350015, + "loss": 0.7925, + "num_input_tokens_seen": 25316808, + "step": 43625 + }, + { + "epoch": 6.49836163240989, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02560990987232588, + "loss": 0.8078, + "num_input_tokens_seen": 25319752, + "step": 43630 + }, + { + "epoch": 6.499106344950849, + "grad_norm": 0.0184326171875, + "learning_rate": 0.025608531600058097, + "loss": 0.8144, + "num_input_tokens_seen": 25322632, + "step": 43635 + }, + { + "epoch": 6.499851057491808, + "grad_norm": 0.0257568359375, + "learning_rate": 0.025607153148569956, + "loss": 0.8051, + "num_input_tokens_seen": 25325448, + "step": 43640 + }, + { + "epoch": 6.500595770032767, + "grad_norm": 0.019775390625, + "learning_rate": 0.025605774517884736, + "loss": 0.7959, + "num_input_tokens_seen": 25328104, + "step": 43645 + }, + { + "epoch": 6.501340482573727, + "grad_norm": 0.0303955078125, + "learning_rate": 0.025604395708025744, + "loss": 0.8029, + "num_input_tokens_seen": 25331304, + "step": 43650 + }, + { + "epoch": 6.502085195114685, + "grad_norm": 0.0272216796875, + "learning_rate": 0.025603016719016262, + "loss": 0.7969, + "num_input_tokens_seen": 25334216, + "step": 43655 + }, + { + "epoch": 6.502829907655645, + "grad_norm": 0.013671875, + "learning_rate": 0.02560163755087959, + "loss": 0.8298, + "num_input_tokens_seen": 25337064, + "step": 43660 + }, + { + "epoch": 6.503574620196604, + "grad_norm": 0.0189208984375, + "learning_rate": 0.025600258203639023, + "loss": 0.8093, + "num_input_tokens_seen": 25340296, + "step": 43665 + }, + { + "epoch": 6.5043193327375635, + "grad_norm": 0.0257568359375, + "learning_rate": 0.02559887867731787, + "loss": 0.8088, + "num_input_tokens_seen": 25343496, + "step": 43670 + }, + { + "epoch": 6.505064045278522, + "grad_norm": 0.021240234375, + "learning_rate": 0.02559749897193943, + "loss": 0.8054, + "num_input_tokens_seen": 25346312, + "step": 43675 + }, + { + "epoch": 6.505808757819482, + "grad_norm": 0.0294189453125, + "learning_rate": 0.02559611908752702, + "loss": 0.7835, + "num_input_tokens_seen": 25349192, + "step": 43680 + }, + { + "epoch": 6.506553470360441, + "grad_norm": 0.0211181640625, + "learning_rate": 0.025594739024103946, + "loss": 0.8187, + "num_input_tokens_seen": 25352264, + "step": 43685 + }, + { + "epoch": 6.5072981829014, + "grad_norm": 0.0223388671875, + "learning_rate": 0.025593358781693526, + "loss": 0.7984, + "num_input_tokens_seen": 25355656, + "step": 43690 + }, + { + "epoch": 6.508042895442359, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02559197836031907, + "loss": 0.7945, + "num_input_tokens_seen": 25358440, + "step": 43695 + }, + { + "epoch": 6.508787607983319, + "grad_norm": 0.0198974609375, + "learning_rate": 0.025590597760003914, + "loss": 0.781, + "num_input_tokens_seen": 25361160, + "step": 43700 + }, + { + "epoch": 6.509532320524277, + "grad_norm": 0.0244140625, + "learning_rate": 0.02558921698077137, + "loss": 0.8085, + "num_input_tokens_seen": 25364200, + "step": 43705 + }, + { + "epoch": 6.510277033065237, + "grad_norm": 0.0419921875, + "learning_rate": 0.025587836022644764, + "loss": 0.8075, + "num_input_tokens_seen": 25367016, + "step": 43710 + }, + { + "epoch": 6.511021745606196, + "grad_norm": 0.01275634765625, + "learning_rate": 0.02558645488564743, + "loss": 0.8099, + "num_input_tokens_seen": 25369864, + "step": 43715 + }, + { + "epoch": 6.5117664581471555, + "grad_norm": 0.0205078125, + "learning_rate": 0.0255850735698027, + "loss": 0.7813, + "num_input_tokens_seen": 25372904, + "step": 43720 + }, + { + "epoch": 6.512511170688114, + "grad_norm": 0.020751953125, + "learning_rate": 0.025583692075133915, + "loss": 0.8051, + "num_input_tokens_seen": 25375784, + "step": 43725 + }, + { + "epoch": 6.513255883229074, + "grad_norm": 0.0257568359375, + "learning_rate": 0.0255823104016644, + "loss": 0.7904, + "num_input_tokens_seen": 25378664, + "step": 43730 + }, + { + "epoch": 6.514000595770033, + "grad_norm": 0.0233154296875, + "learning_rate": 0.025580928549417514, + "loss": 0.8201, + "num_input_tokens_seen": 25381448, + "step": 43735 + }, + { + "epoch": 6.514745308310992, + "grad_norm": 0.023681640625, + "learning_rate": 0.02557954651841659, + "loss": 0.8236, + "num_input_tokens_seen": 25384296, + "step": 43740 + }, + { + "epoch": 6.515490020851951, + "grad_norm": 0.0201416015625, + "learning_rate": 0.025578164308684984, + "loss": 0.8044, + "num_input_tokens_seen": 25387272, + "step": 43745 + }, + { + "epoch": 6.516234733392911, + "grad_norm": 0.02001953125, + "learning_rate": 0.02557678192024604, + "loss": 0.8059, + "num_input_tokens_seen": 25390696, + "step": 43750 + }, + { + "epoch": 6.5169794459338695, + "grad_norm": 0.024169921875, + "learning_rate": 0.025575399353123116, + "loss": 0.8, + "num_input_tokens_seen": 25393640, + "step": 43755 + }, + { + "epoch": 6.517724158474829, + "grad_norm": 0.020263671875, + "learning_rate": 0.02557401660733957, + "loss": 0.826, + "num_input_tokens_seen": 25396552, + "step": 43760 + }, + { + "epoch": 6.518468871015788, + "grad_norm": 0.0255126953125, + "learning_rate": 0.025572633682918757, + "loss": 0.7836, + "num_input_tokens_seen": 25399560, + "step": 43765 + }, + { + "epoch": 6.519213583556747, + "grad_norm": 0.0142822265625, + "learning_rate": 0.025571250579884047, + "loss": 0.806, + "num_input_tokens_seen": 25402568, + "step": 43770 + }, + { + "epoch": 6.519958296097706, + "grad_norm": 0.030517578125, + "learning_rate": 0.0255698672982588, + "loss": 0.8103, + "num_input_tokens_seen": 25405576, + "step": 43775 + }, + { + "epoch": 6.520703008638666, + "grad_norm": 0.01239013671875, + "learning_rate": 0.02556848383806639, + "loss": 0.8126, + "num_input_tokens_seen": 25408520, + "step": 43780 + }, + { + "epoch": 6.521447721179625, + "grad_norm": 0.0185546875, + "learning_rate": 0.025567100199330187, + "loss": 0.7819, + "num_input_tokens_seen": 25411176, + "step": 43785 + }, + { + "epoch": 6.522192433720583, + "grad_norm": 0.021240234375, + "learning_rate": 0.025565716382073563, + "loss": 0.8064, + "num_input_tokens_seen": 25413992, + "step": 43790 + }, + { + "epoch": 6.522937146261543, + "grad_norm": 0.0201416015625, + "learning_rate": 0.025564332386319902, + "loss": 0.7956, + "num_input_tokens_seen": 25417480, + "step": 43795 + }, + { + "epoch": 6.523681858802503, + "grad_norm": 0.0211181640625, + "learning_rate": 0.025562948212092578, + "loss": 0.7945, + "num_input_tokens_seen": 25420264, + "step": 43800 + }, + { + "epoch": 6.5244265713434615, + "grad_norm": 0.0191650390625, + "learning_rate": 0.025561563859414983, + "loss": 0.8041, + "num_input_tokens_seen": 25423464, + "step": 43805 + }, + { + "epoch": 6.52517128388442, + "grad_norm": 0.0272216796875, + "learning_rate": 0.025560179328310503, + "loss": 0.7867, + "num_input_tokens_seen": 25426120, + "step": 43810 + }, + { + "epoch": 6.52591599642538, + "grad_norm": 0.0174560546875, + "learning_rate": 0.025558794618802522, + "loss": 0.8009, + "num_input_tokens_seen": 25429192, + "step": 43815 + }, + { + "epoch": 6.526660708966339, + "grad_norm": 0.01153564453125, + "learning_rate": 0.02555740973091444, + "loss": 0.7901, + "num_input_tokens_seen": 25431944, + "step": 43820 + }, + { + "epoch": 6.527405421507298, + "grad_norm": 0.0303955078125, + "learning_rate": 0.025556024664669653, + "loss": 0.8089, + "num_input_tokens_seen": 25434728, + "step": 43825 + }, + { + "epoch": 6.528150134048257, + "grad_norm": 0.01300048828125, + "learning_rate": 0.025554639420091552, + "loss": 0.823, + "num_input_tokens_seen": 25437640, + "step": 43830 + }, + { + "epoch": 6.528894846589217, + "grad_norm": 0.019775390625, + "learning_rate": 0.025553253997203554, + "loss": 0.8096, + "num_input_tokens_seen": 25440520, + "step": 43835 + }, + { + "epoch": 6.5296395591301755, + "grad_norm": 0.0234375, + "learning_rate": 0.025551868396029045, + "loss": 0.8067, + "num_input_tokens_seen": 25443336, + "step": 43840 + }, + { + "epoch": 6.530384271671135, + "grad_norm": 0.021728515625, + "learning_rate": 0.025550482616591447, + "loss": 0.8125, + "num_input_tokens_seen": 25446376, + "step": 43845 + }, + { + "epoch": 6.531128984212094, + "grad_norm": 0.0181884765625, + "learning_rate": 0.025549096658914174, + "loss": 0.7977, + "num_input_tokens_seen": 25449224, + "step": 43850 + }, + { + "epoch": 6.5318736967530535, + "grad_norm": 0.0286865234375, + "learning_rate": 0.025547710523020628, + "loss": 0.7814, + "num_input_tokens_seen": 25452072, + "step": 43855 + }, + { + "epoch": 6.532618409294012, + "grad_norm": 0.018798828125, + "learning_rate": 0.025546324208934237, + "loss": 0.7836, + "num_input_tokens_seen": 25454792, + "step": 43860 + }, + { + "epoch": 6.533363121834972, + "grad_norm": 0.0206298828125, + "learning_rate": 0.025544937716678418, + "loss": 0.8103, + "num_input_tokens_seen": 25457736, + "step": 43865 + }, + { + "epoch": 6.534107834375931, + "grad_norm": 0.0196533203125, + "learning_rate": 0.025543551046276588, + "loss": 0.7906, + "num_input_tokens_seen": 25460584, + "step": 43870 + }, + { + "epoch": 6.53485254691689, + "grad_norm": 0.01904296875, + "learning_rate": 0.025542164197752185, + "loss": 0.7923, + "num_input_tokens_seen": 25463272, + "step": 43875 + }, + { + "epoch": 6.535597259457849, + "grad_norm": 0.017333984375, + "learning_rate": 0.02554077717112863, + "loss": 0.8019, + "num_input_tokens_seen": 25465960, + "step": 43880 + }, + { + "epoch": 6.536341971998809, + "grad_norm": 0.02294921875, + "learning_rate": 0.025539389966429356, + "loss": 0.7843, + "num_input_tokens_seen": 25468904, + "step": 43885 + }, + { + "epoch": 6.5370866845397675, + "grad_norm": 0.029052734375, + "learning_rate": 0.0255380025836778, + "loss": 0.807, + "num_input_tokens_seen": 25471848, + "step": 43890 + }, + { + "epoch": 6.537831397080727, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0255366150228974, + "loss": 0.8287, + "num_input_tokens_seen": 25474696, + "step": 43895 + }, + { + "epoch": 6.538576109621686, + "grad_norm": 0.016357421875, + "learning_rate": 0.025535227284111603, + "loss": 0.79, + "num_input_tokens_seen": 25477608, + "step": 43900 + }, + { + "epoch": 6.5393208221626455, + "grad_norm": 0.0208740234375, + "learning_rate": 0.02553383936734385, + "loss": 0.7845, + "num_input_tokens_seen": 25480808, + "step": 43905 + }, + { + "epoch": 6.540065534703604, + "grad_norm": 0.01348876953125, + "learning_rate": 0.025532451272617582, + "loss": 0.8072, + "num_input_tokens_seen": 25483784, + "step": 43910 + }, + { + "epoch": 6.540810247244564, + "grad_norm": 0.0198974609375, + "learning_rate": 0.025531062999956257, + "loss": 0.782, + "num_input_tokens_seen": 25486856, + "step": 43915 + }, + { + "epoch": 6.541554959785523, + "grad_norm": 0.010498046875, + "learning_rate": 0.025529674549383322, + "loss": 0.7967, + "num_input_tokens_seen": 25489544, + "step": 43920 + }, + { + "epoch": 6.542299672326482, + "grad_norm": 0.025146484375, + "learning_rate": 0.025528285920922242, + "loss": 0.7877, + "num_input_tokens_seen": 25492488, + "step": 43925 + }, + { + "epoch": 6.543044384867441, + "grad_norm": 0.022216796875, + "learning_rate": 0.025526897114596468, + "loss": 0.8088, + "num_input_tokens_seen": 25495400, + "step": 43930 + }, + { + "epoch": 6.5437890974084, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02552550813042947, + "loss": 0.8228, + "num_input_tokens_seen": 25498344, + "step": 43935 + }, + { + "epoch": 6.5445338099493595, + "grad_norm": 0.019775390625, + "learning_rate": 0.025524118968444708, + "loss": 0.79, + "num_input_tokens_seen": 25501128, + "step": 43940 + }, + { + "epoch": 6.545278522490319, + "grad_norm": 0.01422119140625, + "learning_rate": 0.02552272962866565, + "loss": 0.8173, + "num_input_tokens_seen": 25504168, + "step": 43945 + }, + { + "epoch": 6.546023235031278, + "grad_norm": 0.0218505859375, + "learning_rate": 0.025521340111115776, + "loss": 0.7863, + "num_input_tokens_seen": 25506952, + "step": 43950 + }, + { + "epoch": 6.546767947572237, + "grad_norm": 0.0308837890625, + "learning_rate": 0.025519950415818553, + "loss": 0.8176, + "num_input_tokens_seen": 25509768, + "step": 43955 + }, + { + "epoch": 6.547512660113196, + "grad_norm": 0.0189208984375, + "learning_rate": 0.025518560542797452, + "loss": 0.8199, + "num_input_tokens_seen": 25512936, + "step": 43960 + }, + { + "epoch": 6.548257372654156, + "grad_norm": 0.02099609375, + "learning_rate": 0.02551717049207597, + "loss": 0.8153, + "num_input_tokens_seen": 25515944, + "step": 43965 + }, + { + "epoch": 6.549002085195115, + "grad_norm": 0.034423828125, + "learning_rate": 0.025515780263677575, + "loss": 0.8142, + "num_input_tokens_seen": 25518888, + "step": 43970 + }, + { + "epoch": 6.5497467977360735, + "grad_norm": 0.0294189453125, + "learning_rate": 0.025514389857625764, + "loss": 0.8004, + "num_input_tokens_seen": 25521544, + "step": 43975 + }, + { + "epoch": 6.550491510277033, + "grad_norm": 0.0294189453125, + "learning_rate": 0.02551299927394402, + "loss": 0.7991, + "num_input_tokens_seen": 25524424, + "step": 43980 + }, + { + "epoch": 6.551236222817992, + "grad_norm": 0.0196533203125, + "learning_rate": 0.025511608512655843, + "loss": 0.8083, + "num_input_tokens_seen": 25527208, + "step": 43985 + }, + { + "epoch": 6.5519809353589515, + "grad_norm": 0.019775390625, + "learning_rate": 0.025510217573784723, + "loss": 0.8153, + "num_input_tokens_seen": 25530088, + "step": 43990 + }, + { + "epoch": 6.55272564789991, + "grad_norm": 0.023681640625, + "learning_rate": 0.025508826457354153, + "loss": 0.8148, + "num_input_tokens_seen": 25532744, + "step": 43995 + }, + { + "epoch": 6.55347036044087, + "grad_norm": 0.0213623046875, + "learning_rate": 0.025507435163387646, + "loss": 0.8058, + "num_input_tokens_seen": 25535496, + "step": 44000 + }, + { + "epoch": 6.554215072981829, + "grad_norm": 0.02587890625, + "learning_rate": 0.0255060436919087, + "loss": 0.8194, + "num_input_tokens_seen": 25538600, + "step": 44005 + }, + { + "epoch": 6.554959785522788, + "grad_norm": 0.038818359375, + "learning_rate": 0.025504652042940824, + "loss": 0.8256, + "num_input_tokens_seen": 25541384, + "step": 44010 + }, + { + "epoch": 6.555704498063747, + "grad_norm": 0.0206298828125, + "learning_rate": 0.025503260216507527, + "loss": 0.8006, + "num_input_tokens_seen": 25544168, + "step": 44015 + }, + { + "epoch": 6.556449210604707, + "grad_norm": 0.014892578125, + "learning_rate": 0.025501868212632322, + "loss": 0.8041, + "num_input_tokens_seen": 25547048, + "step": 44020 + }, + { + "epoch": 6.5571939231456655, + "grad_norm": 0.0213623046875, + "learning_rate": 0.025500476031338728, + "loss": 0.8072, + "num_input_tokens_seen": 25549768, + "step": 44025 + }, + { + "epoch": 6.557938635686625, + "grad_norm": 0.0322265625, + "learning_rate": 0.02549908367265027, + "loss": 0.8178, + "num_input_tokens_seen": 25553128, + "step": 44030 + }, + { + "epoch": 6.558683348227584, + "grad_norm": 0.034423828125, + "learning_rate": 0.02549769113659046, + "loss": 0.797, + "num_input_tokens_seen": 25555976, + "step": 44035 + }, + { + "epoch": 6.559428060768544, + "grad_norm": 0.0245361328125, + "learning_rate": 0.025496298423182826, + "loss": 0.7895, + "num_input_tokens_seen": 25558728, + "step": 44040 + }, + { + "epoch": 6.560172773309502, + "grad_norm": 0.0234375, + "learning_rate": 0.025494905532450898, + "loss": 0.8145, + "num_input_tokens_seen": 25561608, + "step": 44045 + }, + { + "epoch": 6.560917485850462, + "grad_norm": 0.030029296875, + "learning_rate": 0.02549351246441821, + "loss": 0.8214, + "num_input_tokens_seen": 25564616, + "step": 44050 + }, + { + "epoch": 6.561662198391421, + "grad_norm": 0.0296630859375, + "learning_rate": 0.025492119219108295, + "loss": 0.7984, + "num_input_tokens_seen": 25567592, + "step": 44055 + }, + { + "epoch": 6.56240691093238, + "grad_norm": 0.0196533203125, + "learning_rate": 0.025490725796544683, + "loss": 0.7929, + "num_input_tokens_seen": 25570376, + "step": 44060 + }, + { + "epoch": 6.563151623473339, + "grad_norm": 0.0247802734375, + "learning_rate": 0.025489332196750928, + "loss": 0.7838, + "num_input_tokens_seen": 25573352, + "step": 44065 + }, + { + "epoch": 6.563896336014299, + "grad_norm": 0.0296630859375, + "learning_rate": 0.025487938419750562, + "loss": 0.8107, + "num_input_tokens_seen": 25576232, + "step": 44070 + }, + { + "epoch": 6.5646410485552575, + "grad_norm": 0.027099609375, + "learning_rate": 0.02548654446556714, + "loss": 0.7882, + "num_input_tokens_seen": 25579016, + "step": 44075 + }, + { + "epoch": 6.565385761096217, + "grad_norm": 0.029052734375, + "learning_rate": 0.025485150334224202, + "loss": 0.7933, + "num_input_tokens_seen": 25581896, + "step": 44080 + }, + { + "epoch": 6.566130473637176, + "grad_norm": 0.024658203125, + "learning_rate": 0.025483756025745312, + "loss": 0.8115, + "num_input_tokens_seen": 25584616, + "step": 44085 + }, + { + "epoch": 6.566875186178136, + "grad_norm": 0.0208740234375, + "learning_rate": 0.02548236154015401, + "loss": 0.8241, + "num_input_tokens_seen": 25587688, + "step": 44090 + }, + { + "epoch": 6.567619898719094, + "grad_norm": 0.0216064453125, + "learning_rate": 0.025480966877473872, + "loss": 0.8102, + "num_input_tokens_seen": 25590760, + "step": 44095 + }, + { + "epoch": 6.568364611260054, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02547957203772845, + "loss": 0.7912, + "num_input_tokens_seen": 25593448, + "step": 44100 + }, + { + "epoch": 6.569109323801013, + "grad_norm": 0.014892578125, + "learning_rate": 0.025478177020941305, + "loss": 0.8138, + "num_input_tokens_seen": 25596264, + "step": 44105 + }, + { + "epoch": 6.569854036341972, + "grad_norm": 0.0198974609375, + "learning_rate": 0.025476781827136016, + "loss": 0.791, + "num_input_tokens_seen": 25599080, + "step": 44110 + }, + { + "epoch": 6.570598748882931, + "grad_norm": 0.02392578125, + "learning_rate": 0.02547538645633614, + "loss": 0.8058, + "num_input_tokens_seen": 25602088, + "step": 44115 + }, + { + "epoch": 6.57134346142389, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02547399090856526, + "loss": 0.7994, + "num_input_tokens_seen": 25604968, + "step": 44120 + }, + { + "epoch": 6.57208817396485, + "grad_norm": 0.01373291015625, + "learning_rate": 0.025472595183846945, + "loss": 0.8125, + "num_input_tokens_seen": 25607912, + "step": 44125 + }, + { + "epoch": 6.572832886505809, + "grad_norm": 0.0230712890625, + "learning_rate": 0.02547119928220478, + "loss": 0.7816, + "num_input_tokens_seen": 25611144, + "step": 44130 + }, + { + "epoch": 6.573577599046768, + "grad_norm": 0.0267333984375, + "learning_rate": 0.025469803203662347, + "loss": 0.8039, + "num_input_tokens_seen": 25613992, + "step": 44135 + }, + { + "epoch": 6.574322311587727, + "grad_norm": 0.0262451171875, + "learning_rate": 0.02546840694824323, + "loss": 0.7885, + "num_input_tokens_seen": 25616776, + "step": 44140 + }, + { + "epoch": 6.575067024128686, + "grad_norm": 0.0223388671875, + "learning_rate": 0.025467010515971014, + "loss": 0.7945, + "num_input_tokens_seen": 25619592, + "step": 44145 + }, + { + "epoch": 6.575811736669645, + "grad_norm": 0.026123046875, + "learning_rate": 0.025465613906869296, + "loss": 0.7865, + "num_input_tokens_seen": 25622440, + "step": 44150 + }, + { + "epoch": 6.576556449210605, + "grad_norm": 0.0146484375, + "learning_rate": 0.02546421712096167, + "loss": 0.8172, + "num_input_tokens_seen": 25625672, + "step": 44155 + }, + { + "epoch": 6.5773011617515635, + "grad_norm": 0.0150146484375, + "learning_rate": 0.025462820158271723, + "loss": 0.7939, + "num_input_tokens_seen": 25628616, + "step": 44160 + }, + { + "epoch": 6.578045874292523, + "grad_norm": 0.0205078125, + "learning_rate": 0.02546142301882307, + "loss": 0.8424, + "num_input_tokens_seen": 25631432, + "step": 44165 + }, + { + "epoch": 6.578790586833482, + "grad_norm": 0.02392578125, + "learning_rate": 0.025460025702639305, + "loss": 0.8023, + "num_input_tokens_seen": 25634728, + "step": 44170 + }, + { + "epoch": 6.579535299374442, + "grad_norm": 0.0167236328125, + "learning_rate": 0.02545862820974404, + "loss": 0.7983, + "num_input_tokens_seen": 25638024, + "step": 44175 + }, + { + "epoch": 6.5802800119154, + "grad_norm": 0.022705078125, + "learning_rate": 0.025457230540160882, + "loss": 0.7964, + "num_input_tokens_seen": 25641000, + "step": 44180 + }, + { + "epoch": 6.58102472445636, + "grad_norm": 0.0205078125, + "learning_rate": 0.025455832693913435, + "loss": 0.7935, + "num_input_tokens_seen": 25643848, + "step": 44185 + }, + { + "epoch": 6.581769436997319, + "grad_norm": 0.025634765625, + "learning_rate": 0.025454434671025327, + "loss": 0.7816, + "num_input_tokens_seen": 25646472, + "step": 44190 + }, + { + "epoch": 6.582514149538278, + "grad_norm": 0.017333984375, + "learning_rate": 0.025453036471520172, + "loss": 0.7994, + "num_input_tokens_seen": 25649384, + "step": 44195 + }, + { + "epoch": 6.583258862079237, + "grad_norm": 0.0255126953125, + "learning_rate": 0.02545163809542159, + "loss": 0.8024, + "num_input_tokens_seen": 25652520, + "step": 44200 + }, + { + "epoch": 6.584003574620197, + "grad_norm": 0.01361083984375, + "learning_rate": 0.0254502395427532, + "loss": 0.7967, + "num_input_tokens_seen": 25655560, + "step": 44205 + }, + { + "epoch": 6.584748287161156, + "grad_norm": 0.01153564453125, + "learning_rate": 0.025448840813538636, + "loss": 0.8056, + "num_input_tokens_seen": 25658632, + "step": 44210 + }, + { + "epoch": 6.585492999702115, + "grad_norm": 0.0201416015625, + "learning_rate": 0.025447441907801526, + "loss": 0.7882, + "num_input_tokens_seen": 25661864, + "step": 44215 + }, + { + "epoch": 6.586237712243074, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0254460428255655, + "loss": 0.7962, + "num_input_tokens_seen": 25665096, + "step": 44220 + }, + { + "epoch": 6.586982424784034, + "grad_norm": 0.0225830078125, + "learning_rate": 0.025444643566854202, + "loss": 0.806, + "num_input_tokens_seen": 25668008, + "step": 44225 + }, + { + "epoch": 6.587727137324992, + "grad_norm": 0.024658203125, + "learning_rate": 0.02544324413169127, + "loss": 0.8054, + "num_input_tokens_seen": 25670728, + "step": 44230 + }, + { + "epoch": 6.588471849865952, + "grad_norm": 0.0181884765625, + "learning_rate": 0.02544184452010034, + "loss": 0.8091, + "num_input_tokens_seen": 25673672, + "step": 44235 + }, + { + "epoch": 6.589216562406911, + "grad_norm": 0.0201416015625, + "learning_rate": 0.025440444732105057, + "loss": 0.7932, + "num_input_tokens_seen": 25676328, + "step": 44240 + }, + { + "epoch": 6.58996127494787, + "grad_norm": 0.03271484375, + "learning_rate": 0.025439044767729072, + "loss": 0.8162, + "num_input_tokens_seen": 25679144, + "step": 44245 + }, + { + "epoch": 6.590705987488829, + "grad_norm": 0.0299072265625, + "learning_rate": 0.025437644626996034, + "loss": 0.794, + "num_input_tokens_seen": 25681896, + "step": 44250 + }, + { + "epoch": 6.591450700029789, + "grad_norm": 0.0126953125, + "learning_rate": 0.0254362443099296, + "loss": 0.8334, + "num_input_tokens_seen": 25684584, + "step": 44255 + }, + { + "epoch": 6.592195412570748, + "grad_norm": 0.019775390625, + "learning_rate": 0.025434843816553424, + "loss": 0.8001, + "num_input_tokens_seen": 25687528, + "step": 44260 + }, + { + "epoch": 6.592940125111707, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02543344314689117, + "loss": 0.8011, + "num_input_tokens_seen": 25690536, + "step": 44265 + }, + { + "epoch": 6.593684837652666, + "grad_norm": 0.0167236328125, + "learning_rate": 0.0254320423009665, + "loss": 0.8015, + "num_input_tokens_seen": 25693544, + "step": 44270 + }, + { + "epoch": 6.594429550193626, + "grad_norm": 0.01373291015625, + "learning_rate": 0.025430641278803068, + "loss": 0.8115, + "num_input_tokens_seen": 25696104, + "step": 44275 + }, + { + "epoch": 6.595174262734584, + "grad_norm": 0.02197265625, + "learning_rate": 0.025429240080424565, + "loss": 0.8002, + "num_input_tokens_seen": 25699080, + "step": 44280 + }, + { + "epoch": 6.595918975275543, + "grad_norm": 0.018798828125, + "learning_rate": 0.025427838705854643, + "loss": 0.8016, + "num_input_tokens_seen": 25701896, + "step": 44285 + }, + { + "epoch": 6.596663687816503, + "grad_norm": 0.01953125, + "learning_rate": 0.025426437155116985, + "loss": 0.7862, + "num_input_tokens_seen": 25704968, + "step": 44290 + }, + { + "epoch": 6.5974084003574625, + "grad_norm": 0.0244140625, + "learning_rate": 0.02542503542823527, + "loss": 0.7965, + "num_input_tokens_seen": 25707912, + "step": 44295 + }, + { + "epoch": 6.598153112898421, + "grad_norm": 0.0211181640625, + "learning_rate": 0.025423633525233177, + "loss": 0.8215, + "num_input_tokens_seen": 25710824, + "step": 44300 + }, + { + "epoch": 6.59889782543938, + "grad_norm": 0.019287109375, + "learning_rate": 0.025422231446134386, + "loss": 0.7909, + "num_input_tokens_seen": 25714408, + "step": 44305 + }, + { + "epoch": 6.59964253798034, + "grad_norm": 0.021484375, + "learning_rate": 0.025420829190962595, + "loss": 0.8073, + "num_input_tokens_seen": 25717064, + "step": 44310 + }, + { + "epoch": 6.600387250521299, + "grad_norm": 0.023193359375, + "learning_rate": 0.025419426759741483, + "loss": 0.8243, + "num_input_tokens_seen": 25719816, + "step": 44315 + }, + { + "epoch": 6.601131963062258, + "grad_norm": 0.01495361328125, + "learning_rate": 0.02541802415249474, + "loss": 0.7957, + "num_input_tokens_seen": 25722760, + "step": 44320 + }, + { + "epoch": 6.601876675603217, + "grad_norm": 0.01416015625, + "learning_rate": 0.025416621369246075, + "loss": 0.8311, + "num_input_tokens_seen": 25725960, + "step": 44325 + }, + { + "epoch": 6.602621388144176, + "grad_norm": 0.031494140625, + "learning_rate": 0.025415218410019175, + "loss": 0.8018, + "num_input_tokens_seen": 25729096, + "step": 44330 + }, + { + "epoch": 6.603366100685135, + "grad_norm": 0.0125732421875, + "learning_rate": 0.025413815274837746, + "loss": 0.8166, + "num_input_tokens_seen": 25731816, + "step": 44335 + }, + { + "epoch": 6.604110813226095, + "grad_norm": 0.031005859375, + "learning_rate": 0.02541241196372549, + "loss": 0.8146, + "num_input_tokens_seen": 25734824, + "step": 44340 + }, + { + "epoch": 6.604855525767054, + "grad_norm": 0.02490234375, + "learning_rate": 0.02541100847670612, + "loss": 0.7833, + "num_input_tokens_seen": 25737704, + "step": 44345 + }, + { + "epoch": 6.605600238308013, + "grad_norm": 0.0238037109375, + "learning_rate": 0.025409604813803346, + "loss": 0.7994, + "num_input_tokens_seen": 25740488, + "step": 44350 + }, + { + "epoch": 6.606344950848972, + "grad_norm": 0.0201416015625, + "learning_rate": 0.025408200975040873, + "loss": 0.7931, + "num_input_tokens_seen": 25743144, + "step": 44355 + }, + { + "epoch": 6.607089663389932, + "grad_norm": 0.018310546875, + "learning_rate": 0.025406796960442426, + "loss": 0.7891, + "num_input_tokens_seen": 25746280, + "step": 44360 + }, + { + "epoch": 6.60783437593089, + "grad_norm": 0.0189208984375, + "learning_rate": 0.025405392770031718, + "loss": 0.7944, + "num_input_tokens_seen": 25749096, + "step": 44365 + }, + { + "epoch": 6.60857908847185, + "grad_norm": 0.021728515625, + "learning_rate": 0.025403988403832477, + "loss": 0.8132, + "num_input_tokens_seen": 25752328, + "step": 44370 + }, + { + "epoch": 6.609323801012809, + "grad_norm": 0.01300048828125, + "learning_rate": 0.025402583861868427, + "loss": 0.8073, + "num_input_tokens_seen": 25755112, + "step": 44375 + }, + { + "epoch": 6.6100685135537685, + "grad_norm": 0.01904296875, + "learning_rate": 0.02540117914416329, + "loss": 0.8074, + "num_input_tokens_seen": 25757832, + "step": 44380 + }, + { + "epoch": 6.610813226094727, + "grad_norm": 0.0284423828125, + "learning_rate": 0.025399774250740807, + "loss": 0.8336, + "num_input_tokens_seen": 25760904, + "step": 44385 + }, + { + "epoch": 6.611557938635687, + "grad_norm": 0.0185546875, + "learning_rate": 0.02539836918162471, + "loss": 0.8243, + "num_input_tokens_seen": 25763880, + "step": 44390 + }, + { + "epoch": 6.612302651176646, + "grad_norm": 0.026611328125, + "learning_rate": 0.02539696393683873, + "loss": 0.8087, + "num_input_tokens_seen": 25766920, + "step": 44395 + }, + { + "epoch": 6.613047363717605, + "grad_norm": 0.01129150390625, + "learning_rate": 0.025395558516406614, + "loss": 0.8106, + "num_input_tokens_seen": 25769800, + "step": 44400 + }, + { + "epoch": 6.613792076258564, + "grad_norm": 0.022705078125, + "learning_rate": 0.0253941529203521, + "loss": 0.8007, + "num_input_tokens_seen": 25772744, + "step": 44405 + }, + { + "epoch": 6.614536788799524, + "grad_norm": 0.0196533203125, + "learning_rate": 0.025392747148698935, + "loss": 0.7935, + "num_input_tokens_seen": 25775688, + "step": 44410 + }, + { + "epoch": 6.615281501340482, + "grad_norm": 0.02001953125, + "learning_rate": 0.025391341201470876, + "loss": 0.7986, + "num_input_tokens_seen": 25778312, + "step": 44415 + }, + { + "epoch": 6.616026213881442, + "grad_norm": 0.0198974609375, + "learning_rate": 0.025389935078691663, + "loss": 0.8045, + "num_input_tokens_seen": 25781000, + "step": 44420 + }, + { + "epoch": 6.616770926422401, + "grad_norm": 0.037841796875, + "learning_rate": 0.02538852878038506, + "loss": 0.8197, + "num_input_tokens_seen": 25783816, + "step": 44425 + }, + { + "epoch": 6.6175156389633605, + "grad_norm": 0.02001953125, + "learning_rate": 0.025387122306574823, + "loss": 0.8129, + "num_input_tokens_seen": 25786760, + "step": 44430 + }, + { + "epoch": 6.618260351504319, + "grad_norm": 0.0177001953125, + "learning_rate": 0.02538571565728471, + "loss": 0.8099, + "num_input_tokens_seen": 25789512, + "step": 44435 + }, + { + "epoch": 6.619005064045279, + "grad_norm": 0.0233154296875, + "learning_rate": 0.025384308832538484, + "loss": 0.8036, + "num_input_tokens_seen": 25792200, + "step": 44440 + }, + { + "epoch": 6.619749776586238, + "grad_norm": 0.0220947265625, + "learning_rate": 0.025382901832359916, + "loss": 0.7988, + "num_input_tokens_seen": 25795048, + "step": 44445 + }, + { + "epoch": 6.620494489127196, + "grad_norm": 0.0201416015625, + "learning_rate": 0.025381494656772773, + "loss": 0.8153, + "num_input_tokens_seen": 25798312, + "step": 44450 + }, + { + "epoch": 6.621239201668156, + "grad_norm": 0.0272216796875, + "learning_rate": 0.02538008730580083, + "loss": 0.8061, + "num_input_tokens_seen": 25801096, + "step": 44455 + }, + { + "epoch": 6.621983914209116, + "grad_norm": 0.021728515625, + "learning_rate": 0.025378679779467863, + "loss": 0.816, + "num_input_tokens_seen": 25803912, + "step": 44460 + }, + { + "epoch": 6.6227286267500745, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02537727207779765, + "loss": 0.8019, + "num_input_tokens_seen": 25807016, + "step": 44465 + }, + { + "epoch": 6.623473339291033, + "grad_norm": 0.01904296875, + "learning_rate": 0.02537586420081397, + "loss": 0.7835, + "num_input_tokens_seen": 25809864, + "step": 44470 + }, + { + "epoch": 6.624218051831993, + "grad_norm": 0.0247802734375, + "learning_rate": 0.025374456148540614, + "loss": 0.7882, + "num_input_tokens_seen": 25812552, + "step": 44475 + }, + { + "epoch": 6.6249627643729525, + "grad_norm": 0.0205078125, + "learning_rate": 0.02537304792100136, + "loss": 0.8067, + "num_input_tokens_seen": 25815432, + "step": 44480 + }, + { + "epoch": 6.625707476913911, + "grad_norm": 0.01806640625, + "learning_rate": 0.02537163951822001, + "loss": 0.8024, + "num_input_tokens_seen": 25818376, + "step": 44485 + }, + { + "epoch": 6.62645218945487, + "grad_norm": 0.031982421875, + "learning_rate": 0.025370230940220343, + "loss": 0.7895, + "num_input_tokens_seen": 25821160, + "step": 44490 + }, + { + "epoch": 6.62719690199583, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02536882218702617, + "loss": 0.8015, + "num_input_tokens_seen": 25823976, + "step": 44495 + }, + { + "epoch": 6.627941614536788, + "grad_norm": 0.0230712890625, + "learning_rate": 0.025367413258661285, + "loss": 0.8023, + "num_input_tokens_seen": 25827176, + "step": 44500 + }, + { + "epoch": 6.628686327077748, + "grad_norm": 0.019775390625, + "learning_rate": 0.025366004155149488, + "loss": 0.805, + "num_input_tokens_seen": 25830248, + "step": 44505 + }, + { + "epoch": 6.629431039618707, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02536459487651459, + "loss": 0.807, + "num_input_tokens_seen": 25833288, + "step": 44510 + }, + { + "epoch": 6.6301757521596665, + "grad_norm": 0.0264892578125, + "learning_rate": 0.025363185422780394, + "loss": 0.8084, + "num_input_tokens_seen": 25836296, + "step": 44515 + }, + { + "epoch": 6.630920464700625, + "grad_norm": 0.03515625, + "learning_rate": 0.025361775793970708, + "loss": 0.7925, + "num_input_tokens_seen": 25839112, + "step": 44520 + }, + { + "epoch": 6.631665177241585, + "grad_norm": 0.020263671875, + "learning_rate": 0.025360365990109355, + "loss": 0.7857, + "num_input_tokens_seen": 25841992, + "step": 44525 + }, + { + "epoch": 6.632409889782544, + "grad_norm": 0.022216796875, + "learning_rate": 0.025358956011220152, + "loss": 0.8183, + "num_input_tokens_seen": 25844872, + "step": 44530 + }, + { + "epoch": 6.633154602323503, + "grad_norm": 0.0244140625, + "learning_rate": 0.02535754585732691, + "loss": 0.81, + "num_input_tokens_seen": 25847752, + "step": 44535 + }, + { + "epoch": 6.633899314864462, + "grad_norm": 0.013671875, + "learning_rate": 0.025356135528453456, + "loss": 0.8003, + "num_input_tokens_seen": 25850664, + "step": 44540 + }, + { + "epoch": 6.634644027405422, + "grad_norm": 0.0296630859375, + "learning_rate": 0.02535472502462362, + "loss": 0.7964, + "num_input_tokens_seen": 25853928, + "step": 44545 + }, + { + "epoch": 6.6353887399463805, + "grad_norm": 0.01416015625, + "learning_rate": 0.025353314345861234, + "loss": 0.8142, + "num_input_tokens_seen": 25856840, + "step": 44550 + }, + { + "epoch": 6.63613345248734, + "grad_norm": 0.021240234375, + "learning_rate": 0.02535190349219012, + "loss": 0.8123, + "num_input_tokens_seen": 25859752, + "step": 44555 + }, + { + "epoch": 6.636878165028299, + "grad_norm": 0.01275634765625, + "learning_rate": 0.02535049246363412, + "loss": 0.816, + "num_input_tokens_seen": 25862888, + "step": 44560 + }, + { + "epoch": 6.6376228775692585, + "grad_norm": 0.01953125, + "learning_rate": 0.025349081260217066, + "loss": 0.7901, + "num_input_tokens_seen": 25865640, + "step": 44565 + }, + { + "epoch": 6.638367590110217, + "grad_norm": 0.0242919921875, + "learning_rate": 0.025347669881962805, + "loss": 0.7988, + "num_input_tokens_seen": 25868424, + "step": 44570 + }, + { + "epoch": 6.639112302651177, + "grad_norm": 0.01904296875, + "learning_rate": 0.025346258328895182, + "loss": 0.7977, + "num_input_tokens_seen": 25871368, + "step": 44575 + }, + { + "epoch": 6.639857015192136, + "grad_norm": 0.0206298828125, + "learning_rate": 0.025344846601038033, + "loss": 0.7889, + "num_input_tokens_seen": 25874088, + "step": 44580 + }, + { + "epoch": 6.640601727733095, + "grad_norm": 0.0296630859375, + "learning_rate": 0.025343434698415216, + "loss": 0.7794, + "num_input_tokens_seen": 25877000, + "step": 44585 + }, + { + "epoch": 6.641346440274054, + "grad_norm": 0.031494140625, + "learning_rate": 0.025342022621050585, + "loss": 0.8133, + "num_input_tokens_seen": 25879912, + "step": 44590 + }, + { + "epoch": 6.642091152815014, + "grad_norm": 0.0234375, + "learning_rate": 0.025340610368967995, + "loss": 0.7915, + "num_input_tokens_seen": 25883176, + "step": 44595 + }, + { + "epoch": 6.6428358653559725, + "grad_norm": 0.0203857421875, + "learning_rate": 0.025339197942191298, + "loss": 0.7972, + "num_input_tokens_seen": 25886216, + "step": 44600 + }, + { + "epoch": 6.643580577896932, + "grad_norm": 0.02880859375, + "learning_rate": 0.02533778534074436, + "loss": 0.7978, + "num_input_tokens_seen": 25888904, + "step": 44605 + }, + { + "epoch": 6.644325290437891, + "grad_norm": 0.017333984375, + "learning_rate": 0.02533637256465105, + "loss": 0.7991, + "num_input_tokens_seen": 25891816, + "step": 44610 + }, + { + "epoch": 6.6450700029788505, + "grad_norm": 0.03466796875, + "learning_rate": 0.025334959613935226, + "loss": 0.8176, + "num_input_tokens_seen": 25894984, + "step": 44615 + }, + { + "epoch": 6.645814715519809, + "grad_norm": 0.0211181640625, + "learning_rate": 0.025333546488620764, + "loss": 0.8024, + "num_input_tokens_seen": 25897832, + "step": 44620 + }, + { + "epoch": 6.646559428060769, + "grad_norm": 0.0205078125, + "learning_rate": 0.025332133188731537, + "loss": 0.8304, + "num_input_tokens_seen": 25900712, + "step": 44625 + }, + { + "epoch": 6.647304140601728, + "grad_norm": 0.0185546875, + "learning_rate": 0.025330719714291423, + "loss": 0.8239, + "num_input_tokens_seen": 25903432, + "step": 44630 + }, + { + "epoch": 6.6480488531426865, + "grad_norm": 0.0208740234375, + "learning_rate": 0.025329306065324297, + "loss": 0.8244, + "num_input_tokens_seen": 25906216, + "step": 44635 + }, + { + "epoch": 6.648793565683646, + "grad_norm": 0.0303955078125, + "learning_rate": 0.02532789224185405, + "loss": 0.8162, + "num_input_tokens_seen": 25908872, + "step": 44640 + }, + { + "epoch": 6.649538278224606, + "grad_norm": 0.025146484375, + "learning_rate": 0.025326478243904552, + "loss": 0.7986, + "num_input_tokens_seen": 25912008, + "step": 44645 + }, + { + "epoch": 6.6502829907655645, + "grad_norm": 0.03173828125, + "learning_rate": 0.0253250640714997, + "loss": 0.7898, + "num_input_tokens_seen": 25914856, + "step": 44650 + }, + { + "epoch": 6.651027703306523, + "grad_norm": 0.0196533203125, + "learning_rate": 0.025323649724663385, + "loss": 0.7992, + "num_input_tokens_seen": 25918056, + "step": 44655 + }, + { + "epoch": 6.651772415847483, + "grad_norm": 0.021484375, + "learning_rate": 0.025322235203419502, + "loss": 0.8031, + "num_input_tokens_seen": 25920936, + "step": 44660 + }, + { + "epoch": 6.652517128388443, + "grad_norm": 0.0208740234375, + "learning_rate": 0.025320820507791948, + "loss": 0.8091, + "num_input_tokens_seen": 25923816, + "step": 44665 + }, + { + "epoch": 6.653261840929401, + "grad_norm": 0.0294189453125, + "learning_rate": 0.025319405637804618, + "loss": 0.8054, + "num_input_tokens_seen": 25926696, + "step": 44670 + }, + { + "epoch": 6.65400655347036, + "grad_norm": 0.019775390625, + "learning_rate": 0.025317990593481422, + "loss": 0.808, + "num_input_tokens_seen": 25929736, + "step": 44675 + }, + { + "epoch": 6.65475126601132, + "grad_norm": 0.02880859375, + "learning_rate": 0.025316575374846256, + "loss": 0.8037, + "num_input_tokens_seen": 25932744, + "step": 44680 + }, + { + "epoch": 6.6554959785522785, + "grad_norm": 0.02001953125, + "learning_rate": 0.025315159981923036, + "loss": 0.8051, + "num_input_tokens_seen": 25935752, + "step": 44685 + }, + { + "epoch": 6.656240691093238, + "grad_norm": 0.0184326171875, + "learning_rate": 0.025313744414735675, + "loss": 0.7985, + "num_input_tokens_seen": 25938440, + "step": 44690 + }, + { + "epoch": 6.656985403634197, + "grad_norm": 0.023193359375, + "learning_rate": 0.025312328673308085, + "loss": 0.8044, + "num_input_tokens_seen": 25941448, + "step": 44695 + }, + { + "epoch": 6.6577301161751565, + "grad_norm": 0.02685546875, + "learning_rate": 0.02531091275766418, + "loss": 0.823, + "num_input_tokens_seen": 25944136, + "step": 44700 + }, + { + "epoch": 6.658474828716115, + "grad_norm": 0.0107421875, + "learning_rate": 0.02530949666782788, + "loss": 0.7818, + "num_input_tokens_seen": 25946984, + "step": 44705 + }, + { + "epoch": 6.659219541257075, + "grad_norm": 0.0225830078125, + "learning_rate": 0.025308080403823117, + "loss": 0.8009, + "num_input_tokens_seen": 25950280, + "step": 44710 + }, + { + "epoch": 6.659964253798034, + "grad_norm": 0.024169921875, + "learning_rate": 0.025306663965673815, + "loss": 0.8193, + "num_input_tokens_seen": 25953288, + "step": 44715 + }, + { + "epoch": 6.660708966338993, + "grad_norm": 0.01513671875, + "learning_rate": 0.025305247353403896, + "loss": 0.8207, + "num_input_tokens_seen": 25956072, + "step": 44720 + }, + { + "epoch": 6.661453678879952, + "grad_norm": 0.031005859375, + "learning_rate": 0.025303830567037295, + "loss": 0.7878, + "num_input_tokens_seen": 25958792, + "step": 44725 + }, + { + "epoch": 6.662198391420912, + "grad_norm": 0.022216796875, + "learning_rate": 0.025302413606597955, + "loss": 0.8131, + "num_input_tokens_seen": 25961800, + "step": 44730 + }, + { + "epoch": 6.6629431039618705, + "grad_norm": 0.0115966796875, + "learning_rate": 0.025300996472109805, + "loss": 0.8172, + "num_input_tokens_seen": 25964744, + "step": 44735 + }, + { + "epoch": 6.66368781650283, + "grad_norm": 0.01397705078125, + "learning_rate": 0.025299579163596785, + "loss": 0.787, + "num_input_tokens_seen": 25967496, + "step": 44740 + }, + { + "epoch": 6.664432529043789, + "grad_norm": 0.0120849609375, + "learning_rate": 0.025298161681082846, + "loss": 0.7998, + "num_input_tokens_seen": 25970728, + "step": 44745 + }, + { + "epoch": 6.665177241584749, + "grad_norm": 0.028076171875, + "learning_rate": 0.025296744024591936, + "loss": 0.8025, + "num_input_tokens_seen": 25973608, + "step": 44750 + }, + { + "epoch": 6.665921954125707, + "grad_norm": 0.0286865234375, + "learning_rate": 0.025295326194147996, + "loss": 0.7916, + "num_input_tokens_seen": 25976616, + "step": 44755 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 0.024658203125, + "learning_rate": 0.025293908189774984, + "loss": 0.8102, + "num_input_tokens_seen": 25979656, + "step": 44760 + }, + { + "epoch": 6.667411379207626, + "grad_norm": 0.0240478515625, + "learning_rate": 0.02529249001149686, + "loss": 0.8122, + "num_input_tokens_seen": 25982504, + "step": 44765 + }, + { + "epoch": 6.668156091748585, + "grad_norm": 0.02197265625, + "learning_rate": 0.025291071659337576, + "loss": 0.7935, + "num_input_tokens_seen": 25986152, + "step": 44770 + }, + { + "epoch": 6.668900804289544, + "grad_norm": 0.02783203125, + "learning_rate": 0.025289653133321092, + "loss": 0.7871, + "num_input_tokens_seen": 25989672, + "step": 44775 + }, + { + "epoch": 6.669645516830504, + "grad_norm": 0.0308837890625, + "learning_rate": 0.025288234433471377, + "loss": 0.8049, + "num_input_tokens_seen": 25992584, + "step": 44780 + }, + { + "epoch": 6.6703902293714625, + "grad_norm": 0.013427734375, + "learning_rate": 0.0252868155598124, + "loss": 0.7884, + "num_input_tokens_seen": 25995624, + "step": 44785 + }, + { + "epoch": 6.671134941912422, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02528539651236813, + "loss": 0.8262, + "num_input_tokens_seen": 25998536, + "step": 44790 + }, + { + "epoch": 6.671879654453381, + "grad_norm": 0.01312255859375, + "learning_rate": 0.025283977291162534, + "loss": 0.8012, + "num_input_tokens_seen": 26001288, + "step": 44795 + }, + { + "epoch": 6.67262436699434, + "grad_norm": 0.01373291015625, + "learning_rate": 0.0252825578962196, + "loss": 0.8138, + "num_input_tokens_seen": 26004232, + "step": 44800 + }, + { + "epoch": 6.673369079535299, + "grad_norm": 0.018310546875, + "learning_rate": 0.0252811383275633, + "loss": 0.8109, + "num_input_tokens_seen": 26007272, + "step": 44805 + }, + { + "epoch": 6.674113792076259, + "grad_norm": 0.01171875, + "learning_rate": 0.025279718585217617, + "loss": 0.7948, + "num_input_tokens_seen": 26010024, + "step": 44810 + }, + { + "epoch": 6.674858504617218, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02527829866920653, + "loss": 0.8038, + "num_input_tokens_seen": 26012904, + "step": 44815 + }, + { + "epoch": 6.6756032171581765, + "grad_norm": 0.01220703125, + "learning_rate": 0.025276878579554043, + "loss": 0.7948, + "num_input_tokens_seen": 26015624, + "step": 44820 + }, + { + "epoch": 6.676347929699136, + "grad_norm": 0.01251220703125, + "learning_rate": 0.025275458316284135, + "loss": 0.8157, + "num_input_tokens_seen": 26018600, + "step": 44825 + }, + { + "epoch": 6.677092642240096, + "grad_norm": 0.018310546875, + "learning_rate": 0.0252740378794208, + "loss": 0.8242, + "num_input_tokens_seen": 26021576, + "step": 44830 + }, + { + "epoch": 6.677837354781055, + "grad_norm": 0.0206298828125, + "learning_rate": 0.025272617268988037, + "loss": 0.7978, + "num_input_tokens_seen": 26024456, + "step": 44835 + }, + { + "epoch": 6.678582067322013, + "grad_norm": 0.013671875, + "learning_rate": 0.025271196485009847, + "loss": 0.7979, + "num_input_tokens_seen": 26027400, + "step": 44840 + }, + { + "epoch": 6.679326779862973, + "grad_norm": 0.0174560546875, + "learning_rate": 0.02526977552751023, + "loss": 0.8176, + "num_input_tokens_seen": 26030056, + "step": 44845 + }, + { + "epoch": 6.680071492403932, + "grad_norm": 0.0228271484375, + "learning_rate": 0.025268354396513197, + "loss": 0.7917, + "num_input_tokens_seen": 26033224, + "step": 44850 + }, + { + "epoch": 6.680816204944891, + "grad_norm": 0.0177001953125, + "learning_rate": 0.025266933092042747, + "loss": 0.7876, + "num_input_tokens_seen": 26036136, + "step": 44855 + }, + { + "epoch": 6.68156091748585, + "grad_norm": 0.02587890625, + "learning_rate": 0.0252655116141229, + "loss": 0.8055, + "num_input_tokens_seen": 26039368, + "step": 44860 + }, + { + "epoch": 6.68230563002681, + "grad_norm": 0.019287109375, + "learning_rate": 0.02526408996277767, + "loss": 0.7837, + "num_input_tokens_seen": 26042600, + "step": 44865 + }, + { + "epoch": 6.6830503425677685, + "grad_norm": 0.0201416015625, + "learning_rate": 0.025262668138031068, + "loss": 0.8172, + "num_input_tokens_seen": 26045416, + "step": 44870 + }, + { + "epoch": 6.683795055108728, + "grad_norm": 0.01446533203125, + "learning_rate": 0.02526124613990712, + "loss": 0.7885, + "num_input_tokens_seen": 26048296, + "step": 44875 + }, + { + "epoch": 6.684539767649687, + "grad_norm": 0.025634765625, + "learning_rate": 0.025259823968429847, + "loss": 0.7781, + "num_input_tokens_seen": 26051048, + "step": 44880 + }, + { + "epoch": 6.685284480190647, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02525840162362328, + "loss": 0.7939, + "num_input_tokens_seen": 26054120, + "step": 44885 + }, + { + "epoch": 6.686029192731605, + "grad_norm": 0.0283203125, + "learning_rate": 0.025256979105511435, + "loss": 0.8013, + "num_input_tokens_seen": 26057128, + "step": 44890 + }, + { + "epoch": 6.686773905272565, + "grad_norm": 0.019775390625, + "learning_rate": 0.025255556414118363, + "loss": 0.8036, + "num_input_tokens_seen": 26059880, + "step": 44895 + }, + { + "epoch": 6.687518617813524, + "grad_norm": 0.0157470703125, + "learning_rate": 0.02525413354946808, + "loss": 0.8095, + "num_input_tokens_seen": 26062920, + "step": 44900 + }, + { + "epoch": 6.688263330354483, + "grad_norm": 0.028564453125, + "learning_rate": 0.025252710511584637, + "loss": 0.813, + "num_input_tokens_seen": 26065544, + "step": 44905 + }, + { + "epoch": 6.689008042895442, + "grad_norm": 0.0299072265625, + "learning_rate": 0.02525128730049207, + "loss": 0.8137, + "num_input_tokens_seen": 26068168, + "step": 44910 + }, + { + "epoch": 6.689752755436402, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02524986391621442, + "loss": 0.823, + "num_input_tokens_seen": 26071048, + "step": 44915 + }, + { + "epoch": 6.690497467977361, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02524844035877574, + "loss": 0.8073, + "num_input_tokens_seen": 26074344, + "step": 44920 + }, + { + "epoch": 6.69124218051832, + "grad_norm": 0.0191650390625, + "learning_rate": 0.025247016628200076, + "loss": 0.8124, + "num_input_tokens_seen": 26077480, + "step": 44925 + }, + { + "epoch": 6.691986893059279, + "grad_norm": 0.02490234375, + "learning_rate": 0.025245592724511483, + "loss": 0.7958, + "num_input_tokens_seen": 26080552, + "step": 44930 + }, + { + "epoch": 6.692731605600239, + "grad_norm": 0.018310546875, + "learning_rate": 0.025244168647734006, + "loss": 0.7913, + "num_input_tokens_seen": 26083528, + "step": 44935 + }, + { + "epoch": 6.693476318141197, + "grad_norm": 0.0255126953125, + "learning_rate": 0.025242744397891717, + "loss": 0.7885, + "num_input_tokens_seen": 26086568, + "step": 44940 + }, + { + "epoch": 6.694221030682157, + "grad_norm": 0.0240478515625, + "learning_rate": 0.025241319975008673, + "loss": 0.7913, + "num_input_tokens_seen": 26089480, + "step": 44945 + }, + { + "epoch": 6.694965743223116, + "grad_norm": 0.019775390625, + "learning_rate": 0.025239895379108934, + "loss": 0.7889, + "num_input_tokens_seen": 26092136, + "step": 44950 + }, + { + "epoch": 6.695710455764075, + "grad_norm": 0.0201416015625, + "learning_rate": 0.025238470610216574, + "loss": 0.7865, + "num_input_tokens_seen": 26095144, + "step": 44955 + }, + { + "epoch": 6.696455168305034, + "grad_norm": 0.01507568359375, + "learning_rate": 0.025237045668355655, + "loss": 0.8111, + "num_input_tokens_seen": 26098184, + "step": 44960 + }, + { + "epoch": 6.697199880845994, + "grad_norm": 0.0203857421875, + "learning_rate": 0.025235620553550258, + "loss": 0.8412, + "num_input_tokens_seen": 26101096, + "step": 44965 + }, + { + "epoch": 6.697944593386953, + "grad_norm": 0.025390625, + "learning_rate": 0.025234195265824447, + "loss": 0.8191, + "num_input_tokens_seen": 26103976, + "step": 44970 + }, + { + "epoch": 6.698689305927912, + "grad_norm": 0.038330078125, + "learning_rate": 0.025232769805202313, + "loss": 0.8175, + "num_input_tokens_seen": 26107016, + "step": 44975 + }, + { + "epoch": 6.699434018468871, + "grad_norm": 0.0186767578125, + "learning_rate": 0.025231344171707932, + "loss": 0.7902, + "num_input_tokens_seen": 26109768, + "step": 44980 + }, + { + "epoch": 6.70017873100983, + "grad_norm": 0.0230712890625, + "learning_rate": 0.02522991836536539, + "loss": 0.8268, + "num_input_tokens_seen": 26112808, + "step": 44985 + }, + { + "epoch": 6.700923443550789, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02522849238619878, + "loss": 0.8062, + "num_input_tokens_seen": 26115688, + "step": 44990 + }, + { + "epoch": 6.701668156091749, + "grad_norm": 0.01953125, + "learning_rate": 0.02522706623423218, + "loss": 0.8009, + "num_input_tokens_seen": 26118696, + "step": 44995 + }, + { + "epoch": 6.702412868632708, + "grad_norm": 0.0247802734375, + "learning_rate": 0.025225639909489696, + "loss": 0.8009, + "num_input_tokens_seen": 26121768, + "step": 45000 + }, + { + "epoch": 6.703157581173667, + "grad_norm": 0.010498046875, + "learning_rate": 0.025224213411995416, + "loss": 0.7759, + "num_input_tokens_seen": 26124360, + "step": 45005 + }, + { + "epoch": 6.703902293714626, + "grad_norm": 0.0194091796875, + "learning_rate": 0.025222786741773442, + "loss": 0.8016, + "num_input_tokens_seen": 26127400, + "step": 45010 + }, + { + "epoch": 6.704647006255585, + "grad_norm": 0.02734375, + "learning_rate": 0.02522135989884787, + "loss": 0.8391, + "num_input_tokens_seen": 26130184, + "step": 45015 + }, + { + "epoch": 6.705391718796545, + "grad_norm": 0.023681640625, + "learning_rate": 0.025219932883242816, + "loss": 0.7985, + "num_input_tokens_seen": 26133224, + "step": 45020 + }, + { + "epoch": 6.706136431337503, + "grad_norm": 0.033203125, + "learning_rate": 0.025218505694982386, + "loss": 0.7809, + "num_input_tokens_seen": 26136264, + "step": 45025 + }, + { + "epoch": 6.706881143878463, + "grad_norm": 0.0277099609375, + "learning_rate": 0.025217078334090683, + "loss": 0.7902, + "num_input_tokens_seen": 26138888, + "step": 45030 + }, + { + "epoch": 6.707625856419422, + "grad_norm": 0.021484375, + "learning_rate": 0.02521565080059183, + "loss": 0.8263, + "num_input_tokens_seen": 26141640, + "step": 45035 + }, + { + "epoch": 6.708370568960381, + "grad_norm": 0.0186767578125, + "learning_rate": 0.02521422309450994, + "loss": 0.8052, + "num_input_tokens_seen": 26144904, + "step": 45040 + }, + { + "epoch": 6.70911528150134, + "grad_norm": 0.020751953125, + "learning_rate": 0.025212795215869128, + "loss": 0.7736, + "num_input_tokens_seen": 26147464, + "step": 45045 + }, + { + "epoch": 6.7098599940423, + "grad_norm": 0.0262451171875, + "learning_rate": 0.02521136716469352, + "loss": 0.8259, + "num_input_tokens_seen": 26150824, + "step": 45050 + }, + { + "epoch": 6.710604706583259, + "grad_norm": 0.0224609375, + "learning_rate": 0.025209938941007246, + "loss": 0.8141, + "num_input_tokens_seen": 26153608, + "step": 45055 + }, + { + "epoch": 6.711349419124218, + "grad_norm": 0.0225830078125, + "learning_rate": 0.02520851054483443, + "loss": 0.8038, + "num_input_tokens_seen": 26156552, + "step": 45060 + }, + { + "epoch": 6.712094131665177, + "grad_norm": 0.0205078125, + "learning_rate": 0.0252070819761992, + "loss": 0.8202, + "num_input_tokens_seen": 26159272, + "step": 45065 + }, + { + "epoch": 6.712838844206137, + "grad_norm": 0.0164794921875, + "learning_rate": 0.02520565323512569, + "loss": 0.812, + "num_input_tokens_seen": 26162024, + "step": 45070 + }, + { + "epoch": 6.713583556747095, + "grad_norm": 0.03662109375, + "learning_rate": 0.025204224321638054, + "loss": 0.8002, + "num_input_tokens_seen": 26164840, + "step": 45075 + }, + { + "epoch": 6.714328269288055, + "grad_norm": 0.031982421875, + "learning_rate": 0.025202795235760415, + "loss": 0.8203, + "num_input_tokens_seen": 26167688, + "step": 45080 + }, + { + "epoch": 6.715072981829014, + "grad_norm": 0.0235595703125, + "learning_rate": 0.025201365977516918, + "loss": 0.8152, + "num_input_tokens_seen": 26170760, + "step": 45085 + }, + { + "epoch": 6.7158176943699734, + "grad_norm": 0.02197265625, + "learning_rate": 0.025199936546931707, + "loss": 0.7979, + "num_input_tokens_seen": 26173896, + "step": 45090 + }, + { + "epoch": 6.716562406910932, + "grad_norm": 0.0283203125, + "learning_rate": 0.025198506944028944, + "loss": 0.7881, + "num_input_tokens_seen": 26176488, + "step": 45095 + }, + { + "epoch": 6.717307119451892, + "grad_norm": 0.016845703125, + "learning_rate": 0.025197077168832762, + "loss": 0.7919, + "num_input_tokens_seen": 26179432, + "step": 45100 + }, + { + "epoch": 6.718051831992851, + "grad_norm": 0.031005859375, + "learning_rate": 0.02519564722136733, + "loss": 0.8102, + "num_input_tokens_seen": 26182696, + "step": 45105 + }, + { + "epoch": 6.71879654453381, + "grad_norm": 0.0220947265625, + "learning_rate": 0.025194217101656802, + "loss": 0.7983, + "num_input_tokens_seen": 26185768, + "step": 45110 + }, + { + "epoch": 6.719541257074769, + "grad_norm": 0.016357421875, + "learning_rate": 0.025192786809725334, + "loss": 0.8069, + "num_input_tokens_seen": 26189064, + "step": 45115 + }, + { + "epoch": 6.720285969615729, + "grad_norm": 0.0283203125, + "learning_rate": 0.025191356345597094, + "loss": 0.7971, + "num_input_tokens_seen": 26191880, + "step": 45120 + }, + { + "epoch": 6.721030682156687, + "grad_norm": 0.02001953125, + "learning_rate": 0.02518992570929625, + "loss": 0.8139, + "num_input_tokens_seen": 26194728, + "step": 45125 + }, + { + "epoch": 6.721775394697647, + "grad_norm": 0.0184326171875, + "learning_rate": 0.02518849490084696, + "loss": 0.8095, + "num_input_tokens_seen": 26197384, + "step": 45130 + }, + { + "epoch": 6.722520107238606, + "grad_norm": 0.018310546875, + "learning_rate": 0.025187063920273405, + "loss": 0.7979, + "num_input_tokens_seen": 26200360, + "step": 45135 + }, + { + "epoch": 6.7232648197795655, + "grad_norm": 0.0128173828125, + "learning_rate": 0.02518563276759976, + "loss": 0.8116, + "num_input_tokens_seen": 26203400, + "step": 45140 + }, + { + "epoch": 6.724009532320524, + "grad_norm": 0.0286865234375, + "learning_rate": 0.025184201442850204, + "loss": 0.8025, + "num_input_tokens_seen": 26206152, + "step": 45145 + }, + { + "epoch": 6.724754244861483, + "grad_norm": 0.015380859375, + "learning_rate": 0.025182769946048914, + "loss": 0.8062, + "num_input_tokens_seen": 26209064, + "step": 45150 + }, + { + "epoch": 6.725498957402443, + "grad_norm": 0.0169677734375, + "learning_rate": 0.025181338277220078, + "loss": 0.7813, + "num_input_tokens_seen": 26211816, + "step": 45155 + }, + { + "epoch": 6.726243669943402, + "grad_norm": 0.01495361328125, + "learning_rate": 0.025179906436387876, + "loss": 0.8108, + "num_input_tokens_seen": 26214888, + "step": 45160 + }, + { + "epoch": 6.726988382484361, + "grad_norm": 0.0205078125, + "learning_rate": 0.025178474423576503, + "loss": 0.8062, + "num_input_tokens_seen": 26217672, + "step": 45165 + }, + { + "epoch": 6.72773309502532, + "grad_norm": 0.01953125, + "learning_rate": 0.025177042238810145, + "loss": 0.7844, + "num_input_tokens_seen": 26220840, + "step": 45170 + }, + { + "epoch": 6.7284778075662794, + "grad_norm": 0.01348876953125, + "learning_rate": 0.02517560988211301, + "loss": 0.8159, + "num_input_tokens_seen": 26223944, + "step": 45175 + }, + { + "epoch": 6.729222520107239, + "grad_norm": 0.020263671875, + "learning_rate": 0.025174177353509283, + "loss": 0.7959, + "num_input_tokens_seen": 26226760, + "step": 45180 + }, + { + "epoch": 6.729967232648198, + "grad_norm": 0.012451171875, + "learning_rate": 0.025172744653023167, + "loss": 0.7966, + "num_input_tokens_seen": 26229736, + "step": 45185 + }, + { + "epoch": 6.730711945189157, + "grad_norm": 0.01611328125, + "learning_rate": 0.025171311780678874, + "loss": 0.7875, + "num_input_tokens_seen": 26232552, + "step": 45190 + }, + { + "epoch": 6.731456657730116, + "grad_norm": 0.036376953125, + "learning_rate": 0.025169878736500603, + "loss": 0.8264, + "num_input_tokens_seen": 26235176, + "step": 45195 + }, + { + "epoch": 6.732201370271075, + "grad_norm": 0.017333984375, + "learning_rate": 0.025168445520512575, + "loss": 0.8117, + "num_input_tokens_seen": 26238216, + "step": 45200 + }, + { + "epoch": 6.732946082812035, + "grad_norm": 0.02880859375, + "learning_rate": 0.025167012132738985, + "loss": 0.79, + "num_input_tokens_seen": 26240936, + "step": 45205 + }, + { + "epoch": 6.733690795352993, + "grad_norm": 0.0201416015625, + "learning_rate": 0.025165578573204065, + "loss": 0.7875, + "num_input_tokens_seen": 26243752, + "step": 45210 + }, + { + "epoch": 6.734435507893953, + "grad_norm": 0.0174560546875, + "learning_rate": 0.02516414484193202, + "loss": 0.798, + "num_input_tokens_seen": 26246696, + "step": 45215 + }, + { + "epoch": 6.735180220434912, + "grad_norm": 0.018798828125, + "learning_rate": 0.02516271093894708, + "loss": 0.8045, + "num_input_tokens_seen": 26249832, + "step": 45220 + }, + { + "epoch": 6.7359249329758715, + "grad_norm": 0.0224609375, + "learning_rate": 0.025161276864273472, + "loss": 0.8164, + "num_input_tokens_seen": 26253032, + "step": 45225 + }, + { + "epoch": 6.73666964551683, + "grad_norm": 0.031005859375, + "learning_rate": 0.02515984261793542, + "loss": 0.8133, + "num_input_tokens_seen": 26255944, + "step": 45230 + }, + { + "epoch": 6.73741435805779, + "grad_norm": 0.0128173828125, + "learning_rate": 0.025158408199957145, + "loss": 0.8248, + "num_input_tokens_seen": 26258440, + "step": 45235 + }, + { + "epoch": 6.738159070598749, + "grad_norm": 0.02001953125, + "learning_rate": 0.025156973610362895, + "loss": 0.8101, + "num_input_tokens_seen": 26261224, + "step": 45240 + }, + { + "epoch": 6.738903783139708, + "grad_norm": 0.0162353515625, + "learning_rate": 0.025155538849176897, + "loss": 0.8028, + "num_input_tokens_seen": 26264584, + "step": 45245 + }, + { + "epoch": 6.739648495680667, + "grad_norm": 0.0186767578125, + "learning_rate": 0.02515410391642339, + "loss": 0.7858, + "num_input_tokens_seen": 26267208, + "step": 45250 + }, + { + "epoch": 6.740393208221627, + "grad_norm": 0.0281982421875, + "learning_rate": 0.02515266881212662, + "loss": 0.7957, + "num_input_tokens_seen": 26270088, + "step": 45255 + }, + { + "epoch": 6.7411379207625854, + "grad_norm": 0.02099609375, + "learning_rate": 0.025151233536310828, + "loss": 0.8054, + "num_input_tokens_seen": 26273000, + "step": 45260 + }, + { + "epoch": 6.741882633303545, + "grad_norm": 0.0308837890625, + "learning_rate": 0.02514979808900026, + "loss": 0.7907, + "num_input_tokens_seen": 26275656, + "step": 45265 + }, + { + "epoch": 6.742627345844504, + "grad_norm": 0.0242919921875, + "learning_rate": 0.025148362470219173, + "loss": 0.792, + "num_input_tokens_seen": 26278536, + "step": 45270 + }, + { + "epoch": 6.7433720583854635, + "grad_norm": 0.027587890625, + "learning_rate": 0.025146926679991816, + "loss": 0.7999, + "num_input_tokens_seen": 26281384, + "step": 45275 + }, + { + "epoch": 6.744116770926422, + "grad_norm": 0.01434326171875, + "learning_rate": 0.02514549071834244, + "loss": 0.8197, + "num_input_tokens_seen": 26284392, + "step": 45280 + }, + { + "epoch": 6.744861483467382, + "grad_norm": 0.02294921875, + "learning_rate": 0.025144054585295318, + "loss": 0.797, + "num_input_tokens_seen": 26287080, + "step": 45285 + }, + { + "epoch": 6.745606196008341, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0251426182808747, + "loss": 0.8242, + "num_input_tokens_seen": 26289928, + "step": 45290 + }, + { + "epoch": 6.7463509085493, + "grad_norm": 0.0299072265625, + "learning_rate": 0.025141181805104853, + "loss": 0.7872, + "num_input_tokens_seen": 26292776, + "step": 45295 + }, + { + "epoch": 6.747095621090259, + "grad_norm": 0.0224609375, + "learning_rate": 0.02513974515801005, + "loss": 0.8046, + "num_input_tokens_seen": 26295688, + "step": 45300 + }, + { + "epoch": 6.747840333631219, + "grad_norm": 0.01251220703125, + "learning_rate": 0.025138308339614557, + "loss": 0.7981, + "num_input_tokens_seen": 26298600, + "step": 45305 + }, + { + "epoch": 6.7485850461721775, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02513687134994265, + "loss": 0.7925, + "num_input_tokens_seen": 26301448, + "step": 45310 + }, + { + "epoch": 6.749329758713137, + "grad_norm": 0.0289306640625, + "learning_rate": 0.025135434189018598, + "loss": 0.8136, + "num_input_tokens_seen": 26304520, + "step": 45315 + }, + { + "epoch": 6.750074471254096, + "grad_norm": 0.026123046875, + "learning_rate": 0.02513399685686669, + "loss": 0.8177, + "num_input_tokens_seen": 26307944, + "step": 45320 + }, + { + "epoch": 6.7508191837950555, + "grad_norm": 0.023193359375, + "learning_rate": 0.025132559353511205, + "loss": 0.8147, + "num_input_tokens_seen": 26310856, + "step": 45325 + }, + { + "epoch": 6.751563896336014, + "grad_norm": 0.0186767578125, + "learning_rate": 0.02513112167897643, + "loss": 0.7941, + "num_input_tokens_seen": 26313832, + "step": 45330 + }, + { + "epoch": 6.752308608876973, + "grad_norm": 0.0194091796875, + "learning_rate": 0.025129683833286648, + "loss": 0.785, + "num_input_tokens_seen": 26316744, + "step": 45335 + }, + { + "epoch": 6.753053321417933, + "grad_norm": 0.01904296875, + "learning_rate": 0.02512824581646616, + "loss": 0.7855, + "num_input_tokens_seen": 26319784, + "step": 45340 + }, + { + "epoch": 6.753798033958892, + "grad_norm": 0.0218505859375, + "learning_rate": 0.025126807628539242, + "loss": 0.7966, + "num_input_tokens_seen": 26322728, + "step": 45345 + }, + { + "epoch": 6.754542746499851, + "grad_norm": 0.0235595703125, + "learning_rate": 0.02512536926953021, + "loss": 0.7936, + "num_input_tokens_seen": 26325736, + "step": 45350 + }, + { + "epoch": 6.75528745904081, + "grad_norm": 0.021240234375, + "learning_rate": 0.02512393073946335, + "loss": 0.7919, + "num_input_tokens_seen": 26328744, + "step": 45355 + }, + { + "epoch": 6.7560321715817695, + "grad_norm": 0.02685546875, + "learning_rate": 0.025122492038362972, + "loss": 0.8122, + "num_input_tokens_seen": 26331784, + "step": 45360 + }, + { + "epoch": 6.756776884122728, + "grad_norm": 0.019287109375, + "learning_rate": 0.02512105316625338, + "loss": 0.8087, + "num_input_tokens_seen": 26334696, + "step": 45365 + }, + { + "epoch": 6.757521596663688, + "grad_norm": 0.025634765625, + "learning_rate": 0.02511961412315888, + "loss": 0.801, + "num_input_tokens_seen": 26337512, + "step": 45370 + }, + { + "epoch": 6.758266309204647, + "grad_norm": 0.01953125, + "learning_rate": 0.025118174909103785, + "loss": 0.7927, + "num_input_tokens_seen": 26340456, + "step": 45375 + }, + { + "epoch": 6.759011021745606, + "grad_norm": 0.019287109375, + "learning_rate": 0.025116735524112407, + "loss": 0.7973, + "num_input_tokens_seen": 26343336, + "step": 45380 + }, + { + "epoch": 6.759755734286565, + "grad_norm": 0.0218505859375, + "learning_rate": 0.025115295968209064, + "loss": 0.7997, + "num_input_tokens_seen": 26346120, + "step": 45385 + }, + { + "epoch": 6.760500446827525, + "grad_norm": 0.0269775390625, + "learning_rate": 0.02511385624141808, + "loss": 0.7959, + "num_input_tokens_seen": 26348904, + "step": 45390 + }, + { + "epoch": 6.7612451593684835, + "grad_norm": 0.020751953125, + "learning_rate": 0.025112416343763773, + "loss": 0.8221, + "num_input_tokens_seen": 26351816, + "step": 45395 + }, + { + "epoch": 6.761989871909443, + "grad_norm": 0.019287109375, + "learning_rate": 0.02511097627527047, + "loss": 0.789, + "num_input_tokens_seen": 26354536, + "step": 45400 + }, + { + "epoch": 6.762734584450402, + "grad_norm": 0.024169921875, + "learning_rate": 0.025109536035962495, + "loss": 0.8168, + "num_input_tokens_seen": 26357288, + "step": 45405 + }, + { + "epoch": 6.7634792969913615, + "grad_norm": 0.023193359375, + "learning_rate": 0.025108095625864187, + "loss": 0.7877, + "num_input_tokens_seen": 26360168, + "step": 45410 + }, + { + "epoch": 6.76422400953232, + "grad_norm": 0.0177001953125, + "learning_rate": 0.025106655044999877, + "loss": 0.7624, + "num_input_tokens_seen": 26362728, + "step": 45415 + }, + { + "epoch": 6.76496872207328, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0251052142933939, + "loss": 0.7801, + "num_input_tokens_seen": 26365768, + "step": 45420 + }, + { + "epoch": 6.765713434614239, + "grad_norm": 0.019287109375, + "learning_rate": 0.025103773371070604, + "loss": 0.771, + "num_input_tokens_seen": 26368680, + "step": 45425 + }, + { + "epoch": 6.766458147155198, + "grad_norm": 0.0216064453125, + "learning_rate": 0.025102332278054317, + "loss": 0.79, + "num_input_tokens_seen": 26371880, + "step": 45430 + }, + { + "epoch": 6.767202859696157, + "grad_norm": 0.02001953125, + "learning_rate": 0.025100891014369398, + "loss": 0.7928, + "num_input_tokens_seen": 26374856, + "step": 45435 + }, + { + "epoch": 6.767947572237117, + "grad_norm": 0.0255126953125, + "learning_rate": 0.025099449580040196, + "loss": 0.803, + "num_input_tokens_seen": 26377768, + "step": 45440 + }, + { + "epoch": 6.7686922847780755, + "grad_norm": 0.0291748046875, + "learning_rate": 0.025098007975091055, + "loss": 0.8051, + "num_input_tokens_seen": 26380712, + "step": 45445 + }, + { + "epoch": 6.769436997319035, + "grad_norm": 0.01220703125, + "learning_rate": 0.025096566199546328, + "loss": 0.7927, + "num_input_tokens_seen": 26383624, + "step": 45450 + }, + { + "epoch": 6.770181709859994, + "grad_norm": 0.032470703125, + "learning_rate": 0.025095124253430377, + "loss": 0.7755, + "num_input_tokens_seen": 26386568, + "step": 45455 + }, + { + "epoch": 6.7709264224009535, + "grad_norm": 0.018798828125, + "learning_rate": 0.025093682136767565, + "loss": 0.8103, + "num_input_tokens_seen": 26389160, + "step": 45460 + }, + { + "epoch": 6.771671134941912, + "grad_norm": 0.011962890625, + "learning_rate": 0.02509223984958225, + "loss": 0.8151, + "num_input_tokens_seen": 26391880, + "step": 45465 + }, + { + "epoch": 6.772415847482872, + "grad_norm": 0.0172119140625, + "learning_rate": 0.025090797391898803, + "loss": 0.8006, + "num_input_tokens_seen": 26394664, + "step": 45470 + }, + { + "epoch": 6.773160560023831, + "grad_norm": 0.02978515625, + "learning_rate": 0.025089354763741592, + "loss": 0.8056, + "num_input_tokens_seen": 26397512, + "step": 45475 + }, + { + "epoch": 6.77390527256479, + "grad_norm": 0.0289306640625, + "learning_rate": 0.025087911965134983, + "loss": 0.7797, + "num_input_tokens_seen": 26400328, + "step": 45480 + }, + { + "epoch": 6.774649985105749, + "grad_norm": 0.015869140625, + "learning_rate": 0.025086468996103353, + "loss": 0.8052, + "num_input_tokens_seen": 26402952, + "step": 45485 + }, + { + "epoch": 6.775394697646709, + "grad_norm": 0.0150146484375, + "learning_rate": 0.025085025856671078, + "loss": 0.83, + "num_input_tokens_seen": 26405768, + "step": 45490 + }, + { + "epoch": 6.7761394101876675, + "grad_norm": 0.0277099609375, + "learning_rate": 0.02508358254686255, + "loss": 0.8199, + "num_input_tokens_seen": 26408552, + "step": 45495 + }, + { + "epoch": 6.776884122728626, + "grad_norm": 0.017578125, + "learning_rate": 0.025082139066702137, + "loss": 0.8016, + "num_input_tokens_seen": 26411176, + "step": 45500 + }, + { + "epoch": 6.777628835269586, + "grad_norm": 0.0181884765625, + "learning_rate": 0.025080695416214233, + "loss": 0.7774, + "num_input_tokens_seen": 26413960, + "step": 45505 + }, + { + "epoch": 6.778373547810546, + "grad_norm": 0.0194091796875, + "learning_rate": 0.025079251595423225, + "loss": 0.8155, + "num_input_tokens_seen": 26416936, + "step": 45510 + }, + { + "epoch": 6.779118260351504, + "grad_norm": 0.01483154296875, + "learning_rate": 0.025077807604353505, + "loss": 0.802, + "num_input_tokens_seen": 26419944, + "step": 45515 + }, + { + "epoch": 6.779862972892463, + "grad_norm": 0.020263671875, + "learning_rate": 0.02507636344302947, + "loss": 0.7969, + "num_input_tokens_seen": 26422824, + "step": 45520 + }, + { + "epoch": 6.780607685433423, + "grad_norm": 0.019775390625, + "learning_rate": 0.025074919111475517, + "loss": 0.7975, + "num_input_tokens_seen": 26425736, + "step": 45525 + }, + { + "epoch": 6.781352397974382, + "grad_norm": 0.01904296875, + "learning_rate": 0.025073474609716043, + "loss": 0.8111, + "num_input_tokens_seen": 26428776, + "step": 45530 + }, + { + "epoch": 6.782097110515341, + "grad_norm": 0.020263671875, + "learning_rate": 0.02507202993777545, + "loss": 0.8087, + "num_input_tokens_seen": 26431496, + "step": 45535 + }, + { + "epoch": 6.7828418230563, + "grad_norm": 0.0189208984375, + "learning_rate": 0.025070585095678152, + "loss": 0.7965, + "num_input_tokens_seen": 26434600, + "step": 45540 + }, + { + "epoch": 6.7835865355972595, + "grad_norm": 0.031982421875, + "learning_rate": 0.025069140083448557, + "loss": 0.8453, + "num_input_tokens_seen": 26437448, + "step": 45545 + }, + { + "epoch": 6.784331248138218, + "grad_norm": 0.0272216796875, + "learning_rate": 0.02506769490111107, + "loss": 0.7942, + "num_input_tokens_seen": 26440328, + "step": 45550 + }, + { + "epoch": 6.785075960679178, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02506624954869011, + "loss": 0.7807, + "num_input_tokens_seen": 26443240, + "step": 45555 + }, + { + "epoch": 6.785820673220137, + "grad_norm": 0.01318359375, + "learning_rate": 0.025064804026210096, + "loss": 0.7852, + "num_input_tokens_seen": 26445928, + "step": 45560 + }, + { + "epoch": 6.786565385761096, + "grad_norm": 0.014892578125, + "learning_rate": 0.025063358333695447, + "loss": 0.8108, + "num_input_tokens_seen": 26448872, + "step": 45565 + }, + { + "epoch": 6.787310098302055, + "grad_norm": 0.016845703125, + "learning_rate": 0.025061912471170584, + "loss": 0.8035, + "num_input_tokens_seen": 26451912, + "step": 45570 + }, + { + "epoch": 6.788054810843015, + "grad_norm": 0.019287109375, + "learning_rate": 0.02506046643865994, + "loss": 0.7899, + "num_input_tokens_seen": 26454664, + "step": 45575 + }, + { + "epoch": 6.7887995233839735, + "grad_norm": 0.020263671875, + "learning_rate": 0.025059020236187942, + "loss": 0.812, + "num_input_tokens_seen": 26457448, + "step": 45580 + }, + { + "epoch": 6.789544235924933, + "grad_norm": 0.0341796875, + "learning_rate": 0.02505757386377902, + "loss": 0.8178, + "num_input_tokens_seen": 26460456, + "step": 45585 + }, + { + "epoch": 6.790288948465892, + "grad_norm": 0.0205078125, + "learning_rate": 0.025056127321457608, + "loss": 0.8128, + "num_input_tokens_seen": 26463528, + "step": 45590 + }, + { + "epoch": 6.791033661006852, + "grad_norm": 0.013427734375, + "learning_rate": 0.025054680609248144, + "loss": 0.8133, + "num_input_tokens_seen": 26466440, + "step": 45595 + }, + { + "epoch": 6.79177837354781, + "grad_norm": 0.02294921875, + "learning_rate": 0.02505323372717507, + "loss": 0.8089, + "num_input_tokens_seen": 26469160, + "step": 45600 + }, + { + "epoch": 6.79252308608877, + "grad_norm": 0.038330078125, + "learning_rate": 0.025051786675262835, + "loss": 0.7978, + "num_input_tokens_seen": 26471688, + "step": 45605 + }, + { + "epoch": 6.793267798629729, + "grad_norm": 0.0137939453125, + "learning_rate": 0.025050339453535877, + "loss": 0.8013, + "num_input_tokens_seen": 26474504, + "step": 45610 + }, + { + "epoch": 6.794012511170688, + "grad_norm": 0.0262451171875, + "learning_rate": 0.025048892062018647, + "loss": 0.7968, + "num_input_tokens_seen": 26477480, + "step": 45615 + }, + { + "epoch": 6.794757223711647, + "grad_norm": 0.02587890625, + "learning_rate": 0.0250474445007356, + "loss": 0.7913, + "num_input_tokens_seen": 26480200, + "step": 45620 + }, + { + "epoch": 6.795501936252607, + "grad_norm": 0.0242919921875, + "learning_rate": 0.02504599676971119, + "loss": 0.8045, + "num_input_tokens_seen": 26483048, + "step": 45625 + }, + { + "epoch": 6.7962466487935655, + "grad_norm": 0.0283203125, + "learning_rate": 0.025044548868969875, + "loss": 0.7948, + "num_input_tokens_seen": 26485768, + "step": 45630 + }, + { + "epoch": 6.796991361334525, + "grad_norm": 0.018310546875, + "learning_rate": 0.025043100798536118, + "loss": 0.8004, + "num_input_tokens_seen": 26488872, + "step": 45635 + }, + { + "epoch": 6.797736073875484, + "grad_norm": 0.02734375, + "learning_rate": 0.025041652558434376, + "loss": 0.7887, + "num_input_tokens_seen": 26491784, + "step": 45640 + }, + { + "epoch": 6.798480786416444, + "grad_norm": 0.01239013671875, + "learning_rate": 0.025040204148689127, + "loss": 0.8035, + "num_input_tokens_seen": 26494696, + "step": 45645 + }, + { + "epoch": 6.799225498957402, + "grad_norm": 0.01336669921875, + "learning_rate": 0.025038755569324826, + "loss": 0.7844, + "num_input_tokens_seen": 26497384, + "step": 45650 + }, + { + "epoch": 6.799970211498362, + "grad_norm": 0.0181884765625, + "learning_rate": 0.025037306820365954, + "loss": 0.7853, + "num_input_tokens_seen": 26500392, + "step": 45655 + }, + { + "epoch": 6.800714924039321, + "grad_norm": 0.02734375, + "learning_rate": 0.025035857901836985, + "loss": 0.8227, + "num_input_tokens_seen": 26503176, + "step": 45660 + }, + { + "epoch": 6.8014596365802795, + "grad_norm": 0.01397705078125, + "learning_rate": 0.0250344088137624, + "loss": 0.815, + "num_input_tokens_seen": 26505992, + "step": 45665 + }, + { + "epoch": 6.802204349121239, + "grad_norm": 0.0150146484375, + "learning_rate": 0.025032959556166676, + "loss": 0.8145, + "num_input_tokens_seen": 26509064, + "step": 45670 + }, + { + "epoch": 6.802949061662199, + "grad_norm": 0.01953125, + "learning_rate": 0.025031510129074293, + "loss": 0.7981, + "num_input_tokens_seen": 26512168, + "step": 45675 + }, + { + "epoch": 6.803693774203158, + "grad_norm": 0.0135498046875, + "learning_rate": 0.025030060532509744, + "loss": 0.822, + "num_input_tokens_seen": 26514984, + "step": 45680 + }, + { + "epoch": 6.804438486744116, + "grad_norm": 0.020263671875, + "learning_rate": 0.025028610766497518, + "loss": 0.8053, + "num_input_tokens_seen": 26518088, + "step": 45685 + }, + { + "epoch": 6.805183199285076, + "grad_norm": 0.0224609375, + "learning_rate": 0.025027160831062104, + "loss": 0.8013, + "num_input_tokens_seen": 26520904, + "step": 45690 + }, + { + "epoch": 6.805927911826036, + "grad_norm": 0.0185546875, + "learning_rate": 0.025025710726227997, + "loss": 0.8116, + "num_input_tokens_seen": 26523688, + "step": 45695 + }, + { + "epoch": 6.806672624366994, + "grad_norm": 0.0224609375, + "learning_rate": 0.025024260452019702, + "loss": 0.8031, + "num_input_tokens_seen": 26526568, + "step": 45700 + }, + { + "epoch": 6.807417336907953, + "grad_norm": 0.01470947265625, + "learning_rate": 0.025022810008461713, + "loss": 0.8114, + "num_input_tokens_seen": 26529544, + "step": 45705 + }, + { + "epoch": 6.808162049448913, + "grad_norm": 0.01318359375, + "learning_rate": 0.02502135939557853, + "loss": 0.8193, + "num_input_tokens_seen": 26532488, + "step": 45710 + }, + { + "epoch": 6.8089067619898715, + "grad_norm": 0.02734375, + "learning_rate": 0.025019908613394672, + "loss": 0.7991, + "num_input_tokens_seen": 26535464, + "step": 45715 + }, + { + "epoch": 6.809651474530831, + "grad_norm": 0.0189208984375, + "learning_rate": 0.025018457661934643, + "loss": 0.7829, + "num_input_tokens_seen": 26538536, + "step": 45720 + }, + { + "epoch": 6.81039618707179, + "grad_norm": 0.0157470703125, + "learning_rate": 0.025017006541222953, + "loss": 0.8083, + "num_input_tokens_seen": 26541128, + "step": 45725 + }, + { + "epoch": 6.81114089961275, + "grad_norm": 0.0262451171875, + "learning_rate": 0.025015555251284113, + "loss": 0.8146, + "num_input_tokens_seen": 26543976, + "step": 45730 + }, + { + "epoch": 6.811885612153708, + "grad_norm": 0.0211181640625, + "learning_rate": 0.02501410379214265, + "loss": 0.8243, + "num_input_tokens_seen": 26546856, + "step": 45735 + }, + { + "epoch": 6.812630324694668, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02501265216382308, + "loss": 0.7953, + "num_input_tokens_seen": 26550024, + "step": 45740 + }, + { + "epoch": 6.813375037235627, + "grad_norm": 0.01953125, + "learning_rate": 0.02501120036634993, + "loss": 0.7914, + "num_input_tokens_seen": 26553096, + "step": 45745 + }, + { + "epoch": 6.814119749776586, + "grad_norm": 0.01458740234375, + "learning_rate": 0.025009748399747723, + "loss": 0.8168, + "num_input_tokens_seen": 26555912, + "step": 45750 + }, + { + "epoch": 6.814864462317545, + "grad_norm": 0.031494140625, + "learning_rate": 0.025008296264040993, + "loss": 0.7933, + "num_input_tokens_seen": 26558792, + "step": 45755 + }, + { + "epoch": 6.815609174858505, + "grad_norm": 0.0224609375, + "learning_rate": 0.02500684395925427, + "loss": 0.795, + "num_input_tokens_seen": 26561704, + "step": 45760 + }, + { + "epoch": 6.816353887399464, + "grad_norm": 0.021484375, + "learning_rate": 0.025005391485412084, + "loss": 0.7932, + "num_input_tokens_seen": 26564456, + "step": 45765 + }, + { + "epoch": 6.817098599940423, + "grad_norm": 0.018310546875, + "learning_rate": 0.02500393884253898, + "loss": 0.785, + "num_input_tokens_seen": 26567272, + "step": 45770 + }, + { + "epoch": 6.817843312481382, + "grad_norm": 0.028564453125, + "learning_rate": 0.0250024860306595, + "loss": 0.7613, + "num_input_tokens_seen": 26570120, + "step": 45775 + }, + { + "epoch": 6.818588025022342, + "grad_norm": 0.0228271484375, + "learning_rate": 0.025001033049798182, + "loss": 0.8091, + "num_input_tokens_seen": 26573288, + "step": 45780 + }, + { + "epoch": 6.8193327375633, + "grad_norm": 0.0185546875, + "learning_rate": 0.024999579899979574, + "loss": 0.8033, + "num_input_tokens_seen": 26576488, + "step": 45785 + }, + { + "epoch": 6.82007745010426, + "grad_norm": 0.01202392578125, + "learning_rate": 0.024998126581228228, + "loss": 0.8062, + "num_input_tokens_seen": 26579208, + "step": 45790 + }, + { + "epoch": 6.820822162645219, + "grad_norm": 0.02197265625, + "learning_rate": 0.02499667309356869, + "loss": 0.8194, + "num_input_tokens_seen": 26582248, + "step": 45795 + }, + { + "epoch": 6.821566875186178, + "grad_norm": 0.024658203125, + "learning_rate": 0.02499521943702553, + "loss": 0.8056, + "num_input_tokens_seen": 26585096, + "step": 45800 + }, + { + "epoch": 6.822311587727137, + "grad_norm": 0.02294921875, + "learning_rate": 0.02499376561162329, + "loss": 0.8117, + "num_input_tokens_seen": 26588168, + "step": 45805 + }, + { + "epoch": 6.823056300268097, + "grad_norm": 0.0234375, + "learning_rate": 0.024992311617386537, + "loss": 0.8111, + "num_input_tokens_seen": 26590984, + "step": 45810 + }, + { + "epoch": 6.823801012809056, + "grad_norm": 0.0234375, + "learning_rate": 0.02499085745433984, + "loss": 0.8141, + "num_input_tokens_seen": 26593896, + "step": 45815 + }, + { + "epoch": 6.824545725350015, + "grad_norm": 0.025634765625, + "learning_rate": 0.024989403122507754, + "loss": 0.779, + "num_input_tokens_seen": 26596904, + "step": 45820 + }, + { + "epoch": 6.825290437890974, + "grad_norm": 0.0147705078125, + "learning_rate": 0.02498794862191486, + "loss": 0.7878, + "num_input_tokens_seen": 26599592, + "step": 45825 + }, + { + "epoch": 6.826035150431934, + "grad_norm": 0.0228271484375, + "learning_rate": 0.02498649395258572, + "loss": 0.8013, + "num_input_tokens_seen": 26602536, + "step": 45830 + }, + { + "epoch": 6.826779862972892, + "grad_norm": 0.03173828125, + "learning_rate": 0.024985039114544923, + "loss": 0.807, + "num_input_tokens_seen": 26605352, + "step": 45835 + }, + { + "epoch": 6.827524575513852, + "grad_norm": 0.0242919921875, + "learning_rate": 0.024983584107817033, + "loss": 0.8331, + "num_input_tokens_seen": 26607912, + "step": 45840 + }, + { + "epoch": 6.828269288054811, + "grad_norm": 0.025390625, + "learning_rate": 0.02498212893242664, + "loss": 0.7891, + "num_input_tokens_seen": 26610568, + "step": 45845 + }, + { + "epoch": 6.82901400059577, + "grad_norm": 0.0235595703125, + "learning_rate": 0.024980673588398318, + "loss": 0.8285, + "num_input_tokens_seen": 26613672, + "step": 45850 + }, + { + "epoch": 6.829758713136729, + "grad_norm": 0.01153564453125, + "learning_rate": 0.024979218075756662, + "loss": 0.8057, + "num_input_tokens_seen": 26616680, + "step": 45855 + }, + { + "epoch": 6.830503425677689, + "grad_norm": 0.0216064453125, + "learning_rate": 0.024977762394526256, + "loss": 0.8095, + "num_input_tokens_seen": 26619816, + "step": 45860 + }, + { + "epoch": 6.831248138218648, + "grad_norm": 0.02783203125, + "learning_rate": 0.0249763065447317, + "loss": 0.7951, + "num_input_tokens_seen": 26622760, + "step": 45865 + }, + { + "epoch": 6.831992850759606, + "grad_norm": 0.018798828125, + "learning_rate": 0.024974850526397585, + "loss": 0.8263, + "num_input_tokens_seen": 26625512, + "step": 45870 + }, + { + "epoch": 6.832737563300566, + "grad_norm": 0.02197265625, + "learning_rate": 0.024973394339548503, + "loss": 0.7981, + "num_input_tokens_seen": 26630056, + "step": 45875 + }, + { + "epoch": 6.833482275841525, + "grad_norm": 0.031005859375, + "learning_rate": 0.024971937984209065, + "loss": 0.7883, + "num_input_tokens_seen": 26632808, + "step": 45880 + }, + { + "epoch": 6.834226988382484, + "grad_norm": 0.0301513671875, + "learning_rate": 0.02497048146040387, + "loss": 0.7847, + "num_input_tokens_seen": 26635560, + "step": 45885 + }, + { + "epoch": 6.834971700923443, + "grad_norm": 0.0203857421875, + "learning_rate": 0.024969024768157527, + "loss": 0.8107, + "num_input_tokens_seen": 26638472, + "step": 45890 + }, + { + "epoch": 6.835716413464403, + "grad_norm": 0.0213623046875, + "learning_rate": 0.024967567907494635, + "loss": 0.7906, + "num_input_tokens_seen": 26641448, + "step": 45895 + }, + { + "epoch": 6.836461126005362, + "grad_norm": 0.01953125, + "learning_rate": 0.024966110878439817, + "loss": 0.7956, + "num_input_tokens_seen": 26644264, + "step": 45900 + }, + { + "epoch": 6.837205838546321, + "grad_norm": 0.01171875, + "learning_rate": 0.024964653681017683, + "loss": 0.8088, + "num_input_tokens_seen": 26647016, + "step": 45905 + }, + { + "epoch": 6.83795055108728, + "grad_norm": 0.0291748046875, + "learning_rate": 0.024963196315252855, + "loss": 0.7927, + "num_input_tokens_seen": 26649832, + "step": 45910 + }, + { + "epoch": 6.83869526362824, + "grad_norm": 0.024169921875, + "learning_rate": 0.024961738781169952, + "loss": 0.8324, + "num_input_tokens_seen": 26652776, + "step": 45915 + }, + { + "epoch": 6.839439976169198, + "grad_norm": 0.013427734375, + "learning_rate": 0.024960281078793594, + "loss": 0.8042, + "num_input_tokens_seen": 26655592, + "step": 45920 + }, + { + "epoch": 6.840184688710158, + "grad_norm": 0.0147705078125, + "learning_rate": 0.024958823208148417, + "loss": 0.8241, + "num_input_tokens_seen": 26658824, + "step": 45925 + }, + { + "epoch": 6.840929401251117, + "grad_norm": 0.0185546875, + "learning_rate": 0.02495736516925904, + "loss": 0.8133, + "num_input_tokens_seen": 26661480, + "step": 45930 + }, + { + "epoch": 6.8416741137920765, + "grad_norm": 0.021728515625, + "learning_rate": 0.024955906962150096, + "loss": 0.8213, + "num_input_tokens_seen": 26664328, + "step": 45935 + }, + { + "epoch": 6.842418826333035, + "grad_norm": 0.020263671875, + "learning_rate": 0.024954448586846225, + "loss": 0.823, + "num_input_tokens_seen": 26667592, + "step": 45940 + }, + { + "epoch": 6.843163538873995, + "grad_norm": 0.0257568359375, + "learning_rate": 0.02495299004337206, + "loss": 0.7865, + "num_input_tokens_seen": 26670504, + "step": 45945 + }, + { + "epoch": 6.843908251414954, + "grad_norm": 0.0228271484375, + "learning_rate": 0.024951531331752246, + "loss": 0.7986, + "num_input_tokens_seen": 26673448, + "step": 45950 + }, + { + "epoch": 6.844652963955913, + "grad_norm": 0.018798828125, + "learning_rate": 0.02495007245201142, + "loss": 0.7962, + "num_input_tokens_seen": 26676136, + "step": 45955 + }, + { + "epoch": 6.845397676496872, + "grad_norm": 0.0191650390625, + "learning_rate": 0.024948613404174237, + "loss": 0.8022, + "num_input_tokens_seen": 26679080, + "step": 45960 + }, + { + "epoch": 6.846142389037832, + "grad_norm": 0.0281982421875, + "learning_rate": 0.024947154188265343, + "loss": 0.7916, + "num_input_tokens_seen": 26681864, + "step": 45965 + }, + { + "epoch": 6.84688710157879, + "grad_norm": 0.0267333984375, + "learning_rate": 0.02494569480430939, + "loss": 0.8015, + "num_input_tokens_seen": 26684744, + "step": 45970 + }, + { + "epoch": 6.84763181411975, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02494423525233103, + "loss": 0.7908, + "num_input_tokens_seen": 26687688, + "step": 45975 + }, + { + "epoch": 6.848376526660709, + "grad_norm": 0.026123046875, + "learning_rate": 0.024942775532354917, + "loss": 0.7934, + "num_input_tokens_seen": 26690504, + "step": 45980 + }, + { + "epoch": 6.8491212392016685, + "grad_norm": 0.01513671875, + "learning_rate": 0.024941315644405722, + "loss": 0.7987, + "num_input_tokens_seen": 26693224, + "step": 45985 + }, + { + "epoch": 6.849865951742627, + "grad_norm": 0.012939453125, + "learning_rate": 0.024939855588508104, + "loss": 0.7987, + "num_input_tokens_seen": 26695976, + "step": 45990 + }, + { + "epoch": 6.850610664283587, + "grad_norm": 0.0198974609375, + "learning_rate": 0.024938395364686727, + "loss": 0.7943, + "num_input_tokens_seen": 26698760, + "step": 45995 + }, + { + "epoch": 6.851355376824546, + "grad_norm": 0.01300048828125, + "learning_rate": 0.02493693497296626, + "loss": 0.7895, + "num_input_tokens_seen": 26701448, + "step": 46000 + }, + { + "epoch": 6.852100089365505, + "grad_norm": 0.01904296875, + "learning_rate": 0.024935474413371378, + "loss": 0.7928, + "num_input_tokens_seen": 26704520, + "step": 46005 + }, + { + "epoch": 6.852844801906464, + "grad_norm": 0.0208740234375, + "learning_rate": 0.02493401368592675, + "loss": 0.7995, + "num_input_tokens_seen": 26707176, + "step": 46010 + }, + { + "epoch": 6.853589514447423, + "grad_norm": 0.0130615234375, + "learning_rate": 0.02493255279065706, + "loss": 0.7719, + "num_input_tokens_seen": 26710088, + "step": 46015 + }, + { + "epoch": 6.8543342269883825, + "grad_norm": 0.017333984375, + "learning_rate": 0.024931091727586986, + "loss": 0.7831, + "num_input_tokens_seen": 26713000, + "step": 46020 + }, + { + "epoch": 6.855078939529342, + "grad_norm": 0.0162353515625, + "learning_rate": 0.02492963049674121, + "loss": 0.7824, + "num_input_tokens_seen": 26716232, + "step": 46025 + }, + { + "epoch": 6.855823652070301, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02492816909814442, + "loss": 0.7868, + "num_input_tokens_seen": 26719112, + "step": 46030 + }, + { + "epoch": 6.85656836461126, + "grad_norm": 0.0230712890625, + "learning_rate": 0.0249267075318213, + "loss": 0.7897, + "num_input_tokens_seen": 26721864, + "step": 46035 + }, + { + "epoch": 6.857313077152219, + "grad_norm": 0.033447265625, + "learning_rate": 0.02492524579779655, + "loss": 0.7942, + "num_input_tokens_seen": 26725256, + "step": 46040 + }, + { + "epoch": 6.858057789693179, + "grad_norm": 0.01953125, + "learning_rate": 0.02492378389609486, + "loss": 0.8176, + "num_input_tokens_seen": 26727944, + "step": 46045 + }, + { + "epoch": 6.858802502234138, + "grad_norm": 0.02734375, + "learning_rate": 0.024922321826740923, + "loss": 0.8038, + "num_input_tokens_seen": 26730600, + "step": 46050 + }, + { + "epoch": 6.859547214775096, + "grad_norm": 0.0235595703125, + "learning_rate": 0.024920859589759443, + "loss": 0.8003, + "num_input_tokens_seen": 26733576, + "step": 46055 + }, + { + "epoch": 6.860291927316056, + "grad_norm": 0.0191650390625, + "learning_rate": 0.024919397185175128, + "loss": 0.7897, + "num_input_tokens_seen": 26736424, + "step": 46060 + }, + { + "epoch": 6.861036639857015, + "grad_norm": 0.0296630859375, + "learning_rate": 0.024917934613012675, + "loss": 0.7939, + "num_input_tokens_seen": 26739208, + "step": 46065 + }, + { + "epoch": 6.8617813523979745, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0249164718732968, + "loss": 0.8331, + "num_input_tokens_seen": 26742056, + "step": 46070 + }, + { + "epoch": 6.862526064938933, + "grad_norm": 0.019287109375, + "learning_rate": 0.024915008966052206, + "loss": 0.801, + "num_input_tokens_seen": 26744840, + "step": 46075 + }, + { + "epoch": 6.863270777479893, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02491354589130362, + "loss": 0.8233, + "num_input_tokens_seen": 26747688, + "step": 46080 + }, + { + "epoch": 6.864015490020852, + "grad_norm": 0.0245361328125, + "learning_rate": 0.02491208264907575, + "loss": 0.7941, + "num_input_tokens_seen": 26750408, + "step": 46085 + }, + { + "epoch": 6.864760202561811, + "grad_norm": 0.0267333984375, + "learning_rate": 0.024910619239393316, + "loss": 0.8061, + "num_input_tokens_seen": 26753096, + "step": 46090 + }, + { + "epoch": 6.86550491510277, + "grad_norm": 0.01214599609375, + "learning_rate": 0.02490915566228104, + "loss": 0.817, + "num_input_tokens_seen": 26755848, + "step": 46095 + }, + { + "epoch": 6.86624962764373, + "grad_norm": 0.0311279296875, + "learning_rate": 0.024907691917763652, + "loss": 0.7755, + "num_input_tokens_seen": 26758632, + "step": 46100 + }, + { + "epoch": 6.8669943401846885, + "grad_norm": 0.0172119140625, + "learning_rate": 0.024906228005865883, + "loss": 0.7728, + "num_input_tokens_seen": 26761096, + "step": 46105 + }, + { + "epoch": 6.867739052725648, + "grad_norm": 0.01904296875, + "learning_rate": 0.024904763926612457, + "loss": 0.7817, + "num_input_tokens_seen": 26764008, + "step": 46110 + }, + { + "epoch": 6.868483765266607, + "grad_norm": 0.0128173828125, + "learning_rate": 0.024903299680028107, + "loss": 0.8034, + "num_input_tokens_seen": 26766824, + "step": 46115 + }, + { + "epoch": 6.8692284778075665, + "grad_norm": 0.018798828125, + "learning_rate": 0.024901835266137585, + "loss": 0.7963, + "num_input_tokens_seen": 26769544, + "step": 46120 + }, + { + "epoch": 6.869973190348525, + "grad_norm": 0.015380859375, + "learning_rate": 0.02490037068496561, + "loss": 0.8186, + "num_input_tokens_seen": 26772200, + "step": 46125 + }, + { + "epoch": 6.870717902889485, + "grad_norm": 0.02490234375, + "learning_rate": 0.02489890593653694, + "loss": 0.8274, + "num_input_tokens_seen": 26775144, + "step": 46130 + }, + { + "epoch": 6.871462615430444, + "grad_norm": 0.0196533203125, + "learning_rate": 0.024897441020876314, + "loss": 0.8194, + "num_input_tokens_seen": 26778024, + "step": 46135 + }, + { + "epoch": 6.872207327971403, + "grad_norm": 0.02197265625, + "learning_rate": 0.024895975938008477, + "loss": 0.8013, + "num_input_tokens_seen": 26781000, + "step": 46140 + }, + { + "epoch": 6.872952040512362, + "grad_norm": 0.0152587890625, + "learning_rate": 0.02489451068795819, + "loss": 0.7849, + "num_input_tokens_seen": 26783944, + "step": 46145 + }, + { + "epoch": 6.873696753053322, + "grad_norm": 0.01953125, + "learning_rate": 0.024893045270750196, + "loss": 0.7826, + "num_input_tokens_seen": 26786728, + "step": 46150 + }, + { + "epoch": 6.8744414655942805, + "grad_norm": 0.0257568359375, + "learning_rate": 0.02489157968640926, + "loss": 0.8108, + "num_input_tokens_seen": 26789896, + "step": 46155 + }, + { + "epoch": 6.87518617813524, + "grad_norm": 0.018310546875, + "learning_rate": 0.02489011393496014, + "loss": 0.7984, + "num_input_tokens_seen": 26792648, + "step": 46160 + }, + { + "epoch": 6.875930890676199, + "grad_norm": 0.03125, + "learning_rate": 0.024888648016427595, + "loss": 0.7661, + "num_input_tokens_seen": 26795272, + "step": 46165 + }, + { + "epoch": 6.8766756032171585, + "grad_norm": 0.0223388671875, + "learning_rate": 0.024887181930836392, + "loss": 0.818, + "num_input_tokens_seen": 26798376, + "step": 46170 + }, + { + "epoch": 6.877420315758117, + "grad_norm": 0.022705078125, + "learning_rate": 0.0248857156782113, + "loss": 0.804, + "num_input_tokens_seen": 26801416, + "step": 46175 + }, + { + "epoch": 6.878165028299077, + "grad_norm": 0.01470947265625, + "learning_rate": 0.024884249258577087, + "loss": 0.7982, + "num_input_tokens_seen": 26804104, + "step": 46180 + }, + { + "epoch": 6.878909740840036, + "grad_norm": 0.0211181640625, + "learning_rate": 0.024882782671958533, + "loss": 0.8166, + "num_input_tokens_seen": 26806984, + "step": 46185 + }, + { + "epoch": 6.879654453380995, + "grad_norm": 0.02490234375, + "learning_rate": 0.024881315918380404, + "loss": 0.7997, + "num_input_tokens_seen": 26809768, + "step": 46190 + }, + { + "epoch": 6.880399165921954, + "grad_norm": 0.0380859375, + "learning_rate": 0.024879848997867493, + "loss": 0.8181, + "num_input_tokens_seen": 26812744, + "step": 46195 + }, + { + "epoch": 6.881143878462913, + "grad_norm": 0.02197265625, + "learning_rate": 0.024878381910444565, + "loss": 0.7793, + "num_input_tokens_seen": 26815496, + "step": 46200 + }, + { + "epoch": 6.8818885910038725, + "grad_norm": 0.0208740234375, + "learning_rate": 0.024876914656136426, + "loss": 0.8026, + "num_input_tokens_seen": 26818536, + "step": 46205 + }, + { + "epoch": 6.882633303544832, + "grad_norm": 0.016845703125, + "learning_rate": 0.024875447234967842, + "loss": 0.818, + "num_input_tokens_seen": 26821352, + "step": 46210 + }, + { + "epoch": 6.883378016085791, + "grad_norm": 0.0238037109375, + "learning_rate": 0.024873979646963622, + "loss": 0.7815, + "num_input_tokens_seen": 26824072, + "step": 46215 + }, + { + "epoch": 6.88412272862675, + "grad_norm": 0.0128173828125, + "learning_rate": 0.024872511892148546, + "loss": 0.8126, + "num_input_tokens_seen": 26826984, + "step": 46220 + }, + { + "epoch": 6.884867441167709, + "grad_norm": 0.0179443359375, + "learning_rate": 0.02487104397054742, + "loss": 0.7929, + "num_input_tokens_seen": 26829864, + "step": 46225 + }, + { + "epoch": 6.885612153708668, + "grad_norm": 0.01806640625, + "learning_rate": 0.024869575882185035, + "loss": 0.8286, + "num_input_tokens_seen": 26832616, + "step": 46230 + }, + { + "epoch": 6.886356866249628, + "grad_norm": 0.0152587890625, + "learning_rate": 0.024868107627086196, + "loss": 0.8495, + "num_input_tokens_seen": 26835528, + "step": 46235 + }, + { + "epoch": 6.8871015787905865, + "grad_norm": 0.01519775390625, + "learning_rate": 0.02486663920527571, + "loss": 0.8286, + "num_input_tokens_seen": 26838632, + "step": 46240 + }, + { + "epoch": 6.887846291331546, + "grad_norm": 0.0208740234375, + "learning_rate": 0.024865170616778384, + "loss": 0.7947, + "num_input_tokens_seen": 26841608, + "step": 46245 + }, + { + "epoch": 6.888591003872505, + "grad_norm": 0.01373291015625, + "learning_rate": 0.02486370186161902, + "loss": 0.8066, + "num_input_tokens_seen": 26844616, + "step": 46250 + }, + { + "epoch": 6.8893357164134645, + "grad_norm": 0.02001953125, + "learning_rate": 0.024862232939822447, + "loss": 0.8061, + "num_input_tokens_seen": 26847560, + "step": 46255 + }, + { + "epoch": 6.890080428954423, + "grad_norm": 0.0263671875, + "learning_rate": 0.024860763851413465, + "loss": 0.7806, + "num_input_tokens_seen": 26850248, + "step": 46260 + }, + { + "epoch": 6.890825141495383, + "grad_norm": 0.0198974609375, + "learning_rate": 0.024859294596416903, + "loss": 0.7814, + "num_input_tokens_seen": 26853288, + "step": 46265 + }, + { + "epoch": 6.891569854036342, + "grad_norm": 0.03173828125, + "learning_rate": 0.024857825174857583, + "loss": 0.807, + "num_input_tokens_seen": 26856040, + "step": 46270 + }, + { + "epoch": 6.892314566577301, + "grad_norm": 0.01361083984375, + "learning_rate": 0.02485635558676032, + "loss": 0.8242, + "num_input_tokens_seen": 26858888, + "step": 46275 + }, + { + "epoch": 6.89305927911826, + "grad_norm": 0.0341796875, + "learning_rate": 0.024854885832149946, + "loss": 0.7983, + "num_input_tokens_seen": 26861896, + "step": 46280 + }, + { + "epoch": 6.89380399165922, + "grad_norm": 0.0252685546875, + "learning_rate": 0.024853415911051297, + "loss": 0.7972, + "num_input_tokens_seen": 26864520, + "step": 46285 + }, + { + "epoch": 6.8945487042001785, + "grad_norm": 0.0302734375, + "learning_rate": 0.0248519458234892, + "loss": 0.8001, + "num_input_tokens_seen": 26867496, + "step": 46290 + }, + { + "epoch": 6.895293416741138, + "grad_norm": 0.0272216796875, + "learning_rate": 0.024850475569488488, + "loss": 0.8029, + "num_input_tokens_seen": 26870248, + "step": 46295 + }, + { + "epoch": 6.896038129282097, + "grad_norm": 0.023681640625, + "learning_rate": 0.024849005149074007, + "loss": 0.8155, + "num_input_tokens_seen": 26873032, + "step": 46300 + }, + { + "epoch": 6.896782841823057, + "grad_norm": 0.0228271484375, + "learning_rate": 0.02484753456227059, + "loss": 0.7964, + "num_input_tokens_seen": 26876200, + "step": 46305 + }, + { + "epoch": 6.897527554364015, + "grad_norm": 0.0228271484375, + "learning_rate": 0.024846063809103088, + "loss": 0.7769, + "num_input_tokens_seen": 26878952, + "step": 46310 + }, + { + "epoch": 6.898272266904975, + "grad_norm": 0.0233154296875, + "learning_rate": 0.024844592889596343, + "loss": 0.7836, + "num_input_tokens_seen": 26882024, + "step": 46315 + }, + { + "epoch": 6.899016979445934, + "grad_norm": 0.026123046875, + "learning_rate": 0.024843121803775205, + "loss": 0.8276, + "num_input_tokens_seen": 26884936, + "step": 46320 + }, + { + "epoch": 6.899761691986893, + "grad_norm": 0.0277099609375, + "learning_rate": 0.024841650551664533, + "loss": 0.8137, + "num_input_tokens_seen": 26887944, + "step": 46325 + }, + { + "epoch": 6.900506404527852, + "grad_norm": 0.01904296875, + "learning_rate": 0.024840179133289175, + "loss": 0.7907, + "num_input_tokens_seen": 26891304, + "step": 46330 + }, + { + "epoch": 6.901251117068812, + "grad_norm": 0.0255126953125, + "learning_rate": 0.024838707548673993, + "loss": 0.8002, + "num_input_tokens_seen": 26894152, + "step": 46335 + }, + { + "epoch": 6.9019958296097705, + "grad_norm": 0.0291748046875, + "learning_rate": 0.024837235797843846, + "loss": 0.7823, + "num_input_tokens_seen": 26896936, + "step": 46340 + }, + { + "epoch": 6.90274054215073, + "grad_norm": 0.012939453125, + "learning_rate": 0.0248357638808236, + "loss": 0.8009, + "num_input_tokens_seen": 26899752, + "step": 46345 + }, + { + "epoch": 6.903485254691689, + "grad_norm": 0.0252685546875, + "learning_rate": 0.024834291797638113, + "loss": 0.7843, + "num_input_tokens_seen": 26902696, + "step": 46350 + }, + { + "epoch": 6.904229967232649, + "grad_norm": 0.021240234375, + "learning_rate": 0.02483281954831227, + "loss": 0.7857, + "num_input_tokens_seen": 26906856, + "step": 46355 + }, + { + "epoch": 6.904974679773607, + "grad_norm": 0.0205078125, + "learning_rate": 0.02483134713287093, + "loss": 0.8153, + "num_input_tokens_seen": 26909608, + "step": 46360 + }, + { + "epoch": 6.905719392314566, + "grad_norm": 0.019775390625, + "learning_rate": 0.024829874551338973, + "loss": 0.8014, + "num_input_tokens_seen": 26912488, + "step": 46365 + }, + { + "epoch": 6.906464104855526, + "grad_norm": 0.0206298828125, + "learning_rate": 0.024828401803741275, + "loss": 0.783, + "num_input_tokens_seen": 26915464, + "step": 46370 + }, + { + "epoch": 6.907208817396485, + "grad_norm": 0.0361328125, + "learning_rate": 0.02482692889010272, + "loss": 0.7912, + "num_input_tokens_seen": 26918824, + "step": 46375 + }, + { + "epoch": 6.907953529937444, + "grad_norm": 0.0184326171875, + "learning_rate": 0.024825455810448187, + "loss": 0.8109, + "num_input_tokens_seen": 26921896, + "step": 46380 + }, + { + "epoch": 6.908698242478403, + "grad_norm": 0.019775390625, + "learning_rate": 0.024823982564802564, + "loss": 0.7991, + "num_input_tokens_seen": 26924520, + "step": 46385 + }, + { + "epoch": 6.909442955019363, + "grad_norm": 0.02685546875, + "learning_rate": 0.02482250915319074, + "loss": 0.7944, + "num_input_tokens_seen": 26927016, + "step": 46390 + }, + { + "epoch": 6.910187667560322, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02482103557563761, + "loss": 0.7928, + "num_input_tokens_seen": 26929736, + "step": 46395 + }, + { + "epoch": 6.910932380101281, + "grad_norm": 0.015869140625, + "learning_rate": 0.024819561832168064, + "loss": 0.8264, + "num_input_tokens_seen": 26932712, + "step": 46400 + }, + { + "epoch": 6.91167709264224, + "grad_norm": 0.0262451171875, + "learning_rate": 0.024818087922807, + "loss": 0.8134, + "num_input_tokens_seen": 26935720, + "step": 46405 + }, + { + "epoch": 6.912421805183199, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02481661384757932, + "loss": 0.8281, + "num_input_tokens_seen": 26938728, + "step": 46410 + }, + { + "epoch": 6.913166517724158, + "grad_norm": 0.01519775390625, + "learning_rate": 0.024815139606509926, + "loss": 0.8056, + "num_input_tokens_seen": 26941608, + "step": 46415 + }, + { + "epoch": 6.913911230265118, + "grad_norm": 0.0211181640625, + "learning_rate": 0.024813665199623726, + "loss": 0.8037, + "num_input_tokens_seen": 26944488, + "step": 46420 + }, + { + "epoch": 6.9146559428060765, + "grad_norm": 0.04345703125, + "learning_rate": 0.024812190626945622, + "loss": 0.7946, + "num_input_tokens_seen": 26947272, + "step": 46425 + }, + { + "epoch": 6.915400655347036, + "grad_norm": 0.024658203125, + "learning_rate": 0.02481071588850053, + "loss": 0.8141, + "num_input_tokens_seen": 26950088, + "step": 46430 + }, + { + "epoch": 6.916145367887995, + "grad_norm": 0.0299072265625, + "learning_rate": 0.02480924098431337, + "loss": 0.7973, + "num_input_tokens_seen": 26952904, + "step": 46435 + }, + { + "epoch": 6.916890080428955, + "grad_norm": 0.0294189453125, + "learning_rate": 0.02480776591440905, + "loss": 0.8011, + "num_input_tokens_seen": 26956040, + "step": 46440 + }, + { + "epoch": 6.917634792969913, + "grad_norm": 0.043212890625, + "learning_rate": 0.02480629067881249, + "loss": 0.7758, + "num_input_tokens_seen": 26958760, + "step": 46445 + }, + { + "epoch": 6.918379505510873, + "grad_norm": 0.033935546875, + "learning_rate": 0.024804815277548624, + "loss": 0.7878, + "num_input_tokens_seen": 26962056, + "step": 46450 + }, + { + "epoch": 6.919124218051832, + "grad_norm": 0.0286865234375, + "learning_rate": 0.02480333971064236, + "loss": 0.8101, + "num_input_tokens_seen": 26964936, + "step": 46455 + }, + { + "epoch": 6.919868930592791, + "grad_norm": 0.0296630859375, + "learning_rate": 0.02480186397811864, + "loss": 0.7988, + "num_input_tokens_seen": 26967848, + "step": 46460 + }, + { + "epoch": 6.92061364313375, + "grad_norm": 0.0240478515625, + "learning_rate": 0.024800388080002393, + "loss": 0.8045, + "num_input_tokens_seen": 26970504, + "step": 46465 + }, + { + "epoch": 6.92135835567471, + "grad_norm": 0.0172119140625, + "learning_rate": 0.024798912016318547, + "loss": 0.799, + "num_input_tokens_seen": 26973384, + "step": 46470 + }, + { + "epoch": 6.922103068215669, + "grad_norm": 0.016357421875, + "learning_rate": 0.024797435787092037, + "loss": 0.7884, + "num_input_tokens_seen": 26976392, + "step": 46475 + }, + { + "epoch": 6.922847780756628, + "grad_norm": 0.024169921875, + "learning_rate": 0.024795959392347813, + "loss": 0.8013, + "num_input_tokens_seen": 26978984, + "step": 46480 + }, + { + "epoch": 6.923592493297587, + "grad_norm": 0.0277099609375, + "learning_rate": 0.02479448283211081, + "loss": 0.7969, + "num_input_tokens_seen": 26981768, + "step": 46485 + }, + { + "epoch": 6.924337205838547, + "grad_norm": 0.0262451171875, + "learning_rate": 0.024793006106405974, + "loss": 0.786, + "num_input_tokens_seen": 26984744, + "step": 46490 + }, + { + "epoch": 6.925081918379505, + "grad_norm": 0.031494140625, + "learning_rate": 0.024791529215258252, + "loss": 0.826, + "num_input_tokens_seen": 26987944, + "step": 46495 + }, + { + "epoch": 6.925826630920465, + "grad_norm": 0.03125, + "learning_rate": 0.024790052158692597, + "loss": 0.8117, + "num_input_tokens_seen": 26990696, + "step": 46500 + }, + { + "epoch": 6.926571343461424, + "grad_norm": 0.039306640625, + "learning_rate": 0.024788574936733962, + "loss": 0.7975, + "num_input_tokens_seen": 26993640, + "step": 46505 + }, + { + "epoch": 6.927316056002383, + "grad_norm": 0.01202392578125, + "learning_rate": 0.0247870975494073, + "loss": 0.7833, + "num_input_tokens_seen": 26996392, + "step": 46510 + }, + { + "epoch": 6.928060768543342, + "grad_norm": 0.04150390625, + "learning_rate": 0.02478561999673757, + "loss": 0.8279, + "num_input_tokens_seen": 26999272, + "step": 46515 + }, + { + "epoch": 6.928805481084302, + "grad_norm": 0.01904296875, + "learning_rate": 0.024784142278749737, + "loss": 0.8039, + "num_input_tokens_seen": 27001960, + "step": 46520 + }, + { + "epoch": 6.929550193625261, + "grad_norm": 0.0179443359375, + "learning_rate": 0.024782664395468765, + "loss": 0.7943, + "num_input_tokens_seen": 27004872, + "step": 46525 + }, + { + "epoch": 6.930294906166219, + "grad_norm": 0.02294921875, + "learning_rate": 0.02478118634691962, + "loss": 0.8041, + "num_input_tokens_seen": 27007880, + "step": 46530 + }, + { + "epoch": 6.931039618707179, + "grad_norm": 0.0191650390625, + "learning_rate": 0.024779708133127278, + "loss": 0.7891, + "num_input_tokens_seen": 27010664, + "step": 46535 + }, + { + "epoch": 6.931784331248139, + "grad_norm": 0.0224609375, + "learning_rate": 0.024778229754116702, + "loss": 0.7906, + "num_input_tokens_seen": 27013672, + "step": 46540 + }, + { + "epoch": 6.932529043789097, + "grad_norm": 0.0272216796875, + "learning_rate": 0.02477675120991287, + "loss": 0.7981, + "num_input_tokens_seen": 27016648, + "step": 46545 + }, + { + "epoch": 6.933273756330056, + "grad_norm": 0.0234375, + "learning_rate": 0.02477527250054077, + "loss": 0.8045, + "num_input_tokens_seen": 27019432, + "step": 46550 + }, + { + "epoch": 6.934018468871016, + "grad_norm": 0.0262451171875, + "learning_rate": 0.024773793626025373, + "loss": 0.794, + "num_input_tokens_seen": 27022472, + "step": 46555 + }, + { + "epoch": 6.9347631814119755, + "grad_norm": 0.0262451171875, + "learning_rate": 0.024772314586391665, + "loss": 0.8171, + "num_input_tokens_seen": 27025384, + "step": 46560 + }, + { + "epoch": 6.935507893952934, + "grad_norm": 0.0228271484375, + "learning_rate": 0.024770835381664632, + "loss": 0.7931, + "num_input_tokens_seen": 27028328, + "step": 46565 + }, + { + "epoch": 6.936252606493893, + "grad_norm": 0.031494140625, + "learning_rate": 0.024769356011869272, + "loss": 0.7889, + "num_input_tokens_seen": 27030984, + "step": 46570 + }, + { + "epoch": 6.936997319034853, + "grad_norm": 0.026123046875, + "learning_rate": 0.02476787647703057, + "loss": 0.7848, + "num_input_tokens_seen": 27033960, + "step": 46575 + }, + { + "epoch": 6.937742031575811, + "grad_norm": 0.020263671875, + "learning_rate": 0.02476639677717352, + "loss": 0.7757, + "num_input_tokens_seen": 27036712, + "step": 46580 + }, + { + "epoch": 6.938486744116771, + "grad_norm": 0.0284423828125, + "learning_rate": 0.024764916912323125, + "loss": 0.795, + "num_input_tokens_seen": 27039368, + "step": 46585 + }, + { + "epoch": 6.93923145665773, + "grad_norm": 0.0186767578125, + "learning_rate": 0.024763436882504375, + "loss": 0.7955, + "num_input_tokens_seen": 27042280, + "step": 46590 + }, + { + "epoch": 6.939976169198689, + "grad_norm": 0.020263671875, + "learning_rate": 0.02476195668774229, + "loss": 0.7927, + "num_input_tokens_seen": 27045320, + "step": 46595 + }, + { + "epoch": 6.940720881739648, + "grad_norm": 0.031494140625, + "learning_rate": 0.02476047632806187, + "loss": 0.8071, + "num_input_tokens_seen": 27047912, + "step": 46600 + }, + { + "epoch": 6.941465594280608, + "grad_norm": 0.0286865234375, + "learning_rate": 0.02475899580348812, + "loss": 0.7865, + "num_input_tokens_seen": 27050888, + "step": 46605 + }, + { + "epoch": 6.942210306821567, + "grad_norm": 0.0216064453125, + "learning_rate": 0.024757515114046055, + "loss": 0.8065, + "num_input_tokens_seen": 27053992, + "step": 46610 + }, + { + "epoch": 6.942955019362526, + "grad_norm": 0.0203857421875, + "learning_rate": 0.024756034259760685, + "loss": 0.7589, + "num_input_tokens_seen": 27057096, + "step": 46615 + }, + { + "epoch": 6.943699731903485, + "grad_norm": 0.0213623046875, + "learning_rate": 0.024754553240657038, + "loss": 0.7887, + "num_input_tokens_seen": 27060072, + "step": 46620 + }, + { + "epoch": 6.944444444444445, + "grad_norm": 0.0184326171875, + "learning_rate": 0.024753072056760122, + "loss": 0.8005, + "num_input_tokens_seen": 27062856, + "step": 46625 + }, + { + "epoch": 6.945189156985403, + "grad_norm": 0.02734375, + "learning_rate": 0.02475159070809497, + "loss": 0.8194, + "num_input_tokens_seen": 27065544, + "step": 46630 + }, + { + "epoch": 6.945933869526363, + "grad_norm": 0.0341796875, + "learning_rate": 0.024750109194686603, + "loss": 0.7843, + "num_input_tokens_seen": 27068360, + "step": 46635 + }, + { + "epoch": 6.946678582067322, + "grad_norm": 0.02197265625, + "learning_rate": 0.024748627516560054, + "loss": 0.8289, + "num_input_tokens_seen": 27071080, + "step": 46640 + }, + { + "epoch": 6.9474232946082815, + "grad_norm": 0.019775390625, + "learning_rate": 0.024747145673740346, + "loss": 0.776, + "num_input_tokens_seen": 27073928, + "step": 46645 + }, + { + "epoch": 6.94816800714924, + "grad_norm": 0.0172119140625, + "learning_rate": 0.02474566366625252, + "loss": 0.8064, + "num_input_tokens_seen": 27077000, + "step": 46650 + }, + { + "epoch": 6.9489127196902, + "grad_norm": 0.0277099609375, + "learning_rate": 0.024744181494121612, + "loss": 0.7652, + "num_input_tokens_seen": 27079752, + "step": 46655 + }, + { + "epoch": 6.949657432231159, + "grad_norm": 0.0216064453125, + "learning_rate": 0.024742699157372663, + "loss": 0.7825, + "num_input_tokens_seen": 27082536, + "step": 46660 + }, + { + "epoch": 6.950402144772118, + "grad_norm": 0.04248046875, + "learning_rate": 0.024741216656030712, + "loss": 0.7823, + "num_input_tokens_seen": 27085160, + "step": 46665 + }, + { + "epoch": 6.951146857313077, + "grad_norm": 0.0216064453125, + "learning_rate": 0.024739733990120805, + "loss": 0.7911, + "num_input_tokens_seen": 27088392, + "step": 46670 + }, + { + "epoch": 6.951891569854037, + "grad_norm": 0.0191650390625, + "learning_rate": 0.024738251159667993, + "loss": 0.8407, + "num_input_tokens_seen": 27091240, + "step": 46675 + }, + { + "epoch": 6.952636282394995, + "grad_norm": 0.0302734375, + "learning_rate": 0.024736768164697324, + "loss": 0.793, + "num_input_tokens_seen": 27094344, + "step": 46680 + }, + { + "epoch": 6.953380994935955, + "grad_norm": 0.015625, + "learning_rate": 0.024735285005233854, + "loss": 0.7862, + "num_input_tokens_seen": 27097192, + "step": 46685 + }, + { + "epoch": 6.954125707476914, + "grad_norm": 0.0235595703125, + "learning_rate": 0.02473380168130264, + "loss": 0.823, + "num_input_tokens_seen": 27099976, + "step": 46690 + }, + { + "epoch": 6.9548704200178735, + "grad_norm": 0.0146484375, + "learning_rate": 0.024732318192928735, + "loss": 0.7965, + "num_input_tokens_seen": 27102792, + "step": 46695 + }, + { + "epoch": 6.955615132558832, + "grad_norm": 0.0177001953125, + "learning_rate": 0.02473083454013721, + "loss": 0.7945, + "num_input_tokens_seen": 27105640, + "step": 46700 + }, + { + "epoch": 6.956359845099792, + "grad_norm": 0.028076171875, + "learning_rate": 0.024729350722953118, + "loss": 0.8382, + "num_input_tokens_seen": 27108584, + "step": 46705 + }, + { + "epoch": 6.957104557640751, + "grad_norm": 0.013916015625, + "learning_rate": 0.024727866741401545, + "loss": 0.8016, + "num_input_tokens_seen": 27111560, + "step": 46710 + }, + { + "epoch": 6.957849270181709, + "grad_norm": 0.0230712890625, + "learning_rate": 0.02472638259550754, + "loss": 0.8105, + "num_input_tokens_seen": 27114472, + "step": 46715 + }, + { + "epoch": 6.958593982722669, + "grad_norm": 0.0274658203125, + "learning_rate": 0.024724898285296195, + "loss": 0.8107, + "num_input_tokens_seen": 27117000, + "step": 46720 + }, + { + "epoch": 6.959338695263629, + "grad_norm": 0.0228271484375, + "learning_rate": 0.02472341381079257, + "loss": 0.7848, + "num_input_tokens_seen": 27119784, + "step": 46725 + }, + { + "epoch": 6.9600834078045875, + "grad_norm": 0.0322265625, + "learning_rate": 0.024721929172021758, + "loss": 0.8292, + "num_input_tokens_seen": 27122760, + "step": 46730 + }, + { + "epoch": 6.960828120345546, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02472044436900883, + "loss": 0.7836, + "num_input_tokens_seen": 27125608, + "step": 46735 + }, + { + "epoch": 6.961572832886506, + "grad_norm": 0.031982421875, + "learning_rate": 0.024718959401778876, + "loss": 0.7822, + "num_input_tokens_seen": 27128328, + "step": 46740 + }, + { + "epoch": 6.962317545427465, + "grad_norm": 0.0174560546875, + "learning_rate": 0.024717474270356976, + "loss": 0.7966, + "num_input_tokens_seen": 27131496, + "step": 46745 + }, + { + "epoch": 6.963062257968424, + "grad_norm": 0.023681640625, + "learning_rate": 0.024715988974768226, + "loss": 0.7834, + "num_input_tokens_seen": 27134504, + "step": 46750 + }, + { + "epoch": 6.963806970509383, + "grad_norm": 0.02880859375, + "learning_rate": 0.02471450351503772, + "loss": 0.7849, + "num_input_tokens_seen": 27137480, + "step": 46755 + }, + { + "epoch": 6.964551683050343, + "grad_norm": 0.0311279296875, + "learning_rate": 0.024713017891190552, + "loss": 0.8196, + "num_input_tokens_seen": 27140680, + "step": 46760 + }, + { + "epoch": 6.965296395591301, + "grad_norm": 0.01318359375, + "learning_rate": 0.024711532103251815, + "loss": 0.7662, + "num_input_tokens_seen": 27143560, + "step": 46765 + }, + { + "epoch": 6.966041108132261, + "grad_norm": 0.0269775390625, + "learning_rate": 0.024710046151246615, + "loss": 0.838, + "num_input_tokens_seen": 27146344, + "step": 46770 + }, + { + "epoch": 6.96678582067322, + "grad_norm": 0.0201416015625, + "learning_rate": 0.024708560035200052, + "loss": 0.7875, + "num_input_tokens_seen": 27149448, + "step": 46775 + }, + { + "epoch": 6.9675305332141795, + "grad_norm": 0.0289306640625, + "learning_rate": 0.024707073755137237, + "loss": 0.7762, + "num_input_tokens_seen": 27152200, + "step": 46780 + }, + { + "epoch": 6.968275245755138, + "grad_norm": 0.028564453125, + "learning_rate": 0.024705587311083275, + "loss": 0.7846, + "num_input_tokens_seen": 27155048, + "step": 46785 + }, + { + "epoch": 6.969019958296098, + "grad_norm": 0.0230712890625, + "learning_rate": 0.02470410070306328, + "loss": 0.8056, + "num_input_tokens_seen": 27157800, + "step": 46790 + }, + { + "epoch": 6.969764670837057, + "grad_norm": 0.021728515625, + "learning_rate": 0.024702613931102364, + "loss": 0.7811, + "num_input_tokens_seen": 27160840, + "step": 46795 + }, + { + "epoch": 6.970509383378016, + "grad_norm": 0.0185546875, + "learning_rate": 0.024701126995225652, + "loss": 0.8296, + "num_input_tokens_seen": 27163560, + "step": 46800 + }, + { + "epoch": 6.971254095918975, + "grad_norm": 0.021484375, + "learning_rate": 0.02469963989545826, + "loss": 0.8308, + "num_input_tokens_seen": 27166312, + "step": 46805 + }, + { + "epoch": 6.971998808459935, + "grad_norm": 0.0167236328125, + "learning_rate": 0.02469815263182531, + "loss": 0.8619, + "num_input_tokens_seen": 27169448, + "step": 46810 + }, + { + "epoch": 6.9727435210008935, + "grad_norm": 0.0184326171875, + "learning_rate": 0.02469666520435192, + "loss": 0.7908, + "num_input_tokens_seen": 27172168, + "step": 46815 + }, + { + "epoch": 6.973488233541853, + "grad_norm": 0.0247802734375, + "learning_rate": 0.024695177613063234, + "loss": 0.8246, + "num_input_tokens_seen": 27174984, + "step": 46820 + }, + { + "epoch": 6.974232946082812, + "grad_norm": 0.0262451171875, + "learning_rate": 0.024693689857984376, + "loss": 0.8095, + "num_input_tokens_seen": 27177896, + "step": 46825 + }, + { + "epoch": 6.9749776586237715, + "grad_norm": 0.0206298828125, + "learning_rate": 0.024692201939140474, + "loss": 0.8212, + "num_input_tokens_seen": 27180808, + "step": 46830 + }, + { + "epoch": 6.97572237116473, + "grad_norm": 0.019775390625, + "learning_rate": 0.02469071385655668, + "loss": 0.7986, + "num_input_tokens_seen": 27183496, + "step": 46835 + }, + { + "epoch": 6.97646708370569, + "grad_norm": 0.01171875, + "learning_rate": 0.024689225610258114, + "loss": 0.7858, + "num_input_tokens_seen": 27186440, + "step": 46840 + }, + { + "epoch": 6.977211796246649, + "grad_norm": 0.0162353515625, + "learning_rate": 0.024687737200269937, + "loss": 0.7928, + "num_input_tokens_seen": 27189352, + "step": 46845 + }, + { + "epoch": 6.977956508787608, + "grad_norm": 0.0220947265625, + "learning_rate": 0.024686248626617287, + "loss": 0.7969, + "num_input_tokens_seen": 27192072, + "step": 46850 + }, + { + "epoch": 6.978701221328567, + "grad_norm": 0.034912109375, + "learning_rate": 0.024684759889325305, + "loss": 0.8177, + "num_input_tokens_seen": 27194984, + "step": 46855 + }, + { + "epoch": 6.979445933869527, + "grad_norm": 0.0186767578125, + "learning_rate": 0.02468327098841915, + "loss": 0.8073, + "num_input_tokens_seen": 27197832, + "step": 46860 + }, + { + "epoch": 6.9801906464104855, + "grad_norm": 0.030517578125, + "learning_rate": 0.024681781923923975, + "loss": 0.7964, + "num_input_tokens_seen": 27200712, + "step": 46865 + }, + { + "epoch": 6.980935358951445, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02468029269586493, + "loss": 0.7975, + "num_input_tokens_seen": 27203432, + "step": 46870 + }, + { + "epoch": 6.981680071492404, + "grad_norm": 0.020751953125, + "learning_rate": 0.024678803304267184, + "loss": 0.8022, + "num_input_tokens_seen": 27206152, + "step": 46875 + }, + { + "epoch": 6.982424784033363, + "grad_norm": 0.022705078125, + "learning_rate": 0.024677313749155893, + "loss": 0.7957, + "num_input_tokens_seen": 27209032, + "step": 46880 + }, + { + "epoch": 6.983169496574322, + "grad_norm": 0.01300048828125, + "learning_rate": 0.024675824030556216, + "loss": 0.813, + "num_input_tokens_seen": 27211752, + "step": 46885 + }, + { + "epoch": 6.983914209115282, + "grad_norm": 0.0291748046875, + "learning_rate": 0.024674334148493335, + "loss": 0.8021, + "num_input_tokens_seen": 27214600, + "step": 46890 + }, + { + "epoch": 6.984658921656241, + "grad_norm": 0.01611328125, + "learning_rate": 0.024672844102992402, + "loss": 0.8088, + "num_input_tokens_seen": 27217864, + "step": 46895 + }, + { + "epoch": 6.9854036341971995, + "grad_norm": 0.0205078125, + "learning_rate": 0.02467135389407861, + "loss": 0.8065, + "num_input_tokens_seen": 27220712, + "step": 46900 + }, + { + "epoch": 6.986148346738159, + "grad_norm": 0.01239013671875, + "learning_rate": 0.02466986352177711, + "loss": 0.7896, + "num_input_tokens_seen": 27223976, + "step": 46905 + }, + { + "epoch": 6.986893059279119, + "grad_norm": 0.0146484375, + "learning_rate": 0.024668372986113102, + "loss": 0.8062, + "num_input_tokens_seen": 27226728, + "step": 46910 + }, + { + "epoch": 6.9876377718200775, + "grad_norm": 0.019775390625, + "learning_rate": 0.024666882287111756, + "loss": 0.7815, + "num_input_tokens_seen": 27229544, + "step": 46915 + }, + { + "epoch": 6.988382484361036, + "grad_norm": 0.01165771484375, + "learning_rate": 0.02466539142479826, + "loss": 0.8142, + "num_input_tokens_seen": 27232392, + "step": 46920 + }, + { + "epoch": 6.989127196901996, + "grad_norm": 0.028076171875, + "learning_rate": 0.024663900399197802, + "loss": 0.8046, + "num_input_tokens_seen": 27235272, + "step": 46925 + }, + { + "epoch": 6.989871909442955, + "grad_norm": 0.022216796875, + "learning_rate": 0.024662409210335565, + "loss": 0.8133, + "num_input_tokens_seen": 27238312, + "step": 46930 + }, + { + "epoch": 6.990616621983914, + "grad_norm": 0.021484375, + "learning_rate": 0.024660917858236747, + "loss": 0.8031, + "num_input_tokens_seen": 27241096, + "step": 46935 + }, + { + "epoch": 6.991361334524873, + "grad_norm": 0.0234375, + "learning_rate": 0.024659426342926536, + "loss": 0.7813, + "num_input_tokens_seen": 27244040, + "step": 46940 + }, + { + "epoch": 6.992106047065833, + "grad_norm": 0.0255126953125, + "learning_rate": 0.02465793466443014, + "loss": 0.7834, + "num_input_tokens_seen": 27246760, + "step": 46945 + }, + { + "epoch": 6.9928507596067915, + "grad_norm": 0.01348876953125, + "learning_rate": 0.024656442822772753, + "loss": 0.7749, + "num_input_tokens_seen": 27249864, + "step": 46950 + }, + { + "epoch": 6.993595472147751, + "grad_norm": 0.0186767578125, + "learning_rate": 0.02465495081797958, + "loss": 0.7956, + "num_input_tokens_seen": 27253000, + "step": 46955 + }, + { + "epoch": 6.99434018468871, + "grad_norm": 0.026123046875, + "learning_rate": 0.02465345865007582, + "loss": 0.8189, + "num_input_tokens_seen": 27255848, + "step": 46960 + }, + { + "epoch": 6.9950848972296695, + "grad_norm": 0.0186767578125, + "learning_rate": 0.02465196631908669, + "loss": 0.7817, + "num_input_tokens_seen": 27258728, + "step": 46965 + }, + { + "epoch": 6.995829609770628, + "grad_norm": 0.021484375, + "learning_rate": 0.024650473825037402, + "loss": 0.7814, + "num_input_tokens_seen": 27261832, + "step": 46970 + }, + { + "epoch": 6.996574322311588, + "grad_norm": 0.019287109375, + "learning_rate": 0.024648981167953162, + "loss": 0.784, + "num_input_tokens_seen": 27264776, + "step": 46975 + }, + { + "epoch": 6.997319034852547, + "grad_norm": 0.0162353515625, + "learning_rate": 0.024647488347859195, + "loss": 0.8158, + "num_input_tokens_seen": 27268008, + "step": 46980 + }, + { + "epoch": 6.998063747393506, + "grad_norm": 0.01531982421875, + "learning_rate": 0.02464599536478072, + "loss": 0.8042, + "num_input_tokens_seen": 27271176, + "step": 46985 + }, + { + "epoch": 6.998808459934465, + "grad_norm": 0.02734375, + "learning_rate": 0.024644502218742953, + "loss": 0.8249, + "num_input_tokens_seen": 27274408, + "step": 46990 + }, + { + "epoch": 6.999553172475425, + "grad_norm": 0.02783203125, + "learning_rate": 0.024643008909771124, + "loss": 0.8242, + "num_input_tokens_seen": 27277416, + "step": 46995 + }, + { + "epoch": 7.0, + "eval_loss": 0.80185866355896, + "eval_runtime": 70.7775, + "eval_samples_per_second": 42.16, + "eval_steps_per_second": 10.54, + "num_input_tokens_seen": 27278720, + "step": 46998 + }, + { + "epoch": 7.0002978850163835, + "grad_norm": 0.01409912109375, + "learning_rate": 0.02464151543789046, + "loss": 0.8143, + "num_input_tokens_seen": 27279936, + "step": 47000 + }, + { + "epoch": 7.001042597557343, + "grad_norm": 0.0322265625, + "learning_rate": 0.024640021803126196, + "loss": 0.8251, + "num_input_tokens_seen": 27282816, + "step": 47005 + }, + { + "epoch": 7.001787310098302, + "grad_norm": 0.0252685546875, + "learning_rate": 0.024638528005503556, + "loss": 0.8191, + "num_input_tokens_seen": 27285568, + "step": 47010 + }, + { + "epoch": 7.0025320226392616, + "grad_norm": 0.0260009765625, + "learning_rate": 0.024637034045047784, + "loss": 0.7959, + "num_input_tokens_seen": 27288736, + "step": 47015 + }, + { + "epoch": 7.00327673518022, + "grad_norm": 0.0185546875, + "learning_rate": 0.024635539921784116, + "loss": 0.8058, + "num_input_tokens_seen": 27291392, + "step": 47020 + }, + { + "epoch": 7.00402144772118, + "grad_norm": 0.0277099609375, + "learning_rate": 0.024634045635737793, + "loss": 0.8181, + "num_input_tokens_seen": 27294080, + "step": 47025 + }, + { + "epoch": 7.004766160262139, + "grad_norm": 0.0244140625, + "learning_rate": 0.02463255118693406, + "loss": 0.8061, + "num_input_tokens_seen": 27297024, + "step": 47030 + }, + { + "epoch": 7.005510872803098, + "grad_norm": 0.0211181640625, + "learning_rate": 0.024631056575398172, + "loss": 0.8015, + "num_input_tokens_seen": 27300288, + "step": 47035 + }, + { + "epoch": 7.006255585344057, + "grad_norm": 0.0235595703125, + "learning_rate": 0.024629561801155362, + "loss": 0.7943, + "num_input_tokens_seen": 27303296, + "step": 47040 + }, + { + "epoch": 7.007000297885017, + "grad_norm": 0.03759765625, + "learning_rate": 0.024628066864230896, + "loss": 0.8165, + "num_input_tokens_seen": 27306336, + "step": 47045 + }, + { + "epoch": 7.0077450104259755, + "grad_norm": 0.01904296875, + "learning_rate": 0.024626571764650027, + "loss": 0.8113, + "num_input_tokens_seen": 27309280, + "step": 47050 + }, + { + "epoch": 7.008489722966935, + "grad_norm": 0.01611328125, + "learning_rate": 0.024625076502438014, + "loss": 0.8226, + "num_input_tokens_seen": 27312096, + "step": 47055 + }, + { + "epoch": 7.009234435507894, + "grad_norm": 0.0269775390625, + "learning_rate": 0.024623581077620114, + "loss": 0.815, + "num_input_tokens_seen": 27314976, + "step": 47060 + }, + { + "epoch": 7.009979148048854, + "grad_norm": 0.02099609375, + "learning_rate": 0.02462208549022159, + "loss": 0.7977, + "num_input_tokens_seen": 27317824, + "step": 47065 + }, + { + "epoch": 7.010723860589812, + "grad_norm": 0.020263671875, + "learning_rate": 0.024620589740267713, + "loss": 0.8132, + "num_input_tokens_seen": 27320608, + "step": 47070 + }, + { + "epoch": 7.011468573130771, + "grad_norm": 0.041259765625, + "learning_rate": 0.02461909382778375, + "loss": 0.8209, + "num_input_tokens_seen": 27324032, + "step": 47075 + }, + { + "epoch": 7.012213285671731, + "grad_norm": 0.0289306640625, + "learning_rate": 0.024617597752794974, + "loss": 0.7982, + "num_input_tokens_seen": 27327008, + "step": 47080 + }, + { + "epoch": 7.0129579982126895, + "grad_norm": 0.01397705078125, + "learning_rate": 0.024616101515326658, + "loss": 0.7927, + "num_input_tokens_seen": 27329664, + "step": 47085 + }, + { + "epoch": 7.013702710753649, + "grad_norm": 0.0228271484375, + "learning_rate": 0.02461460511540408, + "loss": 0.8121, + "num_input_tokens_seen": 27332960, + "step": 47090 + }, + { + "epoch": 7.014447423294608, + "grad_norm": 0.0361328125, + "learning_rate": 0.024613108553052522, + "loss": 0.8016, + "num_input_tokens_seen": 27336032, + "step": 47095 + }, + { + "epoch": 7.0151921358355676, + "grad_norm": 0.0184326171875, + "learning_rate": 0.024611611828297266, + "loss": 0.8038, + "num_input_tokens_seen": 27338944, + "step": 47100 + }, + { + "epoch": 7.015936848376526, + "grad_norm": 0.0142822265625, + "learning_rate": 0.024610114941163594, + "loss": 0.8052, + "num_input_tokens_seen": 27341632, + "step": 47105 + }, + { + "epoch": 7.016681560917486, + "grad_norm": 0.013916015625, + "learning_rate": 0.024608617891676798, + "loss": 0.7876, + "num_input_tokens_seen": 27344352, + "step": 47110 + }, + { + "epoch": 7.017426273458445, + "grad_norm": 0.03369140625, + "learning_rate": 0.02460712067986217, + "loss": 0.8065, + "num_input_tokens_seen": 27347520, + "step": 47115 + }, + { + "epoch": 7.018170985999404, + "grad_norm": 0.0233154296875, + "learning_rate": 0.024605623305745006, + "loss": 0.7948, + "num_input_tokens_seen": 27350208, + "step": 47120 + }, + { + "epoch": 7.018915698540363, + "grad_norm": 0.0159912109375, + "learning_rate": 0.02460412576935059, + "loss": 0.8037, + "num_input_tokens_seen": 27352960, + "step": 47125 + }, + { + "epoch": 7.019660411081323, + "grad_norm": 0.0274658203125, + "learning_rate": 0.024602628070704236, + "loss": 0.8054, + "num_input_tokens_seen": 27355584, + "step": 47130 + }, + { + "epoch": 7.0204051236222815, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02460113020983124, + "loss": 0.8102, + "num_input_tokens_seen": 27358624, + "step": 47135 + }, + { + "epoch": 7.021149836163241, + "grad_norm": 0.02099609375, + "learning_rate": 0.02459963218675691, + "loss": 0.8065, + "num_input_tokens_seen": 27361440, + "step": 47140 + }, + { + "epoch": 7.0218945487042, + "grad_norm": 0.0140380859375, + "learning_rate": 0.024598134001506546, + "loss": 0.8133, + "num_input_tokens_seen": 27364320, + "step": 47145 + }, + { + "epoch": 7.02263926124516, + "grad_norm": 0.0174560546875, + "learning_rate": 0.024596635654105463, + "loss": 0.7903, + "num_input_tokens_seen": 27367136, + "step": 47150 + }, + { + "epoch": 7.023383973786118, + "grad_norm": 0.0281982421875, + "learning_rate": 0.024595137144578973, + "loss": 0.7849, + "num_input_tokens_seen": 27370432, + "step": 47155 + }, + { + "epoch": 7.024128686327078, + "grad_norm": 0.034912109375, + "learning_rate": 0.024593638472952397, + "loss": 0.82, + "num_input_tokens_seen": 27373664, + "step": 47160 + }, + { + "epoch": 7.024873398868037, + "grad_norm": 0.0155029296875, + "learning_rate": 0.02459213963925105, + "loss": 0.7968, + "num_input_tokens_seen": 27376448, + "step": 47165 + }, + { + "epoch": 7.025618111408996, + "grad_norm": 0.0252685546875, + "learning_rate": 0.02459064064350025, + "loss": 0.8018, + "num_input_tokens_seen": 27379392, + "step": 47170 + }, + { + "epoch": 7.026362823949955, + "grad_norm": 0.01312255859375, + "learning_rate": 0.024589141485725326, + "loss": 0.8048, + "num_input_tokens_seen": 27382240, + "step": 47175 + }, + { + "epoch": 7.027107536490915, + "grad_norm": 0.01092529296875, + "learning_rate": 0.024587642165951603, + "loss": 0.8081, + "num_input_tokens_seen": 27385472, + "step": 47180 + }, + { + "epoch": 7.0278522490318736, + "grad_norm": 0.032958984375, + "learning_rate": 0.024586142684204407, + "loss": 0.8034, + "num_input_tokens_seen": 27388256, + "step": 47185 + }, + { + "epoch": 7.028596961572833, + "grad_norm": 0.037109375, + "learning_rate": 0.024584643040509075, + "loss": 0.8001, + "num_input_tokens_seen": 27391072, + "step": 47190 + }, + { + "epoch": 7.029341674113792, + "grad_norm": 0.01519775390625, + "learning_rate": 0.024583143234890945, + "loss": 0.7864, + "num_input_tokens_seen": 27394048, + "step": 47195 + }, + { + "epoch": 7.030086386654752, + "grad_norm": 0.0201416015625, + "learning_rate": 0.024581643267375344, + "loss": 0.7895, + "num_input_tokens_seen": 27397024, + "step": 47200 + }, + { + "epoch": 7.03083109919571, + "grad_norm": 0.0177001953125, + "learning_rate": 0.024580143137987617, + "loss": 0.7925, + "num_input_tokens_seen": 27400096, + "step": 47205 + }, + { + "epoch": 7.03157581173667, + "grad_norm": 0.0284423828125, + "learning_rate": 0.024578642846753113, + "loss": 0.7995, + "num_input_tokens_seen": 27402784, + "step": 47210 + }, + { + "epoch": 7.032320524277629, + "grad_norm": 0.01904296875, + "learning_rate": 0.024577142393697172, + "loss": 0.7944, + "num_input_tokens_seen": 27405280, + "step": 47215 + }, + { + "epoch": 7.033065236818588, + "grad_norm": 0.02587890625, + "learning_rate": 0.02457564177884514, + "loss": 0.7842, + "num_input_tokens_seen": 27408704, + "step": 47220 + }, + { + "epoch": 7.033809949359547, + "grad_norm": 0.024169921875, + "learning_rate": 0.024574141002222376, + "loss": 0.8129, + "num_input_tokens_seen": 27411392, + "step": 47225 + }, + { + "epoch": 7.034554661900507, + "grad_norm": 0.0244140625, + "learning_rate": 0.02457264006385423, + "loss": 0.8168, + "num_input_tokens_seen": 27414208, + "step": 47230 + }, + { + "epoch": 7.035299374441466, + "grad_norm": 0.0235595703125, + "learning_rate": 0.02457113896376606, + "loss": 0.8118, + "num_input_tokens_seen": 27416992, + "step": 47235 + }, + { + "epoch": 7.036044086982425, + "grad_norm": 0.0203857421875, + "learning_rate": 0.024569637701983223, + "loss": 0.803, + "num_input_tokens_seen": 27419840, + "step": 47240 + }, + { + "epoch": 7.036788799523384, + "grad_norm": 0.02001953125, + "learning_rate": 0.024568136278531084, + "loss": 0.806, + "num_input_tokens_seen": 27422752, + "step": 47245 + }, + { + "epoch": 7.037533512064343, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02456663469343501, + "loss": 0.7955, + "num_input_tokens_seen": 27425504, + "step": 47250 + }, + { + "epoch": 7.038278224605302, + "grad_norm": 0.0174560546875, + "learning_rate": 0.024565132946720365, + "loss": 0.8046, + "num_input_tokens_seen": 27428384, + "step": 47255 + }, + { + "epoch": 7.039022937146261, + "grad_norm": 0.02294921875, + "learning_rate": 0.02456363103841252, + "loss": 0.7827, + "num_input_tokens_seen": 27431360, + "step": 47260 + }, + { + "epoch": 7.039767649687221, + "grad_norm": 0.01318359375, + "learning_rate": 0.024562128968536847, + "loss": 0.7984, + "num_input_tokens_seen": 27434208, + "step": 47265 + }, + { + "epoch": 7.0405123622281796, + "grad_norm": 0.020751953125, + "learning_rate": 0.024560626737118722, + "loss": 0.8018, + "num_input_tokens_seen": 27436864, + "step": 47270 + }, + { + "epoch": 7.041257074769139, + "grad_norm": 0.0302734375, + "learning_rate": 0.024559124344183527, + "loss": 0.8205, + "num_input_tokens_seen": 27439712, + "step": 47275 + }, + { + "epoch": 7.042001787310098, + "grad_norm": 0.01806640625, + "learning_rate": 0.024557621789756644, + "loss": 0.7816, + "num_input_tokens_seen": 27442464, + "step": 47280 + }, + { + "epoch": 7.042746499851058, + "grad_norm": 0.018798828125, + "learning_rate": 0.024556119073863455, + "loss": 0.8123, + "num_input_tokens_seen": 27445184, + "step": 47285 + }, + { + "epoch": 7.043491212392016, + "grad_norm": 0.01287841796875, + "learning_rate": 0.02455461619652934, + "loss": 0.8067, + "num_input_tokens_seen": 27447840, + "step": 47290 + }, + { + "epoch": 7.044235924932976, + "grad_norm": 0.01953125, + "learning_rate": 0.024553113157779707, + "loss": 0.7933, + "num_input_tokens_seen": 27451008, + "step": 47295 + }, + { + "epoch": 7.044980637473935, + "grad_norm": 0.021728515625, + "learning_rate": 0.02455160995763993, + "loss": 0.7961, + "num_input_tokens_seen": 27454400, + "step": 47300 + }, + { + "epoch": 7.045725350014894, + "grad_norm": 0.0264892578125, + "learning_rate": 0.024550106596135404, + "loss": 0.8049, + "num_input_tokens_seen": 27457280, + "step": 47305 + }, + { + "epoch": 7.046470062555853, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02454860307329154, + "loss": 0.7932, + "num_input_tokens_seen": 27460256, + "step": 47310 + }, + { + "epoch": 7.047214775096813, + "grad_norm": 0.0181884765625, + "learning_rate": 0.024547099389133735, + "loss": 0.8189, + "num_input_tokens_seen": 27463072, + "step": 47315 + }, + { + "epoch": 7.047959487637772, + "grad_norm": 0.0238037109375, + "learning_rate": 0.024545595543687387, + "loss": 0.7998, + "num_input_tokens_seen": 27465728, + "step": 47320 + }, + { + "epoch": 7.048704200178731, + "grad_norm": 0.0186767578125, + "learning_rate": 0.024544091536977897, + "loss": 0.7797, + "num_input_tokens_seen": 27469024, + "step": 47325 + }, + { + "epoch": 7.04944891271969, + "grad_norm": 0.0286865234375, + "learning_rate": 0.02454258736903069, + "loss": 0.7843, + "num_input_tokens_seen": 27471904, + "step": 47330 + }, + { + "epoch": 7.05019362526065, + "grad_norm": 0.0177001953125, + "learning_rate": 0.024541083039871167, + "loss": 0.7986, + "num_input_tokens_seen": 27474880, + "step": 47335 + }, + { + "epoch": 7.050938337801608, + "grad_norm": 0.0128173828125, + "learning_rate": 0.024539578549524734, + "loss": 0.8196, + "num_input_tokens_seen": 27477888, + "step": 47340 + }, + { + "epoch": 7.051683050342568, + "grad_norm": 0.0162353515625, + "learning_rate": 0.024538073898016828, + "loss": 0.7845, + "num_input_tokens_seen": 27480672, + "step": 47345 + }, + { + "epoch": 7.052427762883527, + "grad_norm": 0.026611328125, + "learning_rate": 0.024536569085372852, + "loss": 0.8197, + "num_input_tokens_seen": 27483424, + "step": 47350 + }, + { + "epoch": 7.053172475424486, + "grad_norm": 0.026123046875, + "learning_rate": 0.024535064111618234, + "loss": 0.7975, + "num_input_tokens_seen": 27486176, + "step": 47355 + }, + { + "epoch": 7.053917187965445, + "grad_norm": 0.0142822265625, + "learning_rate": 0.024533558976778398, + "loss": 0.8059, + "num_input_tokens_seen": 27489120, + "step": 47360 + }, + { + "epoch": 7.054661900506405, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02453205368087878, + "loss": 0.8052, + "num_input_tokens_seen": 27491968, + "step": 47365 + }, + { + "epoch": 7.055406613047364, + "grad_norm": 0.024658203125, + "learning_rate": 0.02453054822394479, + "loss": 0.7862, + "num_input_tokens_seen": 27494816, + "step": 47370 + }, + { + "epoch": 7.056151325588323, + "grad_norm": 0.0262451171875, + "learning_rate": 0.024529042606001885, + "loss": 0.7822, + "num_input_tokens_seen": 27497792, + "step": 47375 + }, + { + "epoch": 7.056896038129282, + "grad_norm": 0.02099609375, + "learning_rate": 0.024527536827075487, + "loss": 0.8239, + "num_input_tokens_seen": 27500576, + "step": 47380 + }, + { + "epoch": 7.057640750670242, + "grad_norm": 0.021484375, + "learning_rate": 0.024526030887191033, + "loss": 0.7952, + "num_input_tokens_seen": 27503456, + "step": 47385 + }, + { + "epoch": 7.0583854632112, + "grad_norm": 0.03515625, + "learning_rate": 0.024524524786373975, + "loss": 0.7915, + "num_input_tokens_seen": 27506272, + "step": 47390 + }, + { + "epoch": 7.05913017575216, + "grad_norm": 0.0157470703125, + "learning_rate": 0.02452301852464975, + "loss": 0.7891, + "num_input_tokens_seen": 27509184, + "step": 47395 + }, + { + "epoch": 7.059874888293119, + "grad_norm": 0.0242919921875, + "learning_rate": 0.024521512102043803, + "loss": 0.8332, + "num_input_tokens_seen": 27511808, + "step": 47400 + }, + { + "epoch": 7.0606196008340785, + "grad_norm": 0.022216796875, + "learning_rate": 0.02452000551858159, + "loss": 0.7989, + "num_input_tokens_seen": 27514848, + "step": 47405 + }, + { + "epoch": 7.061364313375037, + "grad_norm": 0.0218505859375, + "learning_rate": 0.024518498774288553, + "loss": 0.7862, + "num_input_tokens_seen": 27517632, + "step": 47410 + }, + { + "epoch": 7.062109025915996, + "grad_norm": 0.0150146484375, + "learning_rate": 0.024516991869190166, + "loss": 0.7874, + "num_input_tokens_seen": 27520640, + "step": 47415 + }, + { + "epoch": 7.062853738456956, + "grad_norm": 0.013427734375, + "learning_rate": 0.024515484803311862, + "loss": 0.8035, + "num_input_tokens_seen": 27523648, + "step": 47420 + }, + { + "epoch": 7.063598450997914, + "grad_norm": 0.024169921875, + "learning_rate": 0.024513977576679117, + "loss": 0.8158, + "num_input_tokens_seen": 27526400, + "step": 47425 + }, + { + "epoch": 7.064343163538874, + "grad_norm": 0.0213623046875, + "learning_rate": 0.024512470189317393, + "loss": 0.8021, + "num_input_tokens_seen": 27529376, + "step": 47430 + }, + { + "epoch": 7.065087876079833, + "grad_norm": 0.015380859375, + "learning_rate": 0.02451096264125215, + "loss": 0.7759, + "num_input_tokens_seen": 27531968, + "step": 47435 + }, + { + "epoch": 7.065832588620792, + "grad_norm": 0.034423828125, + "learning_rate": 0.02450945493250886, + "loss": 0.7985, + "num_input_tokens_seen": 27534784, + "step": 47440 + }, + { + "epoch": 7.066577301161751, + "grad_norm": 0.0284423828125, + "learning_rate": 0.024507947063112993, + "loss": 0.824, + "num_input_tokens_seen": 27537920, + "step": 47445 + }, + { + "epoch": 7.067322013702711, + "grad_norm": 0.025390625, + "learning_rate": 0.02450643903309003, + "loss": 0.8186, + "num_input_tokens_seen": 27540800, + "step": 47450 + }, + { + "epoch": 7.06806672624367, + "grad_norm": 0.03173828125, + "learning_rate": 0.024504930842465432, + "loss": 0.7912, + "num_input_tokens_seen": 27543360, + "step": 47455 + }, + { + "epoch": 7.068811438784629, + "grad_norm": 0.02490234375, + "learning_rate": 0.024503422491264696, + "loss": 0.8061, + "num_input_tokens_seen": 27546240, + "step": 47460 + }, + { + "epoch": 7.069556151325588, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02450191397951329, + "loss": 0.806, + "num_input_tokens_seen": 27549248, + "step": 47465 + }, + { + "epoch": 7.070300863866548, + "grad_norm": 0.0191650390625, + "learning_rate": 0.024500405307236708, + "loss": 0.7849, + "num_input_tokens_seen": 27552512, + "step": 47470 + }, + { + "epoch": 7.071045576407506, + "grad_norm": 0.0208740234375, + "learning_rate": 0.024498896474460432, + "loss": 0.7811, + "num_input_tokens_seen": 27555360, + "step": 47475 + }, + { + "epoch": 7.071790288948466, + "grad_norm": 0.017578125, + "learning_rate": 0.024497387481209956, + "loss": 0.7837, + "num_input_tokens_seen": 27558432, + "step": 47480 + }, + { + "epoch": 7.072535001489425, + "grad_norm": 0.018798828125, + "learning_rate": 0.024495878327510767, + "loss": 0.791, + "num_input_tokens_seen": 27561184, + "step": 47485 + }, + { + "epoch": 7.0732797140303845, + "grad_norm": 0.025634765625, + "learning_rate": 0.024494369013388372, + "loss": 0.7767, + "num_input_tokens_seen": 27564288, + "step": 47490 + }, + { + "epoch": 7.074024426571343, + "grad_norm": 0.0308837890625, + "learning_rate": 0.02449285953886826, + "loss": 0.7883, + "num_input_tokens_seen": 27567008, + "step": 47495 + }, + { + "epoch": 7.074769139112303, + "grad_norm": 0.0260009765625, + "learning_rate": 0.024491349903975935, + "loss": 0.7779, + "num_input_tokens_seen": 27569920, + "step": 47500 + }, + { + "epoch": 7.075513851653262, + "grad_norm": 0.0279541015625, + "learning_rate": 0.024489840108736897, + "loss": 0.8087, + "num_input_tokens_seen": 27572864, + "step": 47505 + }, + { + "epoch": 7.076258564194221, + "grad_norm": 0.0201416015625, + "learning_rate": 0.024488330153176655, + "loss": 0.7846, + "num_input_tokens_seen": 27575776, + "step": 47510 + }, + { + "epoch": 7.07700327673518, + "grad_norm": 0.02783203125, + "learning_rate": 0.024486820037320722, + "loss": 0.83, + "num_input_tokens_seen": 27578560, + "step": 47515 + }, + { + "epoch": 7.07774798927614, + "grad_norm": 0.03076171875, + "learning_rate": 0.0244853097611946, + "loss": 0.8349, + "num_input_tokens_seen": 27581600, + "step": 47520 + }, + { + "epoch": 7.078492701817098, + "grad_norm": 0.0186767578125, + "learning_rate": 0.024483799324823818, + "loss": 0.8004, + "num_input_tokens_seen": 27584640, + "step": 47525 + }, + { + "epoch": 7.079237414358058, + "grad_norm": 0.0140380859375, + "learning_rate": 0.024482288728233877, + "loss": 0.8156, + "num_input_tokens_seen": 27587616, + "step": 47530 + }, + { + "epoch": 7.079982126899017, + "grad_norm": 0.0216064453125, + "learning_rate": 0.024480777971450315, + "loss": 0.8018, + "num_input_tokens_seen": 27590368, + "step": 47535 + }, + { + "epoch": 7.0807268394399765, + "grad_norm": 0.0205078125, + "learning_rate": 0.024479267054498637, + "loss": 0.8187, + "num_input_tokens_seen": 27593152, + "step": 47540 + }, + { + "epoch": 7.081471551980935, + "grad_norm": 0.01806640625, + "learning_rate": 0.02447775597740438, + "loss": 0.778, + "num_input_tokens_seen": 27596064, + "step": 47545 + }, + { + "epoch": 7.082216264521895, + "grad_norm": 0.0245361328125, + "learning_rate": 0.024476244740193068, + "loss": 0.7723, + "num_input_tokens_seen": 27598912, + "step": 47550 + }, + { + "epoch": 7.082960977062854, + "grad_norm": 0.017333984375, + "learning_rate": 0.024474733342890232, + "loss": 0.7967, + "num_input_tokens_seen": 27602016, + "step": 47555 + }, + { + "epoch": 7.083705689603813, + "grad_norm": 0.0205078125, + "learning_rate": 0.024473221785521405, + "loss": 0.7984, + "num_input_tokens_seen": 27604608, + "step": 47560 + }, + { + "epoch": 7.084450402144772, + "grad_norm": 0.0198974609375, + "learning_rate": 0.024471710068112124, + "loss": 0.791, + "num_input_tokens_seen": 27607520, + "step": 47565 + }, + { + "epoch": 7.085195114685732, + "grad_norm": 0.0255126953125, + "learning_rate": 0.02447019819068793, + "loss": 0.833, + "num_input_tokens_seen": 27610784, + "step": 47570 + }, + { + "epoch": 7.0859398272266905, + "grad_norm": 0.0142822265625, + "learning_rate": 0.024468686153274365, + "loss": 0.8182, + "num_input_tokens_seen": 27613632, + "step": 47575 + }, + { + "epoch": 7.08668453976765, + "grad_norm": 0.0264892578125, + "learning_rate": 0.024467173955896968, + "loss": 0.7821, + "num_input_tokens_seen": 27616448, + "step": 47580 + }, + { + "epoch": 7.087429252308609, + "grad_norm": 0.02001953125, + "learning_rate": 0.02446566159858129, + "loss": 0.7851, + "num_input_tokens_seen": 27619392, + "step": 47585 + }, + { + "epoch": 7.088173964849568, + "grad_norm": 0.01904296875, + "learning_rate": 0.024464149081352877, + "loss": 0.7726, + "num_input_tokens_seen": 27622304, + "step": 47590 + }, + { + "epoch": 7.088918677390527, + "grad_norm": 0.0211181640625, + "learning_rate": 0.024462636404237285, + "loss": 0.7994, + "num_input_tokens_seen": 27625120, + "step": 47595 + }, + { + "epoch": 7.089663389931486, + "grad_norm": 0.0235595703125, + "learning_rate": 0.024461123567260067, + "loss": 0.8103, + "num_input_tokens_seen": 27628576, + "step": 47600 + }, + { + "epoch": 7.090408102472446, + "grad_norm": 0.0322265625, + "learning_rate": 0.024459610570446783, + "loss": 0.7908, + "num_input_tokens_seen": 27631296, + "step": 47605 + }, + { + "epoch": 7.091152815013404, + "grad_norm": 0.017333984375, + "learning_rate": 0.024458097413822998, + "loss": 0.7591, + "num_input_tokens_seen": 27634048, + "step": 47610 + }, + { + "epoch": 7.091897527554364, + "grad_norm": 0.01171875, + "learning_rate": 0.024456584097414264, + "loss": 0.8001, + "num_input_tokens_seen": 27637216, + "step": 47615 + }, + { + "epoch": 7.092642240095323, + "grad_norm": 0.0213623046875, + "learning_rate": 0.024455070621246154, + "loss": 0.8099, + "num_input_tokens_seen": 27640160, + "step": 47620 + }, + { + "epoch": 7.0933869526362825, + "grad_norm": 0.0301513671875, + "learning_rate": 0.024453556985344238, + "loss": 0.7644, + "num_input_tokens_seen": 27642944, + "step": 47625 + }, + { + "epoch": 7.094131665177241, + "grad_norm": 0.03564453125, + "learning_rate": 0.024452043189734082, + "loss": 0.7787, + "num_input_tokens_seen": 27645856, + "step": 47630 + }, + { + "epoch": 7.094876377718201, + "grad_norm": 0.0262451171875, + "learning_rate": 0.024450529234441264, + "loss": 0.8402, + "num_input_tokens_seen": 27648800, + "step": 47635 + }, + { + "epoch": 7.09562109025916, + "grad_norm": 0.025390625, + "learning_rate": 0.024449015119491362, + "loss": 0.8302, + "num_input_tokens_seen": 27651680, + "step": 47640 + }, + { + "epoch": 7.096365802800119, + "grad_norm": 0.0125732421875, + "learning_rate": 0.024447500844909948, + "loss": 0.771, + "num_input_tokens_seen": 27654624, + "step": 47645 + }, + { + "epoch": 7.097110515341078, + "grad_norm": 0.0220947265625, + "learning_rate": 0.024445986410722615, + "loss": 0.7844, + "num_input_tokens_seen": 27657504, + "step": 47650 + }, + { + "epoch": 7.097855227882038, + "grad_norm": 0.0206298828125, + "learning_rate": 0.024444471816954937, + "loss": 0.7874, + "num_input_tokens_seen": 27660224, + "step": 47655 + }, + { + "epoch": 7.0985999404229965, + "grad_norm": 0.016357421875, + "learning_rate": 0.02444295706363251, + "loss": 0.7836, + "num_input_tokens_seen": 27663232, + "step": 47660 + }, + { + "epoch": 7.099344652963956, + "grad_norm": 0.0233154296875, + "learning_rate": 0.02444144215078092, + "loss": 0.7897, + "num_input_tokens_seen": 27665984, + "step": 47665 + }, + { + "epoch": 7.100089365504915, + "grad_norm": 0.02294921875, + "learning_rate": 0.02443992707842576, + "loss": 0.7976, + "num_input_tokens_seen": 27669152, + "step": 47670 + }, + { + "epoch": 7.1008340780458745, + "grad_norm": 0.02490234375, + "learning_rate": 0.02443841184659263, + "loss": 0.8127, + "num_input_tokens_seen": 27672032, + "step": 47675 + }, + { + "epoch": 7.101578790586833, + "grad_norm": 0.037353515625, + "learning_rate": 0.024436896455307118, + "loss": 0.8191, + "num_input_tokens_seen": 27674720, + "step": 47680 + }, + { + "epoch": 7.102323503127793, + "grad_norm": 0.0296630859375, + "learning_rate": 0.02443538090459484, + "loss": 0.7961, + "num_input_tokens_seen": 27677984, + "step": 47685 + }, + { + "epoch": 7.103068215668752, + "grad_norm": 0.02490234375, + "learning_rate": 0.024433865194481385, + "loss": 0.7926, + "num_input_tokens_seen": 27680768, + "step": 47690 + }, + { + "epoch": 7.103812928209711, + "grad_norm": 0.0284423828125, + "learning_rate": 0.024432349324992365, + "loss": 0.8227, + "num_input_tokens_seen": 27683264, + "step": 47695 + }, + { + "epoch": 7.10455764075067, + "grad_norm": 0.0216064453125, + "learning_rate": 0.024430833296153392, + "loss": 0.7816, + "num_input_tokens_seen": 27686304, + "step": 47700 + }, + { + "epoch": 7.10530235329163, + "grad_norm": 0.01239013671875, + "learning_rate": 0.024429317107990078, + "loss": 0.8005, + "num_input_tokens_seen": 27689280, + "step": 47705 + }, + { + "epoch": 7.1060470658325885, + "grad_norm": 0.0267333984375, + "learning_rate": 0.024427800760528032, + "loss": 0.799, + "num_input_tokens_seen": 27692160, + "step": 47710 + }, + { + "epoch": 7.106791778373548, + "grad_norm": 0.0267333984375, + "learning_rate": 0.02442628425379287, + "loss": 0.8642, + "num_input_tokens_seen": 27694816, + "step": 47715 + }, + { + "epoch": 7.107536490914507, + "grad_norm": 0.0242919921875, + "learning_rate": 0.02442476758781022, + "loss": 0.7983, + "num_input_tokens_seen": 27697760, + "step": 47720 + }, + { + "epoch": 7.1082812034554665, + "grad_norm": 0.026123046875, + "learning_rate": 0.024423250762605705, + "loss": 0.7937, + "num_input_tokens_seen": 27701024, + "step": 47725 + }, + { + "epoch": 7.109025915996425, + "grad_norm": 0.02099609375, + "learning_rate": 0.02442173377820494, + "loss": 0.7896, + "num_input_tokens_seen": 27703712, + "step": 47730 + }, + { + "epoch": 7.109770628537385, + "grad_norm": 0.0181884765625, + "learning_rate": 0.02442021663463356, + "loss": 0.7984, + "num_input_tokens_seen": 27706592, + "step": 47735 + }, + { + "epoch": 7.110515341078344, + "grad_norm": 0.0177001953125, + "learning_rate": 0.024418699331917197, + "loss": 0.8064, + "num_input_tokens_seen": 27709408, + "step": 47740 + }, + { + "epoch": 7.111260053619303, + "grad_norm": 0.01806640625, + "learning_rate": 0.02441718187008148, + "loss": 0.8, + "num_input_tokens_seen": 27712160, + "step": 47745 + }, + { + "epoch": 7.112004766160262, + "grad_norm": 0.017333984375, + "learning_rate": 0.024415664249152043, + "loss": 0.8316, + "num_input_tokens_seen": 27715232, + "step": 47750 + }, + { + "epoch": 7.112749478701222, + "grad_norm": 0.017822265625, + "learning_rate": 0.024414146469154532, + "loss": 0.8125, + "num_input_tokens_seen": 27718208, + "step": 47755 + }, + { + "epoch": 7.1134941912421805, + "grad_norm": 0.0146484375, + "learning_rate": 0.024412628530114583, + "loss": 0.8178, + "num_input_tokens_seen": 27721056, + "step": 47760 + }, + { + "epoch": 7.114238903783139, + "grad_norm": 0.03759765625, + "learning_rate": 0.02441111043205784, + "loss": 0.8416, + "num_input_tokens_seen": 27724000, + "step": 47765 + }, + { + "epoch": 7.114983616324099, + "grad_norm": 0.0267333984375, + "learning_rate": 0.024409592175009952, + "loss": 0.8186, + "num_input_tokens_seen": 27727008, + "step": 47770 + }, + { + "epoch": 7.115728328865058, + "grad_norm": 0.0262451171875, + "learning_rate": 0.024408073758996573, + "loss": 0.8156, + "num_input_tokens_seen": 27730176, + "step": 47775 + }, + { + "epoch": 7.116473041406017, + "grad_norm": 0.01953125, + "learning_rate": 0.024406555184043346, + "loss": 0.798, + "num_input_tokens_seen": 27732928, + "step": 47780 + }, + { + "epoch": 7.117217753946976, + "grad_norm": 0.01324462890625, + "learning_rate": 0.024405036450175926, + "loss": 0.8133, + "num_input_tokens_seen": 27736160, + "step": 47785 + }, + { + "epoch": 7.117962466487936, + "grad_norm": 0.01348876953125, + "learning_rate": 0.024403517557419978, + "loss": 0.8037, + "num_input_tokens_seen": 27739168, + "step": 47790 + }, + { + "epoch": 7.1187071790288945, + "grad_norm": 0.0252685546875, + "learning_rate": 0.02440199850580116, + "loss": 0.8138, + "num_input_tokens_seen": 27742304, + "step": 47795 + }, + { + "epoch": 7.119451891569854, + "grad_norm": 0.026611328125, + "learning_rate": 0.02440047929534513, + "loss": 0.7851, + "num_input_tokens_seen": 27745120, + "step": 47800 + }, + { + "epoch": 7.120196604110813, + "grad_norm": 0.0223388671875, + "learning_rate": 0.024398959926077556, + "loss": 0.7869, + "num_input_tokens_seen": 27747968, + "step": 47805 + }, + { + "epoch": 7.1209413166517725, + "grad_norm": 0.0189208984375, + "learning_rate": 0.024397440398024106, + "loss": 0.7918, + "num_input_tokens_seen": 27750816, + "step": 47810 + }, + { + "epoch": 7.121686029192731, + "grad_norm": 0.0169677734375, + "learning_rate": 0.024395920711210453, + "loss": 0.7971, + "num_input_tokens_seen": 27753728, + "step": 47815 + }, + { + "epoch": 7.122430741733691, + "grad_norm": 0.024169921875, + "learning_rate": 0.02439440086566227, + "loss": 0.8057, + "num_input_tokens_seen": 27756544, + "step": 47820 + }, + { + "epoch": 7.12317545427465, + "grad_norm": 0.01416015625, + "learning_rate": 0.024392880861405234, + "loss": 0.8026, + "num_input_tokens_seen": 27759424, + "step": 47825 + }, + { + "epoch": 7.123920166815609, + "grad_norm": 0.020263671875, + "learning_rate": 0.02439136069846502, + "loss": 0.7985, + "num_input_tokens_seen": 27762112, + "step": 47830 + }, + { + "epoch": 7.124664879356568, + "grad_norm": 0.0272216796875, + "learning_rate": 0.02438984037686731, + "loss": 0.7899, + "num_input_tokens_seen": 27764896, + "step": 47835 + }, + { + "epoch": 7.125409591897528, + "grad_norm": 0.018798828125, + "learning_rate": 0.02438831989663779, + "loss": 0.798, + "num_input_tokens_seen": 27767808, + "step": 47840 + }, + { + "epoch": 7.1261543044384865, + "grad_norm": 0.0179443359375, + "learning_rate": 0.024386799257802152, + "loss": 0.7796, + "num_input_tokens_seen": 27770656, + "step": 47845 + }, + { + "epoch": 7.126899016979446, + "grad_norm": 0.017822265625, + "learning_rate": 0.024385278460386076, + "loss": 0.7863, + "num_input_tokens_seen": 27773472, + "step": 47850 + }, + { + "epoch": 7.127643729520405, + "grad_norm": 0.0240478515625, + "learning_rate": 0.024383757504415265, + "loss": 0.8001, + "num_input_tokens_seen": 27776448, + "step": 47855 + }, + { + "epoch": 7.128388442061365, + "grad_norm": 0.022705078125, + "learning_rate": 0.024382236389915404, + "loss": 0.7893, + "num_input_tokens_seen": 27779040, + "step": 47860 + }, + { + "epoch": 7.129133154602323, + "grad_norm": 0.0191650390625, + "learning_rate": 0.0243807151169122, + "loss": 0.8586, + "num_input_tokens_seen": 27781664, + "step": 47865 + }, + { + "epoch": 7.129877867143283, + "grad_norm": 0.01312255859375, + "learning_rate": 0.024379193685431337, + "loss": 0.8167, + "num_input_tokens_seen": 27784448, + "step": 47870 + }, + { + "epoch": 7.130622579684242, + "grad_norm": 0.0145263671875, + "learning_rate": 0.02437767209549854, + "loss": 0.7899, + "num_input_tokens_seen": 27787296, + "step": 47875 + }, + { + "epoch": 7.131367292225201, + "grad_norm": 0.0267333984375, + "learning_rate": 0.024376150347139497, + "loss": 0.8284, + "num_input_tokens_seen": 27790208, + "step": 47880 + }, + { + "epoch": 7.13211200476616, + "grad_norm": 0.01177978515625, + "learning_rate": 0.02437462844037993, + "loss": 0.793, + "num_input_tokens_seen": 27793216, + "step": 47885 + }, + { + "epoch": 7.13285671730712, + "grad_norm": 0.0186767578125, + "learning_rate": 0.024373106375245538, + "loss": 0.8321, + "num_input_tokens_seen": 27796256, + "step": 47890 + }, + { + "epoch": 7.1336014298480785, + "grad_norm": 0.0159912109375, + "learning_rate": 0.024371584151762042, + "loss": 0.8027, + "num_input_tokens_seen": 27799104, + "step": 47895 + }, + { + "epoch": 7.134346142389038, + "grad_norm": 0.0213623046875, + "learning_rate": 0.024370061769955157, + "loss": 0.7963, + "num_input_tokens_seen": 27801824, + "step": 47900 + }, + { + "epoch": 7.135090854929997, + "grad_norm": 0.014892578125, + "learning_rate": 0.024368539229850605, + "loss": 0.7956, + "num_input_tokens_seen": 27804608, + "step": 47905 + }, + { + "epoch": 7.135835567470957, + "grad_norm": 0.029296875, + "learning_rate": 0.0243670165314741, + "loss": 0.7958, + "num_input_tokens_seen": 27807648, + "step": 47910 + }, + { + "epoch": 7.136580280011915, + "grad_norm": 0.01300048828125, + "learning_rate": 0.02436549367485137, + "loss": 0.7782, + "num_input_tokens_seen": 27810240, + "step": 47915 + }, + { + "epoch": 7.137324992552875, + "grad_norm": 0.017578125, + "learning_rate": 0.024363970660008144, + "loss": 0.8311, + "num_input_tokens_seen": 27813216, + "step": 47920 + }, + { + "epoch": 7.138069705093834, + "grad_norm": 0.0198974609375, + "learning_rate": 0.024362447486970154, + "loss": 0.7999, + "num_input_tokens_seen": 27816352, + "step": 47925 + }, + { + "epoch": 7.1388144176347925, + "grad_norm": 0.0185546875, + "learning_rate": 0.024360924155763124, + "loss": 0.7914, + "num_input_tokens_seen": 27819360, + "step": 47930 + }, + { + "epoch": 7.139559130175752, + "grad_norm": 0.0206298828125, + "learning_rate": 0.0243594006664128, + "loss": 0.7969, + "num_input_tokens_seen": 27822432, + "step": 47935 + }, + { + "epoch": 7.140303842716711, + "grad_norm": 0.026611328125, + "learning_rate": 0.02435787701894491, + "loss": 0.8001, + "num_input_tokens_seen": 27825376, + "step": 47940 + }, + { + "epoch": 7.141048555257671, + "grad_norm": 0.024169921875, + "learning_rate": 0.024356353213385203, + "loss": 0.7955, + "num_input_tokens_seen": 27828384, + "step": 47945 + }, + { + "epoch": 7.141793267798629, + "grad_norm": 0.0181884765625, + "learning_rate": 0.024354829249759416, + "loss": 0.8046, + "num_input_tokens_seen": 27831328, + "step": 47950 + }, + { + "epoch": 7.142537980339589, + "grad_norm": 0.0211181640625, + "learning_rate": 0.024353305128093295, + "loss": 0.792, + "num_input_tokens_seen": 27834016, + "step": 47955 + }, + { + "epoch": 7.143282692880548, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02435178084841259, + "loss": 0.771, + "num_input_tokens_seen": 27836672, + "step": 47960 + }, + { + "epoch": 7.144027405421507, + "grad_norm": 0.013671875, + "learning_rate": 0.024350256410743055, + "loss": 0.8145, + "num_input_tokens_seen": 27839296, + "step": 47965 + }, + { + "epoch": 7.144772117962466, + "grad_norm": 0.0245361328125, + "learning_rate": 0.024348731815110432, + "loss": 0.7786, + "num_input_tokens_seen": 27842112, + "step": 47970 + }, + { + "epoch": 7.145516830503426, + "grad_norm": 0.01806640625, + "learning_rate": 0.024347207061540498, + "loss": 0.8198, + "num_input_tokens_seen": 27845024, + "step": 47975 + }, + { + "epoch": 7.1462615430443845, + "grad_norm": 0.02392578125, + "learning_rate": 0.024345682150058997, + "loss": 0.8129, + "num_input_tokens_seen": 27847776, + "step": 47980 + }, + { + "epoch": 7.147006255585344, + "grad_norm": 0.01397705078125, + "learning_rate": 0.024344157080691698, + "loss": 0.7813, + "num_input_tokens_seen": 27851008, + "step": 47985 + }, + { + "epoch": 7.147750968126303, + "grad_norm": 0.01513671875, + "learning_rate": 0.02434263185346435, + "loss": 0.7906, + "num_input_tokens_seen": 27854240, + "step": 47990 + }, + { + "epoch": 7.148495680667263, + "grad_norm": 0.029541015625, + "learning_rate": 0.024341106468402745, + "loss": 0.8022, + "num_input_tokens_seen": 27857312, + "step": 47995 + }, + { + "epoch": 7.149240393208221, + "grad_norm": 0.0303955078125, + "learning_rate": 0.024339580925532636, + "loss": 0.788, + "num_input_tokens_seen": 27860032, + "step": 48000 + }, + { + "epoch": 7.149985105749181, + "grad_norm": 0.0263671875, + "learning_rate": 0.0243380552248798, + "loss": 0.7933, + "num_input_tokens_seen": 27863008, + "step": 48005 + }, + { + "epoch": 7.15072981829014, + "grad_norm": 0.0225830078125, + "learning_rate": 0.02433652936647001, + "loss": 0.8063, + "num_input_tokens_seen": 27865920, + "step": 48010 + }, + { + "epoch": 7.151474530831099, + "grad_norm": 0.0220947265625, + "learning_rate": 0.024335003350329044, + "loss": 0.7824, + "num_input_tokens_seen": 27869120, + "step": 48015 + }, + { + "epoch": 7.152219243372058, + "grad_norm": 0.0196533203125, + "learning_rate": 0.024333477176482692, + "loss": 0.7878, + "num_input_tokens_seen": 27872128, + "step": 48020 + }, + { + "epoch": 7.152963955913018, + "grad_norm": 0.0264892578125, + "learning_rate": 0.02433195084495672, + "loss": 0.8079, + "num_input_tokens_seen": 27874912, + "step": 48025 + }, + { + "epoch": 7.153708668453977, + "grad_norm": 0.018310546875, + "learning_rate": 0.02433042435577693, + "loss": 0.7944, + "num_input_tokens_seen": 27877344, + "step": 48030 + }, + { + "epoch": 7.154453380994936, + "grad_norm": 0.039794921875, + "learning_rate": 0.024328897708969097, + "loss": 0.8439, + "num_input_tokens_seen": 27880128, + "step": 48035 + }, + { + "epoch": 7.155198093535895, + "grad_norm": 0.03466796875, + "learning_rate": 0.024327370904559022, + "loss": 0.8265, + "num_input_tokens_seen": 27882816, + "step": 48040 + }, + { + "epoch": 7.155942806076855, + "grad_norm": 0.01373291015625, + "learning_rate": 0.0243258439425725, + "loss": 0.8253, + "num_input_tokens_seen": 27885504, + "step": 48045 + }, + { + "epoch": 7.156687518617813, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02432431682303532, + "loss": 0.802, + "num_input_tokens_seen": 27888480, + "step": 48050 + }, + { + "epoch": 7.157432231158773, + "grad_norm": 0.0245361328125, + "learning_rate": 0.024322789545973278, + "loss": 0.8101, + "num_input_tokens_seen": 27891328, + "step": 48055 + }, + { + "epoch": 7.158176943699732, + "grad_norm": 0.0289306640625, + "learning_rate": 0.024321262111412188, + "loss": 0.7853, + "num_input_tokens_seen": 27894112, + "step": 48060 + }, + { + "epoch": 7.158921656240691, + "grad_norm": 0.0233154296875, + "learning_rate": 0.024319734519377845, + "loss": 0.7636, + "num_input_tokens_seen": 27897216, + "step": 48065 + }, + { + "epoch": 7.15966636878165, + "grad_norm": 0.02783203125, + "learning_rate": 0.024318206769896066, + "loss": 0.7874, + "num_input_tokens_seen": 27900064, + "step": 48070 + }, + { + "epoch": 7.16041108132261, + "grad_norm": 0.0196533203125, + "learning_rate": 0.024316678862992647, + "loss": 0.8162, + "num_input_tokens_seen": 27903072, + "step": 48075 + }, + { + "epoch": 7.161155793863569, + "grad_norm": 0.0186767578125, + "learning_rate": 0.02431515079869341, + "loss": 0.7988, + "num_input_tokens_seen": 27905984, + "step": 48080 + }, + { + "epoch": 7.161900506404528, + "grad_norm": 0.0191650390625, + "learning_rate": 0.024313622577024168, + "loss": 0.7971, + "num_input_tokens_seen": 27908736, + "step": 48085 + }, + { + "epoch": 7.162645218945487, + "grad_norm": 0.01092529296875, + "learning_rate": 0.02431209419801074, + "loss": 0.7973, + "num_input_tokens_seen": 27911552, + "step": 48090 + }, + { + "epoch": 7.163389931486447, + "grad_norm": 0.0211181640625, + "learning_rate": 0.024310565661678944, + "loss": 0.8247, + "num_input_tokens_seen": 27914400, + "step": 48095 + }, + { + "epoch": 7.164134644027405, + "grad_norm": 0.0252685546875, + "learning_rate": 0.024309036968054604, + "loss": 0.78, + "num_input_tokens_seen": 27917408, + "step": 48100 + }, + { + "epoch": 7.164879356568365, + "grad_norm": 0.022216796875, + "learning_rate": 0.024307508117163545, + "loss": 0.788, + "num_input_tokens_seen": 27920896, + "step": 48105 + }, + { + "epoch": 7.165624069109324, + "grad_norm": 0.019775390625, + "learning_rate": 0.024305979109031595, + "loss": 0.7773, + "num_input_tokens_seen": 27923712, + "step": 48110 + }, + { + "epoch": 7.166368781650283, + "grad_norm": 0.018798828125, + "learning_rate": 0.024304449943684588, + "loss": 0.8037, + "num_input_tokens_seen": 27926304, + "step": 48115 + }, + { + "epoch": 7.167113494191242, + "grad_norm": 0.021240234375, + "learning_rate": 0.024302920621148354, + "loss": 0.8104, + "num_input_tokens_seen": 27929216, + "step": 48120 + }, + { + "epoch": 7.167858206732201, + "grad_norm": 0.0185546875, + "learning_rate": 0.02430139114144873, + "loss": 0.8286, + "num_input_tokens_seen": 27931872, + "step": 48125 + }, + { + "epoch": 7.168602919273161, + "grad_norm": 0.021240234375, + "learning_rate": 0.02429986150461156, + "loss": 0.787, + "num_input_tokens_seen": 27934784, + "step": 48130 + }, + { + "epoch": 7.169347631814119, + "grad_norm": 0.01385498046875, + "learning_rate": 0.024298331710662677, + "loss": 0.7928, + "num_input_tokens_seen": 27937536, + "step": 48135 + }, + { + "epoch": 7.170092344355079, + "grad_norm": 0.01806640625, + "learning_rate": 0.024296801759627937, + "loss": 0.8014, + "num_input_tokens_seen": 27940352, + "step": 48140 + }, + { + "epoch": 7.170837056896038, + "grad_norm": 0.0250244140625, + "learning_rate": 0.02429527165153318, + "loss": 0.7842, + "num_input_tokens_seen": 27943488, + "step": 48145 + }, + { + "epoch": 7.171581769436997, + "grad_norm": 0.0184326171875, + "learning_rate": 0.02429374138640425, + "loss": 0.7911, + "num_input_tokens_seen": 27946560, + "step": 48150 + }, + { + "epoch": 7.172326481977956, + "grad_norm": 0.027587890625, + "learning_rate": 0.024292210964267008, + "loss": 0.7834, + "num_input_tokens_seen": 27949472, + "step": 48155 + }, + { + "epoch": 7.173071194518916, + "grad_norm": 0.0294189453125, + "learning_rate": 0.024290680385147305, + "loss": 0.8138, + "num_input_tokens_seen": 27952288, + "step": 48160 + }, + { + "epoch": 7.173815907059875, + "grad_norm": 0.017822265625, + "learning_rate": 0.024289149649070998, + "loss": 0.7984, + "num_input_tokens_seen": 27955200, + "step": 48165 + }, + { + "epoch": 7.174560619600834, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02428761875606395, + "loss": 0.8177, + "num_input_tokens_seen": 27957952, + "step": 48170 + }, + { + "epoch": 7.175305332141793, + "grad_norm": 0.0198974609375, + "learning_rate": 0.024286087706152024, + "loss": 0.8021, + "num_input_tokens_seen": 27960768, + "step": 48175 + }, + { + "epoch": 7.176050044682753, + "grad_norm": 0.03173828125, + "learning_rate": 0.024284556499361083, + "loss": 0.8014, + "num_input_tokens_seen": 27963520, + "step": 48180 + }, + { + "epoch": 7.176794757223711, + "grad_norm": 0.02294921875, + "learning_rate": 0.024283025135716996, + "loss": 0.8047, + "num_input_tokens_seen": 27966336, + "step": 48185 + }, + { + "epoch": 7.177539469764671, + "grad_norm": 0.0184326171875, + "learning_rate": 0.024281493615245637, + "loss": 0.7837, + "num_input_tokens_seen": 27969408, + "step": 48190 + }, + { + "epoch": 7.17828418230563, + "grad_norm": 0.038330078125, + "learning_rate": 0.024279961937972872, + "loss": 0.8455, + "num_input_tokens_seen": 27972608, + "step": 48195 + }, + { + "epoch": 7.1790288948465895, + "grad_norm": 0.013427734375, + "learning_rate": 0.024278430103924584, + "loss": 0.8049, + "num_input_tokens_seen": 27975328, + "step": 48200 + }, + { + "epoch": 7.179773607387548, + "grad_norm": 0.0201416015625, + "learning_rate": 0.024276898113126652, + "loss": 0.7827, + "num_input_tokens_seen": 27978240, + "step": 48205 + }, + { + "epoch": 7.180518319928508, + "grad_norm": 0.01251220703125, + "learning_rate": 0.024275365965604954, + "loss": 0.7946, + "num_input_tokens_seen": 27981088, + "step": 48210 + }, + { + "epoch": 7.181263032469467, + "grad_norm": 0.018798828125, + "learning_rate": 0.024273833661385375, + "loss": 0.7915, + "num_input_tokens_seen": 27983872, + "step": 48215 + }, + { + "epoch": 7.182007745010426, + "grad_norm": 0.01495361328125, + "learning_rate": 0.0242723012004938, + "loss": 0.7823, + "num_input_tokens_seen": 27986752, + "step": 48220 + }, + { + "epoch": 7.182752457551385, + "grad_norm": 0.0133056640625, + "learning_rate": 0.024270768582956122, + "loss": 0.7777, + "num_input_tokens_seen": 27989504, + "step": 48225 + }, + { + "epoch": 7.183497170092345, + "grad_norm": 0.0234375, + "learning_rate": 0.024269235808798233, + "loss": 0.8114, + "num_input_tokens_seen": 27992256, + "step": 48230 + }, + { + "epoch": 7.184241882633303, + "grad_norm": 0.0208740234375, + "learning_rate": 0.024267702878046016, + "loss": 0.8213, + "num_input_tokens_seen": 27995136, + "step": 48235 + }, + { + "epoch": 7.184986595174263, + "grad_norm": 0.0233154296875, + "learning_rate": 0.024266169790725392, + "loss": 0.8059, + "num_input_tokens_seen": 27998048, + "step": 48240 + }, + { + "epoch": 7.185731307715222, + "grad_norm": 0.02734375, + "learning_rate": 0.02426463654686224, + "loss": 0.7853, + "num_input_tokens_seen": 28000896, + "step": 48245 + }, + { + "epoch": 7.1864760202561815, + "grad_norm": 0.0235595703125, + "learning_rate": 0.024263103146482466, + "loss": 0.7959, + "num_input_tokens_seen": 28004000, + "step": 48250 + }, + { + "epoch": 7.18722073279714, + "grad_norm": 0.032958984375, + "learning_rate": 0.02426156958961198, + "loss": 0.8394, + "num_input_tokens_seen": 28006752, + "step": 48255 + }, + { + "epoch": 7.1879654453381, + "grad_norm": 0.0184326171875, + "learning_rate": 0.024260035876276696, + "loss": 0.8025, + "num_input_tokens_seen": 28009760, + "step": 48260 + }, + { + "epoch": 7.188710157879059, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02425850200650251, + "loss": 0.7864, + "num_input_tokens_seen": 28012512, + "step": 48265 + }, + { + "epoch": 7.189454870420018, + "grad_norm": 0.0269775390625, + "learning_rate": 0.024256967980315347, + "loss": 0.7764, + "num_input_tokens_seen": 28015328, + "step": 48270 + }, + { + "epoch": 7.190199582960977, + "grad_norm": 0.02734375, + "learning_rate": 0.024255433797741116, + "loss": 0.8011, + "num_input_tokens_seen": 28018048, + "step": 48275 + }, + { + "epoch": 7.190944295501936, + "grad_norm": 0.01904296875, + "learning_rate": 0.024253899458805742, + "loss": 0.7845, + "num_input_tokens_seen": 28020832, + "step": 48280 + }, + { + "epoch": 7.1916890080428955, + "grad_norm": 0.0247802734375, + "learning_rate": 0.024252364963535136, + "loss": 0.7827, + "num_input_tokens_seen": 28023840, + "step": 48285 + }, + { + "epoch": 7.192433720583854, + "grad_norm": 0.0269775390625, + "learning_rate": 0.02425083031195523, + "loss": 0.8192, + "num_input_tokens_seen": 28026624, + "step": 48290 + }, + { + "epoch": 7.193178433124814, + "grad_norm": 0.0185546875, + "learning_rate": 0.02424929550409195, + "loss": 0.7687, + "num_input_tokens_seen": 28029376, + "step": 48295 + }, + { + "epoch": 7.193923145665773, + "grad_norm": 0.035888671875, + "learning_rate": 0.02424776053997122, + "loss": 0.7854, + "num_input_tokens_seen": 28032704, + "step": 48300 + }, + { + "epoch": 7.194667858206732, + "grad_norm": 0.031982421875, + "learning_rate": 0.024246225419618978, + "loss": 0.7663, + "num_input_tokens_seen": 28035392, + "step": 48305 + }, + { + "epoch": 7.195412570747691, + "grad_norm": 0.022216796875, + "learning_rate": 0.024244690143061152, + "loss": 0.8231, + "num_input_tokens_seen": 28038080, + "step": 48310 + }, + { + "epoch": 7.196157283288651, + "grad_norm": 0.0225830078125, + "learning_rate": 0.02424315471032368, + "loss": 0.7907, + "num_input_tokens_seen": 28041056, + "step": 48315 + }, + { + "epoch": 7.196901995829609, + "grad_norm": 0.0234375, + "learning_rate": 0.02424161912143251, + "loss": 0.8158, + "num_input_tokens_seen": 28044000, + "step": 48320 + }, + { + "epoch": 7.197646708370569, + "grad_norm": 0.0206298828125, + "learning_rate": 0.024240083376413573, + "loss": 0.8297, + "num_input_tokens_seen": 28046720, + "step": 48325 + }, + { + "epoch": 7.198391420911528, + "grad_norm": 0.01708984375, + "learning_rate": 0.024238547475292818, + "loss": 0.8205, + "num_input_tokens_seen": 28049600, + "step": 48330 + }, + { + "epoch": 7.1991361334524875, + "grad_norm": 0.0169677734375, + "learning_rate": 0.024237011418096196, + "loss": 0.819, + "num_input_tokens_seen": 28052544, + "step": 48335 + }, + { + "epoch": 7.199880845993446, + "grad_norm": 0.019287109375, + "learning_rate": 0.02423547520484965, + "loss": 0.8289, + "num_input_tokens_seen": 28055680, + "step": 48340 + }, + { + "epoch": 7.200625558534406, + "grad_norm": 0.0225830078125, + "learning_rate": 0.024233938835579144, + "loss": 0.7781, + "num_input_tokens_seen": 28058592, + "step": 48345 + }, + { + "epoch": 7.201370271075365, + "grad_norm": 0.01434326171875, + "learning_rate": 0.02423240231031062, + "loss": 0.7972, + "num_input_tokens_seen": 28061504, + "step": 48350 + }, + { + "epoch": 7.202114983616324, + "grad_norm": 0.0264892578125, + "learning_rate": 0.024230865629070047, + "loss": 0.8221, + "num_input_tokens_seen": 28064320, + "step": 48355 + }, + { + "epoch": 7.202859696157283, + "grad_norm": 0.037841796875, + "learning_rate": 0.024229328791883376, + "loss": 0.8061, + "num_input_tokens_seen": 28067232, + "step": 48360 + }, + { + "epoch": 7.203604408698243, + "grad_norm": 0.0260009765625, + "learning_rate": 0.024227791798776578, + "loss": 0.8202, + "num_input_tokens_seen": 28070400, + "step": 48365 + }, + { + "epoch": 7.2043491212392015, + "grad_norm": 0.024169921875, + "learning_rate": 0.024226254649775614, + "loss": 0.8049, + "num_input_tokens_seen": 28073152, + "step": 48370 + }, + { + "epoch": 7.205093833780161, + "grad_norm": 0.0198974609375, + "learning_rate": 0.024224717344906457, + "loss": 0.8119, + "num_input_tokens_seen": 28075808, + "step": 48375 + }, + { + "epoch": 7.20583854632112, + "grad_norm": 0.0235595703125, + "learning_rate": 0.024223179884195076, + "loss": 0.798, + "num_input_tokens_seen": 28079008, + "step": 48380 + }, + { + "epoch": 7.2065832588620795, + "grad_norm": 0.01495361328125, + "learning_rate": 0.024221642267667445, + "loss": 0.7915, + "num_input_tokens_seen": 28081664, + "step": 48385 + }, + { + "epoch": 7.207327971403038, + "grad_norm": 0.0263671875, + "learning_rate": 0.024220104495349542, + "loss": 0.8018, + "num_input_tokens_seen": 28084256, + "step": 48390 + }, + { + "epoch": 7.208072683943998, + "grad_norm": 0.0277099609375, + "learning_rate": 0.02421856656726734, + "loss": 0.7945, + "num_input_tokens_seen": 28087136, + "step": 48395 + }, + { + "epoch": 7.208817396484957, + "grad_norm": 0.0244140625, + "learning_rate": 0.024217028483446833, + "loss": 0.8266, + "num_input_tokens_seen": 28090112, + "step": 48400 + }, + { + "epoch": 7.209562109025916, + "grad_norm": 0.03955078125, + "learning_rate": 0.02421549024391399, + "loss": 0.8306, + "num_input_tokens_seen": 28092736, + "step": 48405 + }, + { + "epoch": 7.210306821566875, + "grad_norm": 0.033447265625, + "learning_rate": 0.024213951848694806, + "loss": 0.8212, + "num_input_tokens_seen": 28095616, + "step": 48410 + }, + { + "epoch": 7.211051534107835, + "grad_norm": 0.0225830078125, + "learning_rate": 0.02421241329781527, + "loss": 0.8037, + "num_input_tokens_seen": 28098400, + "step": 48415 + }, + { + "epoch": 7.2117962466487935, + "grad_norm": 0.0277099609375, + "learning_rate": 0.02421087459130138, + "loss": 0.8105, + "num_input_tokens_seen": 28101280, + "step": 48420 + }, + { + "epoch": 7.212540959189753, + "grad_norm": 0.0272216796875, + "learning_rate": 0.024209335729179122, + "loss": 0.8044, + "num_input_tokens_seen": 28104416, + "step": 48425 + }, + { + "epoch": 7.213285671730712, + "grad_norm": 0.02294921875, + "learning_rate": 0.024207796711474498, + "loss": 0.8155, + "num_input_tokens_seen": 28107040, + "step": 48430 + }, + { + "epoch": 7.2140303842716715, + "grad_norm": 0.02001953125, + "learning_rate": 0.024206257538213503, + "loss": 0.7955, + "num_input_tokens_seen": 28109760, + "step": 48435 + }, + { + "epoch": 7.21477509681263, + "grad_norm": 0.0234375, + "learning_rate": 0.024204718209422146, + "loss": 0.8017, + "num_input_tokens_seen": 28112608, + "step": 48440 + }, + { + "epoch": 7.21551980935359, + "grad_norm": 0.0205078125, + "learning_rate": 0.02420317872512643, + "loss": 0.7947, + "num_input_tokens_seen": 28115616, + "step": 48445 + }, + { + "epoch": 7.216264521894549, + "grad_norm": 0.025390625, + "learning_rate": 0.024201639085352365, + "loss": 0.8145, + "num_input_tokens_seen": 28118464, + "step": 48450 + }, + { + "epoch": 7.217009234435508, + "grad_norm": 0.03955078125, + "learning_rate": 0.024200099290125957, + "loss": 0.8182, + "num_input_tokens_seen": 28121312, + "step": 48455 + }, + { + "epoch": 7.217753946976467, + "grad_norm": 0.031982421875, + "learning_rate": 0.024198559339473225, + "loss": 0.8174, + "num_input_tokens_seen": 28124256, + "step": 48460 + }, + { + "epoch": 7.218498659517426, + "grad_norm": 0.0322265625, + "learning_rate": 0.02419701923342018, + "loss": 0.8235, + "num_input_tokens_seen": 28127040, + "step": 48465 + }, + { + "epoch": 7.2192433720583855, + "grad_norm": 0.0235595703125, + "learning_rate": 0.02419547897199284, + "loss": 0.8118, + "num_input_tokens_seen": 28129920, + "step": 48470 + }, + { + "epoch": 7.219988084599344, + "grad_norm": 0.03759765625, + "learning_rate": 0.02419393855521723, + "loss": 0.7999, + "num_input_tokens_seen": 28132672, + "step": 48475 + }, + { + "epoch": 7.220732797140304, + "grad_norm": 0.0283203125, + "learning_rate": 0.024192397983119375, + "loss": 0.7975, + "num_input_tokens_seen": 28135392, + "step": 48480 + }, + { + "epoch": 7.221477509681263, + "grad_norm": 0.023681640625, + "learning_rate": 0.024190857255725298, + "loss": 0.7946, + "num_input_tokens_seen": 28138144, + "step": 48485 + }, + { + "epoch": 7.222222222222222, + "grad_norm": 0.021728515625, + "learning_rate": 0.02418931637306103, + "loss": 0.7971, + "num_input_tokens_seen": 28140992, + "step": 48490 + }, + { + "epoch": 7.222966934763181, + "grad_norm": 0.028076171875, + "learning_rate": 0.0241877753351526, + "loss": 0.8048, + "num_input_tokens_seen": 28143936, + "step": 48495 + }, + { + "epoch": 7.223711647304141, + "grad_norm": 0.0242919921875, + "learning_rate": 0.02418623414202604, + "loss": 0.8105, + "num_input_tokens_seen": 28146912, + "step": 48500 + }, + { + "epoch": 7.2244563598450995, + "grad_norm": 0.0205078125, + "learning_rate": 0.024184692793707396, + "loss": 0.7864, + "num_input_tokens_seen": 28149792, + "step": 48505 + }, + { + "epoch": 7.225201072386059, + "grad_norm": 0.0191650390625, + "learning_rate": 0.024183151290222703, + "loss": 0.7977, + "num_input_tokens_seen": 28152864, + "step": 48510 + }, + { + "epoch": 7.225945784927018, + "grad_norm": 0.013427734375, + "learning_rate": 0.024181609631597996, + "loss": 0.7838, + "num_input_tokens_seen": 28155744, + "step": 48515 + }, + { + "epoch": 7.2266904974679775, + "grad_norm": 0.0233154296875, + "learning_rate": 0.02418006781785933, + "loss": 0.8019, + "num_input_tokens_seen": 28158816, + "step": 48520 + }, + { + "epoch": 7.227435210008936, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02417852584903275, + "loss": 0.8058, + "num_input_tokens_seen": 28161696, + "step": 48525 + }, + { + "epoch": 7.228179922549896, + "grad_norm": 0.027099609375, + "learning_rate": 0.0241769837251443, + "loss": 0.8109, + "num_input_tokens_seen": 28164448, + "step": 48530 + }, + { + "epoch": 7.228924635090855, + "grad_norm": 0.029052734375, + "learning_rate": 0.02417544144622004, + "loss": 0.7948, + "num_input_tokens_seen": 28167264, + "step": 48535 + }, + { + "epoch": 7.229669347631814, + "grad_norm": 0.026123046875, + "learning_rate": 0.024173899012286025, + "loss": 0.8304, + "num_input_tokens_seen": 28170176, + "step": 48540 + }, + { + "epoch": 7.230414060172773, + "grad_norm": 0.038818359375, + "learning_rate": 0.024172356423368308, + "loss": 0.8222, + "num_input_tokens_seen": 28173056, + "step": 48545 + }, + { + "epoch": 7.231158772713733, + "grad_norm": 0.0294189453125, + "learning_rate": 0.024170813679492947, + "loss": 0.8009, + "num_input_tokens_seen": 28176224, + "step": 48550 + }, + { + "epoch": 7.2319034852546915, + "grad_norm": 0.02490234375, + "learning_rate": 0.024169270780686015, + "loss": 0.7992, + "num_input_tokens_seen": 28179104, + "step": 48555 + }, + { + "epoch": 7.232648197795651, + "grad_norm": 0.022705078125, + "learning_rate": 0.02416772772697357, + "loss": 0.7933, + "num_input_tokens_seen": 28182016, + "step": 48560 + }, + { + "epoch": 7.23339291033661, + "grad_norm": 0.0267333984375, + "learning_rate": 0.02416618451838168, + "loss": 0.791, + "num_input_tokens_seen": 28184832, + "step": 48565 + }, + { + "epoch": 7.23413762287757, + "grad_norm": 0.0216064453125, + "learning_rate": 0.024164641154936425, + "loss": 0.807, + "num_input_tokens_seen": 28187616, + "step": 48570 + }, + { + "epoch": 7.234882335418528, + "grad_norm": 0.0289306640625, + "learning_rate": 0.02416309763666387, + "loss": 0.8177, + "num_input_tokens_seen": 28190464, + "step": 48575 + }, + { + "epoch": 7.235627047959488, + "grad_norm": 0.04443359375, + "learning_rate": 0.02416155396359009, + "loss": 0.7994, + "num_input_tokens_seen": 28193248, + "step": 48580 + }, + { + "epoch": 7.236371760500447, + "grad_norm": 0.0281982421875, + "learning_rate": 0.02416001013574117, + "loss": 0.8038, + "num_input_tokens_seen": 28196128, + "step": 48585 + }, + { + "epoch": 7.237116473041406, + "grad_norm": 0.0286865234375, + "learning_rate": 0.02415846615314319, + "loss": 0.7874, + "num_input_tokens_seen": 28199040, + "step": 48590 + }, + { + "epoch": 7.237861185582365, + "grad_norm": 0.0235595703125, + "learning_rate": 0.02415692201582223, + "loss": 0.7998, + "num_input_tokens_seen": 28202368, + "step": 48595 + }, + { + "epoch": 7.238605898123325, + "grad_norm": 0.01904296875, + "learning_rate": 0.02415537772380438, + "loss": 0.7911, + "num_input_tokens_seen": 28205312, + "step": 48600 + }, + { + "epoch": 7.2393506106642835, + "grad_norm": 0.02001953125, + "learning_rate": 0.024153833277115733, + "loss": 0.8056, + "num_input_tokens_seen": 28208320, + "step": 48605 + }, + { + "epoch": 7.240095323205243, + "grad_norm": 0.0244140625, + "learning_rate": 0.02415228867578237, + "loss": 0.7978, + "num_input_tokens_seen": 28211168, + "step": 48610 + }, + { + "epoch": 7.240840035746202, + "grad_norm": 0.029052734375, + "learning_rate": 0.02415074391983039, + "loss": 0.803, + "num_input_tokens_seen": 28214080, + "step": 48615 + }, + { + "epoch": 7.241584748287162, + "grad_norm": 0.02099609375, + "learning_rate": 0.024149199009285898, + "loss": 0.8114, + "num_input_tokens_seen": 28216832, + "step": 48620 + }, + { + "epoch": 7.24232946082812, + "grad_norm": 0.0245361328125, + "learning_rate": 0.02414765394417499, + "loss": 0.7871, + "num_input_tokens_seen": 28219648, + "step": 48625 + }, + { + "epoch": 7.243074173369079, + "grad_norm": 0.021484375, + "learning_rate": 0.024146108724523756, + "loss": 0.7966, + "num_input_tokens_seen": 28222496, + "step": 48630 + }, + { + "epoch": 7.243818885910039, + "grad_norm": 0.05078125, + "learning_rate": 0.024144563350358316, + "loss": 0.844, + "num_input_tokens_seen": 28225984, + "step": 48635 + }, + { + "epoch": 7.2445635984509975, + "grad_norm": 0.0225830078125, + "learning_rate": 0.024143017821704773, + "loss": 0.7991, + "num_input_tokens_seen": 28228992, + "step": 48640 + }, + { + "epoch": 7.245308310991957, + "grad_norm": 0.03125, + "learning_rate": 0.024141472138589238, + "loss": 0.7967, + "num_input_tokens_seen": 28232288, + "step": 48645 + }, + { + "epoch": 7.246053023532916, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02413992630103782, + "loss": 0.8152, + "num_input_tokens_seen": 28235168, + "step": 48650 + }, + { + "epoch": 7.246797736073876, + "grad_norm": 0.01092529296875, + "learning_rate": 0.024138380309076633, + "loss": 0.8004, + "num_input_tokens_seen": 28238368, + "step": 48655 + }, + { + "epoch": 7.247542448614834, + "grad_norm": 0.023193359375, + "learning_rate": 0.0241368341627318, + "loss": 0.8029, + "num_input_tokens_seen": 28241376, + "step": 48660 + }, + { + "epoch": 7.248287161155794, + "grad_norm": 0.01507568359375, + "learning_rate": 0.024135287862029446, + "loss": 0.7949, + "num_input_tokens_seen": 28244384, + "step": 48665 + }, + { + "epoch": 7.249031873696753, + "grad_norm": 0.0235595703125, + "learning_rate": 0.024133741406995684, + "loss": 0.8155, + "num_input_tokens_seen": 28247168, + "step": 48670 + }, + { + "epoch": 7.249776586237712, + "grad_norm": 0.0223388671875, + "learning_rate": 0.024132194797656645, + "loss": 0.7929, + "num_input_tokens_seen": 28250048, + "step": 48675 + }, + { + "epoch": 7.250521298778671, + "grad_norm": 0.0216064453125, + "learning_rate": 0.024130648034038454, + "loss": 0.793, + "num_input_tokens_seen": 28253120, + "step": 48680 + }, + { + "epoch": 7.251266011319631, + "grad_norm": 0.0205078125, + "learning_rate": 0.024129101116167246, + "loss": 0.8232, + "num_input_tokens_seen": 28256064, + "step": 48685 + }, + { + "epoch": 7.2520107238605895, + "grad_norm": 0.0220947265625, + "learning_rate": 0.024127554044069154, + "loss": 0.8074, + "num_input_tokens_seen": 28258912, + "step": 48690 + }, + { + "epoch": 7.252755436401549, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02412600681777031, + "loss": 0.8031, + "num_input_tokens_seen": 28261792, + "step": 48695 + }, + { + "epoch": 7.253500148942508, + "grad_norm": 0.0252685546875, + "learning_rate": 0.02412445943729686, + "loss": 0.7954, + "num_input_tokens_seen": 28264768, + "step": 48700 + }, + { + "epoch": 7.254244861483468, + "grad_norm": 0.02099609375, + "learning_rate": 0.02412291190267494, + "loss": 0.788, + "num_input_tokens_seen": 28268160, + "step": 48705 + }, + { + "epoch": 7.254989574024426, + "grad_norm": 0.0205078125, + "learning_rate": 0.024121364213930695, + "loss": 0.7911, + "num_input_tokens_seen": 28270784, + "step": 48710 + }, + { + "epoch": 7.255734286565386, + "grad_norm": 0.0230712890625, + "learning_rate": 0.024119816371090275, + "loss": 0.8473, + "num_input_tokens_seen": 28273536, + "step": 48715 + }, + { + "epoch": 7.256478999106345, + "grad_norm": 0.0162353515625, + "learning_rate": 0.024118268374179824, + "loss": 0.8196, + "num_input_tokens_seen": 28276512, + "step": 48720 + }, + { + "epoch": 7.257223711647304, + "grad_norm": 0.0224609375, + "learning_rate": 0.024116720223225495, + "loss": 0.8212, + "num_input_tokens_seen": 28279552, + "step": 48725 + }, + { + "epoch": 7.257968424188263, + "grad_norm": 0.0201416015625, + "learning_rate": 0.024115171918253443, + "loss": 0.7929, + "num_input_tokens_seen": 28282240, + "step": 48730 + }, + { + "epoch": 7.258713136729223, + "grad_norm": 0.02197265625, + "learning_rate": 0.02411362345928983, + "loss": 0.7884, + "num_input_tokens_seen": 28284896, + "step": 48735 + }, + { + "epoch": 7.259457849270182, + "grad_norm": 0.02099609375, + "learning_rate": 0.024112074846360812, + "loss": 0.7848, + "num_input_tokens_seen": 28287808, + "step": 48740 + }, + { + "epoch": 7.260202561811141, + "grad_norm": 0.01153564453125, + "learning_rate": 0.024110526079492545, + "loss": 0.7903, + "num_input_tokens_seen": 28290528, + "step": 48745 + }, + { + "epoch": 7.2609472743521, + "grad_norm": 0.01708984375, + "learning_rate": 0.0241089771587112, + "loss": 0.7919, + "num_input_tokens_seen": 28293664, + "step": 48750 + }, + { + "epoch": 7.26169198689306, + "grad_norm": 0.020263671875, + "learning_rate": 0.02410742808404295, + "loss": 0.7718, + "num_input_tokens_seen": 28296640, + "step": 48755 + }, + { + "epoch": 7.262436699434018, + "grad_norm": 0.0181884765625, + "learning_rate": 0.024105878855513952, + "loss": 0.7918, + "num_input_tokens_seen": 28299584, + "step": 48760 + }, + { + "epoch": 7.263181411974978, + "grad_norm": 0.01055908203125, + "learning_rate": 0.024104329473150388, + "loss": 0.8108, + "num_input_tokens_seen": 28302400, + "step": 48765 + }, + { + "epoch": 7.263926124515937, + "grad_norm": 0.018310546875, + "learning_rate": 0.02410277993697843, + "loss": 0.7773, + "num_input_tokens_seen": 28305504, + "step": 48770 + }, + { + "epoch": 7.264670837056896, + "grad_norm": 0.020263671875, + "learning_rate": 0.024101230247024256, + "loss": 0.7928, + "num_input_tokens_seen": 28308480, + "step": 48775 + }, + { + "epoch": 7.265415549597855, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02409968040331405, + "loss": 0.7929, + "num_input_tokens_seen": 28311584, + "step": 48780 + }, + { + "epoch": 7.266160262138815, + "grad_norm": 0.024169921875, + "learning_rate": 0.024098130405873993, + "loss": 0.8005, + "num_input_tokens_seen": 28315168, + "step": 48785 + }, + { + "epoch": 7.266904974679774, + "grad_norm": 0.01953125, + "learning_rate": 0.024096580254730268, + "loss": 0.789, + "num_input_tokens_seen": 28318048, + "step": 48790 + }, + { + "epoch": 7.267649687220732, + "grad_norm": 0.0128173828125, + "learning_rate": 0.02409502994990906, + "loss": 0.8124, + "num_input_tokens_seen": 28320800, + "step": 48795 + }, + { + "epoch": 7.268394399761692, + "grad_norm": 0.033935546875, + "learning_rate": 0.024093479491436573, + "loss": 0.7739, + "num_input_tokens_seen": 28323584, + "step": 48800 + }, + { + "epoch": 7.269139112302652, + "grad_norm": 0.0269775390625, + "learning_rate": 0.02409192887933899, + "loss": 0.8233, + "num_input_tokens_seen": 28326368, + "step": 48805 + }, + { + "epoch": 7.26988382484361, + "grad_norm": 0.02880859375, + "learning_rate": 0.024090378113642508, + "loss": 0.8069, + "num_input_tokens_seen": 28328960, + "step": 48810 + }, + { + "epoch": 7.270628537384569, + "grad_norm": 0.020263671875, + "learning_rate": 0.024088827194373323, + "loss": 0.789, + "num_input_tokens_seen": 28331936, + "step": 48815 + }, + { + "epoch": 7.271373249925529, + "grad_norm": 0.018310546875, + "learning_rate": 0.024087276121557644, + "loss": 0.7986, + "num_input_tokens_seen": 28334976, + "step": 48820 + }, + { + "epoch": 7.272117962466488, + "grad_norm": 0.0208740234375, + "learning_rate": 0.02408572489522167, + "loss": 0.8023, + "num_input_tokens_seen": 28337888, + "step": 48825 + }, + { + "epoch": 7.272862675007447, + "grad_norm": 0.0228271484375, + "learning_rate": 0.024084173515391608, + "loss": 0.7983, + "num_input_tokens_seen": 28340896, + "step": 48830 + }, + { + "epoch": 7.273607387548406, + "grad_norm": 0.018798828125, + "learning_rate": 0.02408262198209367, + "loss": 0.7889, + "num_input_tokens_seen": 28343616, + "step": 48835 + }, + { + "epoch": 7.274352100089366, + "grad_norm": 0.03173828125, + "learning_rate": 0.024081070295354064, + "loss": 0.8436, + "num_input_tokens_seen": 28346368, + "step": 48840 + }, + { + "epoch": 7.275096812630324, + "grad_norm": 0.0235595703125, + "learning_rate": 0.024079518455199, + "loss": 0.7781, + "num_input_tokens_seen": 28349344, + "step": 48845 + }, + { + "epoch": 7.275841525171284, + "grad_norm": 0.0157470703125, + "learning_rate": 0.024077966461654703, + "loss": 0.7979, + "num_input_tokens_seen": 28352768, + "step": 48850 + }, + { + "epoch": 7.276586237712243, + "grad_norm": 0.0269775390625, + "learning_rate": 0.02407641431474739, + "loss": 0.795, + "num_input_tokens_seen": 28355360, + "step": 48855 + }, + { + "epoch": 7.277330950253202, + "grad_norm": 0.021240234375, + "learning_rate": 0.02407486201450328, + "loss": 0.792, + "num_input_tokens_seen": 28358400, + "step": 48860 + }, + { + "epoch": 7.278075662794161, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0240733095609486, + "loss": 0.8319, + "num_input_tokens_seen": 28361344, + "step": 48865 + }, + { + "epoch": 7.278820375335121, + "grad_norm": 0.0250244140625, + "learning_rate": 0.02407175695410958, + "loss": 0.8017, + "num_input_tokens_seen": 28364512, + "step": 48870 + }, + { + "epoch": 7.27956508787608, + "grad_norm": 0.01507568359375, + "learning_rate": 0.024070204194012446, + "loss": 0.7858, + "num_input_tokens_seen": 28367264, + "step": 48875 + }, + { + "epoch": 7.280309800417039, + "grad_norm": 0.0225830078125, + "learning_rate": 0.024068651280683428, + "loss": 0.8181, + "num_input_tokens_seen": 28370240, + "step": 48880 + }, + { + "epoch": 7.281054512957998, + "grad_norm": 0.0225830078125, + "learning_rate": 0.02406709821414876, + "loss": 0.7912, + "num_input_tokens_seen": 28373216, + "step": 48885 + }, + { + "epoch": 7.281799225498958, + "grad_norm": 0.023681640625, + "learning_rate": 0.02406554499443469, + "loss": 0.8048, + "num_input_tokens_seen": 28375872, + "step": 48890 + }, + { + "epoch": 7.282543938039916, + "grad_norm": 0.01409912109375, + "learning_rate": 0.024063991621567447, + "loss": 0.7816, + "num_input_tokens_seen": 28378912, + "step": 48895 + }, + { + "epoch": 7.283288650580876, + "grad_norm": 0.01458740234375, + "learning_rate": 0.02406243809557328, + "loss": 0.7933, + "num_input_tokens_seen": 28381728, + "step": 48900 + }, + { + "epoch": 7.284033363121835, + "grad_norm": 0.02392578125, + "learning_rate": 0.02406088441647843, + "loss": 0.808, + "num_input_tokens_seen": 28384832, + "step": 48905 + }, + { + "epoch": 7.2847780756627944, + "grad_norm": 0.05126953125, + "learning_rate": 0.024059330584309147, + "loss": 0.8318, + "num_input_tokens_seen": 28387520, + "step": 48910 + }, + { + "epoch": 7.285522788203753, + "grad_norm": 0.0269775390625, + "learning_rate": 0.024057776599091682, + "loss": 0.791, + "num_input_tokens_seen": 28390816, + "step": 48915 + }, + { + "epoch": 7.286267500744713, + "grad_norm": 0.026611328125, + "learning_rate": 0.02405622246085229, + "loss": 0.7923, + "num_input_tokens_seen": 28393984, + "step": 48920 + }, + { + "epoch": 7.287012213285672, + "grad_norm": 0.01531982421875, + "learning_rate": 0.02405466816961722, + "loss": 0.8113, + "num_input_tokens_seen": 28396736, + "step": 48925 + }, + { + "epoch": 7.287756925826631, + "grad_norm": 0.01806640625, + "learning_rate": 0.024053113725412734, + "loss": 0.7927, + "num_input_tokens_seen": 28399680, + "step": 48930 + }, + { + "epoch": 7.28850163836759, + "grad_norm": 0.0205078125, + "learning_rate": 0.024051559128265097, + "loss": 0.8276, + "num_input_tokens_seen": 28402336, + "step": 48935 + }, + { + "epoch": 7.28924635090855, + "grad_norm": 0.0181884765625, + "learning_rate": 0.024050004378200564, + "loss": 0.8083, + "num_input_tokens_seen": 28404928, + "step": 48940 + }, + { + "epoch": 7.289991063449508, + "grad_norm": 0.0218505859375, + "learning_rate": 0.024048449475245405, + "loss": 0.8049, + "num_input_tokens_seen": 28407936, + "step": 48945 + }, + { + "epoch": 7.290735775990468, + "grad_norm": 0.02783203125, + "learning_rate": 0.024046894419425893, + "loss": 0.788, + "num_input_tokens_seen": 28410816, + "step": 48950 + }, + { + "epoch": 7.291480488531427, + "grad_norm": 0.01385498046875, + "learning_rate": 0.02404533921076829, + "loss": 0.8277, + "num_input_tokens_seen": 28413824, + "step": 48955 + }, + { + "epoch": 7.292225201072386, + "grad_norm": 0.01953125, + "learning_rate": 0.024043783849298877, + "loss": 0.7953, + "num_input_tokens_seen": 28416928, + "step": 48960 + }, + { + "epoch": 7.292969913613345, + "grad_norm": 0.0234375, + "learning_rate": 0.024042228335043922, + "loss": 0.8081, + "num_input_tokens_seen": 28420000, + "step": 48965 + }, + { + "epoch": 7.293714626154305, + "grad_norm": 0.02197265625, + "learning_rate": 0.024040672668029717, + "loss": 0.8024, + "num_input_tokens_seen": 28422976, + "step": 48970 + }, + { + "epoch": 7.294459338695264, + "grad_norm": 0.0135498046875, + "learning_rate": 0.024039116848282534, + "loss": 0.7952, + "num_input_tokens_seen": 28426208, + "step": 48975 + }, + { + "epoch": 7.295204051236222, + "grad_norm": 0.0216064453125, + "learning_rate": 0.024037560875828656, + "loss": 0.7926, + "num_input_tokens_seen": 28428992, + "step": 48980 + }, + { + "epoch": 7.295948763777182, + "grad_norm": 0.027587890625, + "learning_rate": 0.024036004750694374, + "loss": 0.7877, + "num_input_tokens_seen": 28431840, + "step": 48985 + }, + { + "epoch": 7.296693476318141, + "grad_norm": 0.02294921875, + "learning_rate": 0.024034448472905975, + "loss": 0.7881, + "num_input_tokens_seen": 28435008, + "step": 48990 + }, + { + "epoch": 7.2974381888591004, + "grad_norm": 0.02392578125, + "learning_rate": 0.02403289204248975, + "loss": 0.7915, + "num_input_tokens_seen": 28438176, + "step": 48995 + }, + { + "epoch": 7.298182901400059, + "grad_norm": 0.016845703125, + "learning_rate": 0.024031335459471996, + "loss": 0.8158, + "num_input_tokens_seen": 28441472, + "step": 49000 + }, + { + "epoch": 7.298927613941019, + "grad_norm": 0.0234375, + "learning_rate": 0.02402977872387901, + "loss": 0.7826, + "num_input_tokens_seen": 28444288, + "step": 49005 + }, + { + "epoch": 7.299672326481978, + "grad_norm": 0.0181884765625, + "learning_rate": 0.024028221835737086, + "loss": 0.8027, + "num_input_tokens_seen": 28446944, + "step": 49010 + }, + { + "epoch": 7.300417039022937, + "grad_norm": 0.0179443359375, + "learning_rate": 0.024026664795072532, + "loss": 0.7947, + "num_input_tokens_seen": 28449888, + "step": 49015 + }, + { + "epoch": 7.301161751563896, + "grad_norm": 0.0198974609375, + "learning_rate": 0.024025107601911653, + "loss": 0.8005, + "num_input_tokens_seen": 28452768, + "step": 49020 + }, + { + "epoch": 7.301906464104856, + "grad_norm": 0.01708984375, + "learning_rate": 0.02402355025628075, + "loss": 0.7862, + "num_input_tokens_seen": 28455872, + "step": 49025 + }, + { + "epoch": 7.302651176645814, + "grad_norm": 0.0177001953125, + "learning_rate": 0.02402199275820614, + "loss": 0.8009, + "num_input_tokens_seen": 28458880, + "step": 49030 + }, + { + "epoch": 7.303395889186774, + "grad_norm": 0.01287841796875, + "learning_rate": 0.02402043510771413, + "loss": 0.8101, + "num_input_tokens_seen": 28461696, + "step": 49035 + }, + { + "epoch": 7.304140601727733, + "grad_norm": 0.0177001953125, + "learning_rate": 0.024018877304831036, + "loss": 0.8164, + "num_input_tokens_seen": 28464768, + "step": 49040 + }, + { + "epoch": 7.3048853142686925, + "grad_norm": 0.0302734375, + "learning_rate": 0.024017319349583177, + "loss": 0.7996, + "num_input_tokens_seen": 28467552, + "step": 49045 + }, + { + "epoch": 7.305630026809651, + "grad_norm": 0.0145263671875, + "learning_rate": 0.024015761241996874, + "loss": 0.8005, + "num_input_tokens_seen": 28470592, + "step": 49050 + }, + { + "epoch": 7.306374739350611, + "grad_norm": 0.01806640625, + "learning_rate": 0.02401420298209845, + "loss": 0.7676, + "num_input_tokens_seen": 28473376, + "step": 49055 + }, + { + "epoch": 7.30711945189157, + "grad_norm": 0.02392578125, + "learning_rate": 0.024012644569914223, + "loss": 0.8038, + "num_input_tokens_seen": 28476160, + "step": 49060 + }, + { + "epoch": 7.307864164432529, + "grad_norm": 0.0184326171875, + "learning_rate": 0.024011086005470524, + "loss": 0.7956, + "num_input_tokens_seen": 28478816, + "step": 49065 + }, + { + "epoch": 7.308608876973488, + "grad_norm": 0.01806640625, + "learning_rate": 0.024009527288793693, + "loss": 0.8065, + "num_input_tokens_seen": 28481920, + "step": 49070 + }, + { + "epoch": 7.309353589514448, + "grad_norm": 0.021728515625, + "learning_rate": 0.02400796841991005, + "loss": 0.7997, + "num_input_tokens_seen": 28484704, + "step": 49075 + }, + { + "epoch": 7.3100983020554064, + "grad_norm": 0.0283203125, + "learning_rate": 0.02400640939884594, + "loss": 0.8052, + "num_input_tokens_seen": 28487616, + "step": 49080 + }, + { + "epoch": 7.310843014596366, + "grad_norm": 0.013671875, + "learning_rate": 0.024004850225627695, + "loss": 0.8219, + "num_input_tokens_seen": 28490656, + "step": 49085 + }, + { + "epoch": 7.311587727137325, + "grad_norm": 0.015869140625, + "learning_rate": 0.02400329090028166, + "loss": 0.782, + "num_input_tokens_seen": 28493888, + "step": 49090 + }, + { + "epoch": 7.3123324396782845, + "grad_norm": 0.019775390625, + "learning_rate": 0.024001731422834165, + "loss": 0.7885, + "num_input_tokens_seen": 28496864, + "step": 49095 + }, + { + "epoch": 7.313077152219243, + "grad_norm": 0.0257568359375, + "learning_rate": 0.024000171793311577, + "loss": 0.8253, + "num_input_tokens_seen": 28499648, + "step": 49100 + }, + { + "epoch": 7.313821864760203, + "grad_norm": 0.01904296875, + "learning_rate": 0.02399861201174023, + "loss": 0.7977, + "num_input_tokens_seen": 28502592, + "step": 49105 + }, + { + "epoch": 7.314566577301162, + "grad_norm": 0.023193359375, + "learning_rate": 0.023997052078146484, + "loss": 0.8129, + "num_input_tokens_seen": 28505344, + "step": 49110 + }, + { + "epoch": 7.315311289842121, + "grad_norm": 0.0260009765625, + "learning_rate": 0.023995491992556683, + "loss": 0.7907, + "num_input_tokens_seen": 28508032, + "step": 49115 + }, + { + "epoch": 7.31605600238308, + "grad_norm": 0.01275634765625, + "learning_rate": 0.023993931754997186, + "loss": 0.8018, + "num_input_tokens_seen": 28511136, + "step": 49120 + }, + { + "epoch": 7.31680071492404, + "grad_norm": 0.03662109375, + "learning_rate": 0.023992371365494355, + "loss": 0.8233, + "num_input_tokens_seen": 28513888, + "step": 49125 + }, + { + "epoch": 7.3175454274649985, + "grad_norm": 0.0218505859375, + "learning_rate": 0.023990810824074546, + "loss": 0.7832, + "num_input_tokens_seen": 28517024, + "step": 49130 + }, + { + "epoch": 7.318290140005958, + "grad_norm": 0.023193359375, + "learning_rate": 0.02398925013076413, + "loss": 0.8214, + "num_input_tokens_seen": 28519680, + "step": 49135 + }, + { + "epoch": 7.319034852546917, + "grad_norm": 0.031494140625, + "learning_rate": 0.02398768928558947, + "loss": 0.777, + "num_input_tokens_seen": 28522752, + "step": 49140 + }, + { + "epoch": 7.319779565087876, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02398612828857693, + "loss": 0.8062, + "num_input_tokens_seen": 28525824, + "step": 49145 + }, + { + "epoch": 7.320524277628835, + "grad_norm": 0.0235595703125, + "learning_rate": 0.023984567139752888, + "loss": 0.78, + "num_input_tokens_seen": 28528864, + "step": 49150 + }, + { + "epoch": 7.321268990169794, + "grad_norm": 0.0235595703125, + "learning_rate": 0.023983005839143718, + "loss": 0.8115, + "num_input_tokens_seen": 28531680, + "step": 49155 + }, + { + "epoch": 7.322013702710754, + "grad_norm": 0.0147705078125, + "learning_rate": 0.023981444386775793, + "loss": 0.7872, + "num_input_tokens_seen": 28534432, + "step": 49160 + }, + { + "epoch": 7.3227584152517124, + "grad_norm": 0.0267333984375, + "learning_rate": 0.023979882782675488, + "loss": 0.8102, + "num_input_tokens_seen": 28537440, + "step": 49165 + }, + { + "epoch": 7.323503127792672, + "grad_norm": 0.0201416015625, + "learning_rate": 0.023978321026869198, + "loss": 0.8012, + "num_input_tokens_seen": 28540608, + "step": 49170 + }, + { + "epoch": 7.324247840333631, + "grad_norm": 0.0220947265625, + "learning_rate": 0.023976759119383295, + "loss": 0.7925, + "num_input_tokens_seen": 28543520, + "step": 49175 + }, + { + "epoch": 7.3249925528745905, + "grad_norm": 0.0260009765625, + "learning_rate": 0.02397519706024417, + "loss": 0.8288, + "num_input_tokens_seen": 28546400, + "step": 49180 + }, + { + "epoch": 7.325737265415549, + "grad_norm": 0.019775390625, + "learning_rate": 0.023973634849478218, + "loss": 0.8127, + "num_input_tokens_seen": 28549408, + "step": 49185 + }, + { + "epoch": 7.326481977956509, + "grad_norm": 0.026123046875, + "learning_rate": 0.023972072487111824, + "loss": 0.8017, + "num_input_tokens_seen": 28552416, + "step": 49190 + }, + { + "epoch": 7.327226690497468, + "grad_norm": 0.0277099609375, + "learning_rate": 0.023970509973171378, + "loss": 0.812, + "num_input_tokens_seen": 28555392, + "step": 49195 + }, + { + "epoch": 7.327971403038427, + "grad_norm": 0.0242919921875, + "learning_rate": 0.023968947307683286, + "loss": 0.8013, + "num_input_tokens_seen": 28558336, + "step": 49200 + }, + { + "epoch": 7.328716115579386, + "grad_norm": 0.0211181640625, + "learning_rate": 0.02396738449067395, + "loss": 0.7954, + "num_input_tokens_seen": 28561056, + "step": 49205 + }, + { + "epoch": 7.329460828120346, + "grad_norm": 0.032958984375, + "learning_rate": 0.023965821522169764, + "loss": 0.8092, + "num_input_tokens_seen": 28563840, + "step": 49210 + }, + { + "epoch": 7.3302055406613045, + "grad_norm": 0.03369140625, + "learning_rate": 0.023964258402197132, + "loss": 0.7977, + "num_input_tokens_seen": 28566624, + "step": 49215 + }, + { + "epoch": 7.330950253202264, + "grad_norm": 0.02783203125, + "learning_rate": 0.023962695130782467, + "loss": 0.7891, + "num_input_tokens_seen": 28569344, + "step": 49220 + }, + { + "epoch": 7.331694965743223, + "grad_norm": 0.0274658203125, + "learning_rate": 0.023961131707952176, + "loss": 0.833, + "num_input_tokens_seen": 28572096, + "step": 49225 + }, + { + "epoch": 7.3324396782841825, + "grad_norm": 0.0267333984375, + "learning_rate": 0.023959568133732678, + "loss": 0.7908, + "num_input_tokens_seen": 28574944, + "step": 49230 + }, + { + "epoch": 7.333184390825141, + "grad_norm": 0.02099609375, + "learning_rate": 0.023958004408150376, + "loss": 0.7992, + "num_input_tokens_seen": 28577664, + "step": 49235 + }, + { + "epoch": 7.333929103366101, + "grad_norm": 0.030517578125, + "learning_rate": 0.0239564405312317, + "loss": 0.7955, + "num_input_tokens_seen": 28580448, + "step": 49240 + }, + { + "epoch": 7.33467381590706, + "grad_norm": 0.022705078125, + "learning_rate": 0.02395487650300306, + "loss": 0.81, + "num_input_tokens_seen": 28583328, + "step": 49245 + }, + { + "epoch": 7.335418528448019, + "grad_norm": 0.0274658203125, + "learning_rate": 0.02395331232349088, + "loss": 0.7882, + "num_input_tokens_seen": 28586080, + "step": 49250 + }, + { + "epoch": 7.336163240988978, + "grad_norm": 0.0250244140625, + "learning_rate": 0.023951747992721595, + "loss": 0.8103, + "num_input_tokens_seen": 28589280, + "step": 49255 + }, + { + "epoch": 7.336907953529938, + "grad_norm": 0.0223388671875, + "learning_rate": 0.023950183510721625, + "loss": 0.8046, + "num_input_tokens_seen": 28592224, + "step": 49260 + }, + { + "epoch": 7.3376526660708965, + "grad_norm": 0.0255126953125, + "learning_rate": 0.023948618877517397, + "loss": 0.8033, + "num_input_tokens_seen": 28595168, + "step": 49265 + }, + { + "epoch": 7.338397378611856, + "grad_norm": 0.0137939453125, + "learning_rate": 0.02394705409313535, + "loss": 0.8097, + "num_input_tokens_seen": 28598112, + "step": 49270 + }, + { + "epoch": 7.339142091152815, + "grad_norm": 0.0220947265625, + "learning_rate": 0.023945489157601918, + "loss": 0.8229, + "num_input_tokens_seen": 28601152, + "step": 49275 + }, + { + "epoch": 7.3398868036937746, + "grad_norm": 0.01361083984375, + "learning_rate": 0.023943924070943538, + "loss": 0.7804, + "num_input_tokens_seen": 28604192, + "step": 49280 + }, + { + "epoch": 7.340631516234733, + "grad_norm": 0.0296630859375, + "learning_rate": 0.023942358833186652, + "loss": 0.8068, + "num_input_tokens_seen": 28606976, + "step": 49285 + }, + { + "epoch": 7.341376228775693, + "grad_norm": 0.02197265625, + "learning_rate": 0.0239407934443577, + "loss": 0.8313, + "num_input_tokens_seen": 28609632, + "step": 49290 + }, + { + "epoch": 7.342120941316652, + "grad_norm": 0.0269775390625, + "learning_rate": 0.023939227904483128, + "loss": 0.7952, + "num_input_tokens_seen": 28612256, + "step": 49295 + }, + { + "epoch": 7.342865653857611, + "grad_norm": 0.0257568359375, + "learning_rate": 0.02393766221358939, + "loss": 0.8048, + "num_input_tokens_seen": 28615072, + "step": 49300 + }, + { + "epoch": 7.34361036639857, + "grad_norm": 0.0194091796875, + "learning_rate": 0.023936096371702933, + "loss": 0.8179, + "num_input_tokens_seen": 28617856, + "step": 49305 + }, + { + "epoch": 7.344355078939529, + "grad_norm": 0.015625, + "learning_rate": 0.02393453037885021, + "loss": 0.8007, + "num_input_tokens_seen": 28620736, + "step": 49310 + }, + { + "epoch": 7.3450997914804885, + "grad_norm": 0.020751953125, + "learning_rate": 0.023932964235057672, + "loss": 0.7869, + "num_input_tokens_seen": 28623520, + "step": 49315 + }, + { + "epoch": 7.345844504021448, + "grad_norm": 0.021728515625, + "learning_rate": 0.02393139794035179, + "loss": 0.7811, + "num_input_tokens_seen": 28628000, + "step": 49320 + }, + { + "epoch": 7.346589216562407, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02392983149475901, + "loss": 0.8042, + "num_input_tokens_seen": 28631072, + "step": 49325 + }, + { + "epoch": 7.347333929103366, + "grad_norm": 0.0205078125, + "learning_rate": 0.023928264898305806, + "loss": 0.7981, + "num_input_tokens_seen": 28633984, + "step": 49330 + }, + { + "epoch": 7.348078641644325, + "grad_norm": 0.019775390625, + "learning_rate": 0.02392669815101864, + "loss": 0.8146, + "num_input_tokens_seen": 28636768, + "step": 49335 + }, + { + "epoch": 7.348823354185284, + "grad_norm": 0.03662109375, + "learning_rate": 0.023925131252923986, + "loss": 0.791, + "num_input_tokens_seen": 28639488, + "step": 49340 + }, + { + "epoch": 7.349568066726244, + "grad_norm": 0.01318359375, + "learning_rate": 0.023923564204048304, + "loss": 0.7956, + "num_input_tokens_seen": 28642720, + "step": 49345 + }, + { + "epoch": 7.3503127792672025, + "grad_norm": 0.0225830078125, + "learning_rate": 0.023921997004418077, + "loss": 0.7905, + "num_input_tokens_seen": 28645504, + "step": 49350 + }, + { + "epoch": 7.351057491808162, + "grad_norm": 0.027099609375, + "learning_rate": 0.02392042965405978, + "loss": 0.8073, + "num_input_tokens_seen": 28648544, + "step": 49355 + }, + { + "epoch": 7.351802204349121, + "grad_norm": 0.0279541015625, + "learning_rate": 0.02391886215299989, + "loss": 0.8032, + "num_input_tokens_seen": 28651744, + "step": 49360 + }, + { + "epoch": 7.3525469168900806, + "grad_norm": 0.0234375, + "learning_rate": 0.023917294501264886, + "loss": 0.7765, + "num_input_tokens_seen": 28654528, + "step": 49365 + }, + { + "epoch": 7.353291629431039, + "grad_norm": 0.01422119140625, + "learning_rate": 0.023915726698881253, + "loss": 0.8044, + "num_input_tokens_seen": 28657312, + "step": 49370 + }, + { + "epoch": 7.354036341971999, + "grad_norm": 0.020263671875, + "learning_rate": 0.02391415874587548, + "loss": 0.8131, + "num_input_tokens_seen": 28660544, + "step": 49375 + }, + { + "epoch": 7.354781054512958, + "grad_norm": 0.0390625, + "learning_rate": 0.02391259064227406, + "loss": 0.8304, + "num_input_tokens_seen": 28663488, + "step": 49380 + }, + { + "epoch": 7.355525767053917, + "grad_norm": 0.0213623046875, + "learning_rate": 0.023911022388103473, + "loss": 0.7888, + "num_input_tokens_seen": 28666304, + "step": 49385 + }, + { + "epoch": 7.356270479594876, + "grad_norm": 0.0240478515625, + "learning_rate": 0.023909453983390224, + "loss": 0.8029, + "num_input_tokens_seen": 28669408, + "step": 49390 + }, + { + "epoch": 7.357015192135836, + "grad_norm": 0.017822265625, + "learning_rate": 0.02390788542816081, + "loss": 0.7706, + "num_input_tokens_seen": 28672160, + "step": 49395 + }, + { + "epoch": 7.3577599046767945, + "grad_norm": 0.0264892578125, + "learning_rate": 0.023906316722441718, + "loss": 0.7856, + "num_input_tokens_seen": 28675008, + "step": 49400 + }, + { + "epoch": 7.358504617217754, + "grad_norm": 0.0279541015625, + "learning_rate": 0.023904747866259454, + "loss": 0.8254, + "num_input_tokens_seen": 28677728, + "step": 49405 + }, + { + "epoch": 7.359249329758713, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02390317885964053, + "loss": 0.8047, + "num_input_tokens_seen": 28680352, + "step": 49410 + }, + { + "epoch": 7.359994042299673, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02390160970261145, + "loss": 0.7892, + "num_input_tokens_seen": 28683264, + "step": 49415 + }, + { + "epoch": 7.360738754840631, + "grad_norm": 0.017822265625, + "learning_rate": 0.02390004039519872, + "loss": 0.7971, + "num_input_tokens_seen": 28685952, + "step": 49420 + }, + { + "epoch": 7.361483467381591, + "grad_norm": 0.0191650390625, + "learning_rate": 0.023898470937428844, + "loss": 0.8007, + "num_input_tokens_seen": 28688864, + "step": 49425 + }, + { + "epoch": 7.36222817992255, + "grad_norm": 0.016357421875, + "learning_rate": 0.023896901329328358, + "loss": 0.8159, + "num_input_tokens_seen": 28691776, + "step": 49430 + }, + { + "epoch": 7.362972892463509, + "grad_norm": 0.034423828125, + "learning_rate": 0.023895331570923762, + "loss": 0.7983, + "num_input_tokens_seen": 28694464, + "step": 49435 + }, + { + "epoch": 7.363717605004468, + "grad_norm": 0.021728515625, + "learning_rate": 0.02389376166224158, + "loss": 0.7869, + "num_input_tokens_seen": 28697440, + "step": 49440 + }, + { + "epoch": 7.364462317545428, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02389219160330833, + "loss": 0.8146, + "num_input_tokens_seen": 28700352, + "step": 49445 + }, + { + "epoch": 7.3652070300863866, + "grad_norm": 0.0306396484375, + "learning_rate": 0.023890621394150546, + "loss": 0.7984, + "num_input_tokens_seen": 28703552, + "step": 49450 + }, + { + "epoch": 7.365951742627346, + "grad_norm": 0.0277099609375, + "learning_rate": 0.02388905103479475, + "loss": 0.7933, + "num_input_tokens_seen": 28706496, + "step": 49455 + }, + { + "epoch": 7.366696455168305, + "grad_norm": 0.034423828125, + "learning_rate": 0.023887480525267464, + "loss": 0.8105, + "num_input_tokens_seen": 28709312, + "step": 49460 + }, + { + "epoch": 7.367441167709265, + "grad_norm": 0.0322265625, + "learning_rate": 0.023885909865595235, + "loss": 0.8114, + "num_input_tokens_seen": 28712224, + "step": 49465 + }, + { + "epoch": 7.368185880250223, + "grad_norm": 0.0196533203125, + "learning_rate": 0.023884339055804584, + "loss": 0.8099, + "num_input_tokens_seen": 28714880, + "step": 49470 + }, + { + "epoch": 7.368930592791183, + "grad_norm": 0.028076171875, + "learning_rate": 0.02388276809592206, + "loss": 0.8026, + "num_input_tokens_seen": 28717952, + "step": 49475 + }, + { + "epoch": 7.369675305332142, + "grad_norm": 0.029052734375, + "learning_rate": 0.02388119698597419, + "loss": 0.8141, + "num_input_tokens_seen": 28720576, + "step": 49480 + }, + { + "epoch": 7.370420017873101, + "grad_norm": 0.018798828125, + "learning_rate": 0.023879625725987532, + "loss": 0.8016, + "num_input_tokens_seen": 28723488, + "step": 49485 + }, + { + "epoch": 7.37116473041406, + "grad_norm": 0.0152587890625, + "learning_rate": 0.02387805431598862, + "loss": 0.8117, + "num_input_tokens_seen": 28726176, + "step": 49490 + }, + { + "epoch": 7.371909442955019, + "grad_norm": 0.0224609375, + "learning_rate": 0.023876482756004, + "loss": 0.8287, + "num_input_tokens_seen": 28728992, + "step": 49495 + }, + { + "epoch": 7.372654155495979, + "grad_norm": 0.0206298828125, + "learning_rate": 0.023874911046060227, + "loss": 0.7999, + "num_input_tokens_seen": 28732032, + "step": 49500 + }, + { + "epoch": 7.373398868036937, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02387333918618385, + "loss": 0.8039, + "num_input_tokens_seen": 28734944, + "step": 49505 + }, + { + "epoch": 7.374143580577897, + "grad_norm": 0.0281982421875, + "learning_rate": 0.023871767176401427, + "loss": 0.7979, + "num_input_tokens_seen": 28737920, + "step": 49510 + }, + { + "epoch": 7.374888293118856, + "grad_norm": 0.018310546875, + "learning_rate": 0.02387019501673952, + "loss": 0.7841, + "num_input_tokens_seen": 28740896, + "step": 49515 + }, + { + "epoch": 7.375633005659815, + "grad_norm": 0.02099609375, + "learning_rate": 0.02386862270722468, + "loss": 0.8099, + "num_input_tokens_seen": 28743616, + "step": 49520 + }, + { + "epoch": 7.376377718200774, + "grad_norm": 0.01904296875, + "learning_rate": 0.023867050247883468, + "loss": 0.7997, + "num_input_tokens_seen": 28746560, + "step": 49525 + }, + { + "epoch": 7.377122430741734, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02386547763874246, + "loss": 0.8006, + "num_input_tokens_seen": 28749504, + "step": 49530 + }, + { + "epoch": 7.3778671432826926, + "grad_norm": 0.0247802734375, + "learning_rate": 0.023863904879828216, + "loss": 0.7919, + "num_input_tokens_seen": 28752128, + "step": 49535 + }, + { + "epoch": 7.378611855823652, + "grad_norm": 0.030517578125, + "learning_rate": 0.023862331971167303, + "loss": 0.8137, + "num_input_tokens_seen": 28755328, + "step": 49540 + }, + { + "epoch": 7.379356568364611, + "grad_norm": 0.0242919921875, + "learning_rate": 0.023860758912786308, + "loss": 0.803, + "num_input_tokens_seen": 28758240, + "step": 49545 + }, + { + "epoch": 7.380101280905571, + "grad_norm": 0.033203125, + "learning_rate": 0.02385918570471179, + "loss": 0.791, + "num_input_tokens_seen": 28761088, + "step": 49550 + }, + { + "epoch": 7.380845993446529, + "grad_norm": 0.020263671875, + "learning_rate": 0.023857612346970338, + "loss": 0.818, + "num_input_tokens_seen": 28763904, + "step": 49555 + }, + { + "epoch": 7.381590705987489, + "grad_norm": 0.028564453125, + "learning_rate": 0.023856038839588527, + "loss": 0.7979, + "num_input_tokens_seen": 28766656, + "step": 49560 + }, + { + "epoch": 7.382335418528448, + "grad_norm": 0.0225830078125, + "learning_rate": 0.023854465182592936, + "loss": 0.785, + "num_input_tokens_seen": 28769280, + "step": 49565 + }, + { + "epoch": 7.383080131069407, + "grad_norm": 0.027099609375, + "learning_rate": 0.02385289137601016, + "loss": 0.8, + "num_input_tokens_seen": 28771872, + "step": 49570 + }, + { + "epoch": 7.383824843610366, + "grad_norm": 0.0167236328125, + "learning_rate": 0.02385131741986678, + "loss": 0.8143, + "num_input_tokens_seen": 28774464, + "step": 49575 + }, + { + "epoch": 7.384569556151326, + "grad_norm": 0.05078125, + "learning_rate": 0.02384974331418939, + "loss": 0.8225, + "num_input_tokens_seen": 28777280, + "step": 49580 + }, + { + "epoch": 7.385314268692285, + "grad_norm": 0.07421875, + "learning_rate": 0.023848169059004578, + "loss": 0.8093, + "num_input_tokens_seen": 28780352, + "step": 49585 + }, + { + "epoch": 7.386058981233244, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02384659465433894, + "loss": 0.8041, + "num_input_tokens_seen": 28783328, + "step": 49590 + }, + { + "epoch": 7.386803693774203, + "grad_norm": 0.0206298828125, + "learning_rate": 0.023845020100219088, + "loss": 0.812, + "num_input_tokens_seen": 28786208, + "step": 49595 + }, + { + "epoch": 7.387548406315163, + "grad_norm": 0.022216796875, + "learning_rate": 0.023843445396671603, + "loss": 0.7896, + "num_input_tokens_seen": 28789088, + "step": 49600 + }, + { + "epoch": 7.388293118856121, + "grad_norm": 0.044189453125, + "learning_rate": 0.023841870543723092, + "loss": 0.8013, + "num_input_tokens_seen": 28792064, + "step": 49605 + }, + { + "epoch": 7.389037831397081, + "grad_norm": 0.029052734375, + "learning_rate": 0.023840295541400172, + "loss": 0.809, + "num_input_tokens_seen": 28794912, + "step": 49610 + }, + { + "epoch": 7.38978254393804, + "grad_norm": 0.022705078125, + "learning_rate": 0.02383872038972944, + "loss": 0.8082, + "num_input_tokens_seen": 28798016, + "step": 49615 + }, + { + "epoch": 7.390527256478999, + "grad_norm": 0.0223388671875, + "learning_rate": 0.023837145088737514, + "loss": 0.7958, + "num_input_tokens_seen": 28800960, + "step": 49620 + }, + { + "epoch": 7.391271969019958, + "grad_norm": 0.01483154296875, + "learning_rate": 0.023835569638451, + "loss": 0.805, + "num_input_tokens_seen": 28803872, + "step": 49625 + }, + { + "epoch": 7.392016681560918, + "grad_norm": 0.029541015625, + "learning_rate": 0.02383399403889652, + "loss": 0.801, + "num_input_tokens_seen": 28806976, + "step": 49630 + }, + { + "epoch": 7.392761394101877, + "grad_norm": 0.0308837890625, + "learning_rate": 0.023832418290100688, + "loss": 0.8072, + "num_input_tokens_seen": 28809856, + "step": 49635 + }, + { + "epoch": 7.393506106642836, + "grad_norm": 0.0269775390625, + "learning_rate": 0.023830842392090122, + "loss": 0.8041, + "num_input_tokens_seen": 28812672, + "step": 49640 + }, + { + "epoch": 7.394250819183795, + "grad_norm": 0.022705078125, + "learning_rate": 0.023829266344891454, + "loss": 0.805, + "num_input_tokens_seen": 28815584, + "step": 49645 + }, + { + "epoch": 7.394995531724755, + "grad_norm": 0.0244140625, + "learning_rate": 0.023827690148531304, + "loss": 0.8054, + "num_input_tokens_seen": 28818656, + "step": 49650 + }, + { + "epoch": 7.395740244265713, + "grad_norm": 0.01318359375, + "learning_rate": 0.023826113803036297, + "loss": 0.8133, + "num_input_tokens_seen": 28821280, + "step": 49655 + }, + { + "epoch": 7.396484956806672, + "grad_norm": 0.024658203125, + "learning_rate": 0.023824537308433074, + "loss": 0.7955, + "num_input_tokens_seen": 28824192, + "step": 49660 + }, + { + "epoch": 7.397229669347632, + "grad_norm": 0.02197265625, + "learning_rate": 0.023822960664748257, + "loss": 0.7991, + "num_input_tokens_seen": 28827104, + "step": 49665 + }, + { + "epoch": 7.3979743818885915, + "grad_norm": 0.0201416015625, + "learning_rate": 0.023821383872008488, + "loss": 0.7924, + "num_input_tokens_seen": 28829920, + "step": 49670 + }, + { + "epoch": 7.39871909442955, + "grad_norm": 0.0255126953125, + "learning_rate": 0.023819806930240408, + "loss": 0.812, + "num_input_tokens_seen": 28832704, + "step": 49675 + }, + { + "epoch": 7.399463806970509, + "grad_norm": 0.01409912109375, + "learning_rate": 0.02381822983947065, + "loss": 0.8063, + "num_input_tokens_seen": 28835616, + "step": 49680 + }, + { + "epoch": 7.400208519511469, + "grad_norm": 0.0135498046875, + "learning_rate": 0.02381665259972586, + "loss": 0.7918, + "num_input_tokens_seen": 28838560, + "step": 49685 + }, + { + "epoch": 7.400953232052427, + "grad_norm": 0.021484375, + "learning_rate": 0.02381507521103269, + "loss": 0.8058, + "num_input_tokens_seen": 28841344, + "step": 49690 + }, + { + "epoch": 7.401697944593387, + "grad_norm": 0.023193359375, + "learning_rate": 0.023813497673417776, + "loss": 0.8237, + "num_input_tokens_seen": 28844000, + "step": 49695 + }, + { + "epoch": 7.402442657134346, + "grad_norm": 0.015380859375, + "learning_rate": 0.023811919986907782, + "loss": 0.8105, + "num_input_tokens_seen": 28847008, + "step": 49700 + }, + { + "epoch": 7.403187369675305, + "grad_norm": 0.021484375, + "learning_rate": 0.023810342151529357, + "loss": 0.7946, + "num_input_tokens_seen": 28849760, + "step": 49705 + }, + { + "epoch": 7.403932082216264, + "grad_norm": 0.023193359375, + "learning_rate": 0.023808764167309152, + "loss": 0.7949, + "num_input_tokens_seen": 28852896, + "step": 49710 + }, + { + "epoch": 7.404676794757224, + "grad_norm": 0.0263671875, + "learning_rate": 0.023807186034273835, + "loss": 0.7943, + "num_input_tokens_seen": 28856096, + "step": 49715 + }, + { + "epoch": 7.405421507298183, + "grad_norm": 0.0281982421875, + "learning_rate": 0.023805607752450053, + "loss": 0.8122, + "num_input_tokens_seen": 28859136, + "step": 49720 + }, + { + "epoch": 7.406166219839142, + "grad_norm": 0.0245361328125, + "learning_rate": 0.023804029321864484, + "loss": 0.7863, + "num_input_tokens_seen": 28861984, + "step": 49725 + }, + { + "epoch": 7.406910932380101, + "grad_norm": 0.0184326171875, + "learning_rate": 0.023802450742543786, + "loss": 0.8086, + "num_input_tokens_seen": 28865088, + "step": 49730 + }, + { + "epoch": 7.407655644921061, + "grad_norm": 0.032958984375, + "learning_rate": 0.023800872014514626, + "loss": 0.7982, + "num_input_tokens_seen": 28868064, + "step": 49735 + }, + { + "epoch": 7.408400357462019, + "grad_norm": 0.02294921875, + "learning_rate": 0.02379929313780368, + "loss": 0.8233, + "num_input_tokens_seen": 28870976, + "step": 49740 + }, + { + "epoch": 7.409145070002979, + "grad_norm": 0.0224609375, + "learning_rate": 0.023797714112437623, + "loss": 0.7876, + "num_input_tokens_seen": 28873568, + "step": 49745 + }, + { + "epoch": 7.409889782543938, + "grad_norm": 0.0250244140625, + "learning_rate": 0.023796134938443122, + "loss": 0.7946, + "num_input_tokens_seen": 28876576, + "step": 49750 + }, + { + "epoch": 7.4106344950848975, + "grad_norm": 0.019775390625, + "learning_rate": 0.02379455561584687, + "loss": 0.7935, + "num_input_tokens_seen": 28879360, + "step": 49755 + }, + { + "epoch": 7.411379207625856, + "grad_norm": 0.0294189453125, + "learning_rate": 0.023792976144675532, + "loss": 0.7943, + "num_input_tokens_seen": 28882336, + "step": 49760 + }, + { + "epoch": 7.412123920166816, + "grad_norm": 0.0213623046875, + "learning_rate": 0.0237913965249558, + "loss": 0.7883, + "num_input_tokens_seen": 28884992, + "step": 49765 + }, + { + "epoch": 7.412868632707775, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02378981675671436, + "loss": 0.8177, + "num_input_tokens_seen": 28887840, + "step": 49770 + }, + { + "epoch": 7.413613345248734, + "grad_norm": 0.03173828125, + "learning_rate": 0.023788236839977894, + "loss": 0.8131, + "num_input_tokens_seen": 28890848, + "step": 49775 + }, + { + "epoch": 7.414358057789693, + "grad_norm": 0.02978515625, + "learning_rate": 0.023786656774773103, + "loss": 0.7937, + "num_input_tokens_seen": 28893728, + "step": 49780 + }, + { + "epoch": 7.415102770330653, + "grad_norm": 0.0252685546875, + "learning_rate": 0.023785076561126675, + "loss": 0.7903, + "num_input_tokens_seen": 28896704, + "step": 49785 + }, + { + "epoch": 7.415847482871611, + "grad_norm": 0.025634765625, + "learning_rate": 0.02378349619906531, + "loss": 0.8078, + "num_input_tokens_seen": 28899712, + "step": 49790 + }, + { + "epoch": 7.416592195412571, + "grad_norm": 0.03173828125, + "learning_rate": 0.023781915688615703, + "loss": 0.8133, + "num_input_tokens_seen": 28902592, + "step": 49795 + }, + { + "epoch": 7.41733690795353, + "grad_norm": 0.0205078125, + "learning_rate": 0.023780335029804558, + "loss": 0.8201, + "num_input_tokens_seen": 28905440, + "step": 49800 + }, + { + "epoch": 7.4180816204944895, + "grad_norm": 0.0257568359375, + "learning_rate": 0.02377875422265857, + "loss": 0.8063, + "num_input_tokens_seen": 28908160, + "step": 49805 + }, + { + "epoch": 7.418826333035448, + "grad_norm": 0.0257568359375, + "learning_rate": 0.02377717326720446, + "loss": 0.8067, + "num_input_tokens_seen": 28910848, + "step": 49810 + }, + { + "epoch": 7.419571045576408, + "grad_norm": 0.026611328125, + "learning_rate": 0.023775592163468924, + "loss": 0.8152, + "num_input_tokens_seen": 28913664, + "step": 49815 + }, + { + "epoch": 7.420315758117367, + "grad_norm": 0.041748046875, + "learning_rate": 0.023774010911478675, + "loss": 0.7967, + "num_input_tokens_seen": 28916512, + "step": 49820 + }, + { + "epoch": 7.421060470658326, + "grad_norm": 0.038818359375, + "learning_rate": 0.02377242951126043, + "loss": 0.8117, + "num_input_tokens_seen": 28919648, + "step": 49825 + }, + { + "epoch": 7.421805183199285, + "grad_norm": 0.0264892578125, + "learning_rate": 0.02377084796284091, + "loss": 0.8007, + "num_input_tokens_seen": 28922656, + "step": 49830 + }, + { + "epoch": 7.422549895740245, + "grad_norm": 0.0244140625, + "learning_rate": 0.023769266266246824, + "loss": 0.8064, + "num_input_tokens_seen": 28925376, + "step": 49835 + }, + { + "epoch": 7.4232946082812035, + "grad_norm": 0.022216796875, + "learning_rate": 0.023767684421504898, + "loss": 0.8033, + "num_input_tokens_seen": 28928384, + "step": 49840 + }, + { + "epoch": 7.424039320822162, + "grad_norm": 0.016845703125, + "learning_rate": 0.023766102428641855, + "loss": 0.7887, + "num_input_tokens_seen": 28931328, + "step": 49845 + }, + { + "epoch": 7.424784033363122, + "grad_norm": 0.036865234375, + "learning_rate": 0.02376452028768442, + "loss": 0.8062, + "num_input_tokens_seen": 28934272, + "step": 49850 + }, + { + "epoch": 7.425528745904081, + "grad_norm": 0.02587890625, + "learning_rate": 0.02376293799865932, + "loss": 0.8114, + "num_input_tokens_seen": 28937056, + "step": 49855 + }, + { + "epoch": 7.42627345844504, + "grad_norm": 0.0303955078125, + "learning_rate": 0.02376135556159329, + "loss": 0.8084, + "num_input_tokens_seen": 28940032, + "step": 49860 + }, + { + "epoch": 7.427018170985999, + "grad_norm": 0.0308837890625, + "learning_rate": 0.023759772976513063, + "loss": 0.8116, + "num_input_tokens_seen": 28942944, + "step": 49865 + }, + { + "epoch": 7.427762883526959, + "grad_norm": 0.0322265625, + "learning_rate": 0.023758190243445377, + "loss": 0.7927, + "num_input_tokens_seen": 28946080, + "step": 49870 + }, + { + "epoch": 7.428507596067917, + "grad_norm": 0.03173828125, + "learning_rate": 0.023756607362416967, + "loss": 0.8084, + "num_input_tokens_seen": 28948896, + "step": 49875 + }, + { + "epoch": 7.429252308608877, + "grad_norm": 0.0322265625, + "learning_rate": 0.023755024333454576, + "loss": 0.8124, + "num_input_tokens_seen": 28951488, + "step": 49880 + }, + { + "epoch": 7.429997021149836, + "grad_norm": 0.0233154296875, + "learning_rate": 0.023753441156584947, + "loss": 0.7945, + "num_input_tokens_seen": 28954432, + "step": 49885 + }, + { + "epoch": 7.4307417336907955, + "grad_norm": 0.03564453125, + "learning_rate": 0.023751857831834824, + "loss": 0.8195, + "num_input_tokens_seen": 28957408, + "step": 49890 + }, + { + "epoch": 7.431486446231754, + "grad_norm": 0.019775390625, + "learning_rate": 0.023750274359230963, + "loss": 0.8248, + "num_input_tokens_seen": 28960160, + "step": 49895 + }, + { + "epoch": 7.432231158772714, + "grad_norm": 0.026611328125, + "learning_rate": 0.023748690738800104, + "loss": 0.7992, + "num_input_tokens_seen": 28963040, + "step": 49900 + }, + { + "epoch": 7.432975871313673, + "grad_norm": 0.0159912109375, + "learning_rate": 0.023747106970569008, + "loss": 0.7924, + "num_input_tokens_seen": 28965856, + "step": 49905 + }, + { + "epoch": 7.433720583854632, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02374552305456443, + "loss": 0.8058, + "num_input_tokens_seen": 28968800, + "step": 49910 + }, + { + "epoch": 7.434465296395591, + "grad_norm": 0.031982421875, + "learning_rate": 0.023743938990813132, + "loss": 0.8007, + "num_input_tokens_seen": 28972000, + "step": 49915 + }, + { + "epoch": 7.435210008936551, + "grad_norm": 0.0252685546875, + "learning_rate": 0.023742354779341866, + "loss": 0.8003, + "num_input_tokens_seen": 28974688, + "step": 49920 + }, + { + "epoch": 7.4359547214775095, + "grad_norm": 0.0150146484375, + "learning_rate": 0.023740770420177408, + "loss": 0.7994, + "num_input_tokens_seen": 28977504, + "step": 49925 + }, + { + "epoch": 7.436699434018469, + "grad_norm": 0.023193359375, + "learning_rate": 0.023739185913346512, + "loss": 0.8023, + "num_input_tokens_seen": 28980672, + "step": 49930 + }, + { + "epoch": 7.437444146559428, + "grad_norm": 0.04736328125, + "learning_rate": 0.023737601258875956, + "loss": 0.8081, + "num_input_tokens_seen": 28984160, + "step": 49935 + }, + { + "epoch": 7.4381888591003875, + "grad_norm": 0.02294921875, + "learning_rate": 0.023736016456792505, + "loss": 0.797, + "num_input_tokens_seen": 28987200, + "step": 49940 + }, + { + "epoch": 7.438933571641346, + "grad_norm": 0.034912109375, + "learning_rate": 0.023734431507122938, + "loss": 0.7916, + "num_input_tokens_seen": 28989984, + "step": 49945 + }, + { + "epoch": 7.439678284182306, + "grad_norm": 0.0167236328125, + "learning_rate": 0.023732846409894023, + "loss": 0.8126, + "num_input_tokens_seen": 28992928, + "step": 49950 + }, + { + "epoch": 7.440422996723265, + "grad_norm": 0.03466796875, + "learning_rate": 0.023731261165132543, + "loss": 0.802, + "num_input_tokens_seen": 28995808, + "step": 49955 + }, + { + "epoch": 7.441167709264224, + "grad_norm": 0.038330078125, + "learning_rate": 0.023729675772865284, + "loss": 0.7878, + "num_input_tokens_seen": 28998848, + "step": 49960 + }, + { + "epoch": 7.441912421805183, + "grad_norm": 0.0252685546875, + "learning_rate": 0.023728090233119028, + "loss": 0.7993, + "num_input_tokens_seen": 29001536, + "step": 49965 + }, + { + "epoch": 7.442657134346143, + "grad_norm": 0.022216796875, + "learning_rate": 0.02372650454592055, + "loss": 0.8064, + "num_input_tokens_seen": 29004256, + "step": 49970 + }, + { + "epoch": 7.4434018468871015, + "grad_norm": 0.03466796875, + "learning_rate": 0.023724918711296648, + "loss": 0.796, + "num_input_tokens_seen": 29007232, + "step": 49975 + }, + { + "epoch": 7.444146559428061, + "grad_norm": 0.0218505859375, + "learning_rate": 0.023723332729274117, + "loss": 0.7855, + "num_input_tokens_seen": 29010272, + "step": 49980 + }, + { + "epoch": 7.44489127196902, + "grad_norm": 0.031494140625, + "learning_rate": 0.023721746599879745, + "loss": 0.8134, + "num_input_tokens_seen": 29013280, + "step": 49985 + }, + { + "epoch": 7.4456359845099795, + "grad_norm": 0.02197265625, + "learning_rate": 0.023720160323140324, + "loss": 0.8015, + "num_input_tokens_seen": 29016160, + "step": 49990 + }, + { + "epoch": 7.446380697050938, + "grad_norm": 0.01531982421875, + "learning_rate": 0.023718573899082662, + "loss": 0.8122, + "num_input_tokens_seen": 29019072, + "step": 49995 + }, + { + "epoch": 7.447125409591898, + "grad_norm": 0.0206298828125, + "learning_rate": 0.023716987327733554, + "loss": 0.81, + "num_input_tokens_seen": 29021920, + "step": 50000 + }, + { + "epoch": 7.447870122132857, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0237154006091198, + "loss": 0.7989, + "num_input_tokens_seen": 29024832, + "step": 50005 + }, + { + "epoch": 7.4486148346738155, + "grad_norm": 0.0302734375, + "learning_rate": 0.02371381374326822, + "loss": 0.8243, + "num_input_tokens_seen": 29027712, + "step": 50010 + }, + { + "epoch": 7.449359547214775, + "grad_norm": 0.0281982421875, + "learning_rate": 0.02371222673020561, + "loss": 0.8067, + "num_input_tokens_seen": 29030624, + "step": 50015 + }, + { + "epoch": 7.450104259755734, + "grad_norm": 0.0135498046875, + "learning_rate": 0.02371063956995878, + "loss": 0.7918, + "num_input_tokens_seen": 29033824, + "step": 50020 + }, + { + "epoch": 7.4508489722966935, + "grad_norm": 0.025390625, + "learning_rate": 0.023709052262554545, + "loss": 0.7881, + "num_input_tokens_seen": 29036704, + "step": 50025 + }, + { + "epoch": 7.451593684837652, + "grad_norm": 0.02294921875, + "learning_rate": 0.02370746480801973, + "loss": 0.7918, + "num_input_tokens_seen": 29039712, + "step": 50030 + }, + { + "epoch": 7.452338397378612, + "grad_norm": 0.028564453125, + "learning_rate": 0.023705877206381145, + "loss": 0.7848, + "num_input_tokens_seen": 29042688, + "step": 50035 + }, + { + "epoch": 7.453083109919571, + "grad_norm": 0.022705078125, + "learning_rate": 0.023704289457665617, + "loss": 0.8058, + "num_input_tokens_seen": 29045344, + "step": 50040 + }, + { + "epoch": 7.45382782246053, + "grad_norm": 0.027587890625, + "learning_rate": 0.023702701561899957, + "loss": 0.8302, + "num_input_tokens_seen": 29048160, + "step": 50045 + }, + { + "epoch": 7.454572535001489, + "grad_norm": 0.0172119140625, + "learning_rate": 0.023701113519111004, + "loss": 0.7984, + "num_input_tokens_seen": 29050752, + "step": 50050 + }, + { + "epoch": 7.455317247542449, + "grad_norm": 0.0233154296875, + "learning_rate": 0.02369952532932558, + "loss": 0.8238, + "num_input_tokens_seen": 29053792, + "step": 50055 + }, + { + "epoch": 7.4560619600834075, + "grad_norm": 0.0252685546875, + "learning_rate": 0.02369793699257052, + "loss": 0.8087, + "num_input_tokens_seen": 29056576, + "step": 50060 + }, + { + "epoch": 7.456806672624367, + "grad_norm": 0.0225830078125, + "learning_rate": 0.02369634850887265, + "loss": 0.8046, + "num_input_tokens_seen": 29059328, + "step": 50065 + }, + { + "epoch": 7.457551385165326, + "grad_norm": 0.0244140625, + "learning_rate": 0.023694759878258812, + "loss": 0.8105, + "num_input_tokens_seen": 29062240, + "step": 50070 + }, + { + "epoch": 7.4582960977062855, + "grad_norm": 0.0269775390625, + "learning_rate": 0.023693171100755844, + "loss": 0.799, + "num_input_tokens_seen": 29064928, + "step": 50075 + }, + { + "epoch": 7.459040810247244, + "grad_norm": 0.02392578125, + "learning_rate": 0.02369158217639059, + "loss": 0.8089, + "num_input_tokens_seen": 29067840, + "step": 50080 + }, + { + "epoch": 7.459785522788204, + "grad_norm": 0.022216796875, + "learning_rate": 0.023689993105189884, + "loss": 0.807, + "num_input_tokens_seen": 29070528, + "step": 50085 + }, + { + "epoch": 7.460530235329163, + "grad_norm": 0.0218505859375, + "learning_rate": 0.023688403887180576, + "loss": 0.7911, + "num_input_tokens_seen": 29073280, + "step": 50090 + }, + { + "epoch": 7.461274947870122, + "grad_norm": 0.033935546875, + "learning_rate": 0.023686814522389518, + "loss": 0.8004, + "num_input_tokens_seen": 29076128, + "step": 50095 + }, + { + "epoch": 7.462019660411081, + "grad_norm": 0.019287109375, + "learning_rate": 0.023685225010843552, + "loss": 0.8006, + "num_input_tokens_seen": 29079392, + "step": 50100 + }, + { + "epoch": 7.462764372952041, + "grad_norm": 0.03173828125, + "learning_rate": 0.02368363535256954, + "loss": 0.8073, + "num_input_tokens_seen": 29082272, + "step": 50105 + }, + { + "epoch": 7.4635090854929995, + "grad_norm": 0.036865234375, + "learning_rate": 0.023682045547594337, + "loss": 0.8079, + "num_input_tokens_seen": 29085056, + "step": 50110 + }, + { + "epoch": 7.464253798033959, + "grad_norm": 0.0361328125, + "learning_rate": 0.023680455595944793, + "loss": 0.7911, + "num_input_tokens_seen": 29088032, + "step": 50115 + }, + { + "epoch": 7.464998510574918, + "grad_norm": 0.0242919921875, + "learning_rate": 0.02367886549764778, + "loss": 0.7973, + "num_input_tokens_seen": 29090880, + "step": 50120 + }, + { + "epoch": 7.465743223115878, + "grad_norm": 0.04736328125, + "learning_rate": 0.02367727525273015, + "loss": 0.7976, + "num_input_tokens_seen": 29093824, + "step": 50125 + }, + { + "epoch": 7.466487935656836, + "grad_norm": 0.03857421875, + "learning_rate": 0.02367568486121878, + "loss": 0.7899, + "num_input_tokens_seen": 29096736, + "step": 50130 + }, + { + "epoch": 7.467232648197796, + "grad_norm": 0.04296875, + "learning_rate": 0.023674094323140528, + "loss": 0.7886, + "num_input_tokens_seen": 29099776, + "step": 50135 + }, + { + "epoch": 7.467977360738755, + "grad_norm": 0.10302734375, + "learning_rate": 0.023672503638522264, + "loss": 0.8028, + "num_input_tokens_seen": 29103008, + "step": 50140 + }, + { + "epoch": 7.468722073279714, + "grad_norm": 0.051513671875, + "learning_rate": 0.02367091280739087, + "loss": 0.7853, + "num_input_tokens_seen": 29106144, + "step": 50145 + }, + { + "epoch": 7.469466785820673, + "grad_norm": 0.04248046875, + "learning_rate": 0.02366932182977322, + "loss": 0.7698, + "num_input_tokens_seen": 29108800, + "step": 50150 + }, + { + "epoch": 7.470211498361633, + "grad_norm": 0.1318359375, + "learning_rate": 0.02366773070569618, + "loss": 0.814, + "num_input_tokens_seen": 29111872, + "step": 50155 + }, + { + "epoch": 7.4709562109025915, + "grad_norm": 0.034423828125, + "learning_rate": 0.023666139435186646, + "loss": 0.8459, + "num_input_tokens_seen": 29114688, + "step": 50160 + }, + { + "epoch": 7.471700923443551, + "grad_norm": 0.030029296875, + "learning_rate": 0.023664548018271496, + "loss": 0.7979, + "num_input_tokens_seen": 29117856, + "step": 50165 + }, + { + "epoch": 7.47244563598451, + "grad_norm": 0.042724609375, + "learning_rate": 0.02366295645497761, + "loss": 0.8093, + "num_input_tokens_seen": 29120640, + "step": 50170 + }, + { + "epoch": 7.473190348525469, + "grad_norm": 0.0211181640625, + "learning_rate": 0.02366136474533188, + "loss": 0.8203, + "num_input_tokens_seen": 29123712, + "step": 50175 + }, + { + "epoch": 7.473935061066428, + "grad_norm": 0.031982421875, + "learning_rate": 0.023659772889361195, + "loss": 0.7945, + "num_input_tokens_seen": 29126656, + "step": 50180 + }, + { + "epoch": 7.474679773607388, + "grad_norm": 0.025390625, + "learning_rate": 0.02365818088709245, + "loss": 0.7947, + "num_input_tokens_seen": 29129344, + "step": 50185 + }, + { + "epoch": 7.475424486148347, + "grad_norm": 0.039794921875, + "learning_rate": 0.023656588738552534, + "loss": 0.8341, + "num_input_tokens_seen": 29131968, + "step": 50190 + }, + { + "epoch": 7.4761691986893055, + "grad_norm": 0.0255126953125, + "learning_rate": 0.023654996443768353, + "loss": 0.8421, + "num_input_tokens_seen": 29134880, + "step": 50195 + }, + { + "epoch": 7.476913911230265, + "grad_norm": 0.024169921875, + "learning_rate": 0.023653404002766807, + "loss": 0.7963, + "num_input_tokens_seen": 29137664, + "step": 50200 + }, + { + "epoch": 7.477658623771224, + "grad_norm": 0.0277099609375, + "learning_rate": 0.023651811415574788, + "loss": 0.8241, + "num_input_tokens_seen": 29140416, + "step": 50205 + }, + { + "epoch": 7.478403336312184, + "grad_norm": 0.018310546875, + "learning_rate": 0.023650218682219212, + "loss": 0.7906, + "num_input_tokens_seen": 29143648, + "step": 50210 + }, + { + "epoch": 7.479148048853142, + "grad_norm": 0.033447265625, + "learning_rate": 0.023648625802726985, + "loss": 0.7988, + "num_input_tokens_seen": 29146464, + "step": 50215 + }, + { + "epoch": 7.479892761394102, + "grad_norm": 0.0220947265625, + "learning_rate": 0.023647032777125012, + "loss": 0.7906, + "num_input_tokens_seen": 29148896, + "step": 50220 + }, + { + "epoch": 7.480637473935061, + "grad_norm": 0.0224609375, + "learning_rate": 0.02364543960544021, + "loss": 0.7894, + "num_input_tokens_seen": 29151840, + "step": 50225 + }, + { + "epoch": 7.48138218647602, + "grad_norm": 0.0272216796875, + "learning_rate": 0.023643846287699492, + "loss": 0.7769, + "num_input_tokens_seen": 29154720, + "step": 50230 + }, + { + "epoch": 7.482126899016979, + "grad_norm": 0.0291748046875, + "learning_rate": 0.023642252823929778, + "loss": 0.7829, + "num_input_tokens_seen": 29157504, + "step": 50235 + }, + { + "epoch": 7.482871611557939, + "grad_norm": 0.02734375, + "learning_rate": 0.023640659214157988, + "loss": 0.7871, + "num_input_tokens_seen": 29160480, + "step": 50240 + }, + { + "epoch": 7.4836163240988975, + "grad_norm": 0.03662109375, + "learning_rate": 0.02363906545841104, + "loss": 0.8022, + "num_input_tokens_seen": 29163552, + "step": 50245 + }, + { + "epoch": 7.484361036639857, + "grad_norm": 0.024658203125, + "learning_rate": 0.023637471556715858, + "loss": 0.7919, + "num_input_tokens_seen": 29166336, + "step": 50250 + }, + { + "epoch": 7.485105749180816, + "grad_norm": 0.045166015625, + "learning_rate": 0.023635877509099377, + "loss": 0.7731, + "num_input_tokens_seen": 29169216, + "step": 50255 + }, + { + "epoch": 7.485850461721776, + "grad_norm": 0.047119140625, + "learning_rate": 0.02363428331558852, + "loss": 0.8377, + "num_input_tokens_seen": 29172064, + "step": 50260 + }, + { + "epoch": 7.486595174262734, + "grad_norm": 0.035888671875, + "learning_rate": 0.023632688976210226, + "loss": 0.7607, + "num_input_tokens_seen": 29174848, + "step": 50265 + }, + { + "epoch": 7.487339886803694, + "grad_norm": 0.03173828125, + "learning_rate": 0.02363109449099142, + "loss": 0.8031, + "num_input_tokens_seen": 29178176, + "step": 50270 + }, + { + "epoch": 7.488084599344653, + "grad_norm": 0.0296630859375, + "learning_rate": 0.023629499859959047, + "loss": 0.8074, + "num_input_tokens_seen": 29181184, + "step": 50275 + }, + { + "epoch": 7.488829311885612, + "grad_norm": 0.050537109375, + "learning_rate": 0.023627905083140048, + "loss": 0.8004, + "num_input_tokens_seen": 29184256, + "step": 50280 + }, + { + "epoch": 7.489574024426571, + "grad_norm": 0.03173828125, + "learning_rate": 0.023626310160561357, + "loss": 0.8063, + "num_input_tokens_seen": 29187296, + "step": 50285 + }, + { + "epoch": 7.490318736967531, + "grad_norm": 0.0272216796875, + "learning_rate": 0.023624715092249924, + "loss": 0.7452, + "num_input_tokens_seen": 29190368, + "step": 50290 + }, + { + "epoch": 7.49106344950849, + "grad_norm": 0.03173828125, + "learning_rate": 0.023623119878232696, + "loss": 0.754, + "num_input_tokens_seen": 29193472, + "step": 50295 + }, + { + "epoch": 7.491808162049449, + "grad_norm": 0.0245361328125, + "learning_rate": 0.023621524518536625, + "loss": 0.8023, + "num_input_tokens_seen": 29196352, + "step": 50300 + }, + { + "epoch": 7.492552874590408, + "grad_norm": 0.040283203125, + "learning_rate": 0.023619929013188653, + "loss": 0.8283, + "num_input_tokens_seen": 29199392, + "step": 50305 + }, + { + "epoch": 7.493297587131368, + "grad_norm": 0.02001953125, + "learning_rate": 0.02361833336221575, + "loss": 0.7642, + "num_input_tokens_seen": 29202272, + "step": 50310 + }, + { + "epoch": 7.494042299672326, + "grad_norm": 0.028076171875, + "learning_rate": 0.023616737565644856, + "loss": 0.7901, + "num_input_tokens_seen": 29204960, + "step": 50315 + }, + { + "epoch": 7.494787012213286, + "grad_norm": 0.0264892578125, + "learning_rate": 0.023615141623502943, + "loss": 0.7836, + "num_input_tokens_seen": 29207808, + "step": 50320 + }, + { + "epoch": 7.495531724754245, + "grad_norm": 0.03076171875, + "learning_rate": 0.023613545535816967, + "loss": 0.7979, + "num_input_tokens_seen": 29210592, + "step": 50325 + }, + { + "epoch": 7.496276437295204, + "grad_norm": 0.05517578125, + "learning_rate": 0.023611949302613892, + "loss": 0.8711, + "num_input_tokens_seen": 29213536, + "step": 50330 + }, + { + "epoch": 7.497021149836163, + "grad_norm": 0.031005859375, + "learning_rate": 0.02361035292392069, + "loss": 0.8736, + "num_input_tokens_seen": 29216704, + "step": 50335 + }, + { + "epoch": 7.497765862377123, + "grad_norm": 0.0257568359375, + "learning_rate": 0.023608756399764324, + "loss": 0.8244, + "num_input_tokens_seen": 29219872, + "step": 50340 + }, + { + "epoch": 7.498510574918082, + "grad_norm": 0.0238037109375, + "learning_rate": 0.023607159730171766, + "loss": 0.8111, + "num_input_tokens_seen": 29222720, + "step": 50345 + }, + { + "epoch": 7.499255287459041, + "grad_norm": 0.0260009765625, + "learning_rate": 0.02360556291517, + "loss": 0.8182, + "num_input_tokens_seen": 29225632, + "step": 50350 + }, + { + "epoch": 7.5, + "grad_norm": 0.021240234375, + "learning_rate": 0.023603965954785987, + "loss": 0.7815, + "num_input_tokens_seen": 29228704, + "step": 50355 + }, + { + "epoch": 7.500744712540959, + "grad_norm": 0.033447265625, + "learning_rate": 0.023602368849046717, + "loss": 0.7824, + "num_input_tokens_seen": 29231712, + "step": 50360 + }, + { + "epoch": 7.501489425081918, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02360077159797917, + "loss": 0.7964, + "num_input_tokens_seen": 29234560, + "step": 50365 + }, + { + "epoch": 7.502234137622878, + "grad_norm": 0.019775390625, + "learning_rate": 0.023599174201610326, + "loss": 0.7995, + "num_input_tokens_seen": 29237376, + "step": 50370 + }, + { + "epoch": 7.502978850163837, + "grad_norm": 0.0172119140625, + "learning_rate": 0.023597576659967173, + "loss": 0.8172, + "num_input_tokens_seen": 29240352, + "step": 50375 + }, + { + "epoch": 7.503723562704796, + "grad_norm": 0.029296875, + "learning_rate": 0.023595978973076703, + "loss": 0.8127, + "num_input_tokens_seen": 29243040, + "step": 50380 + }, + { + "epoch": 7.504468275245755, + "grad_norm": 0.02587890625, + "learning_rate": 0.0235943811409659, + "loss": 0.7895, + "num_input_tokens_seen": 29246080, + "step": 50385 + }, + { + "epoch": 7.505212987786714, + "grad_norm": 0.037353515625, + "learning_rate": 0.023592783163661767, + "loss": 0.8412, + "num_input_tokens_seen": 29248992, + "step": 50390 + }, + { + "epoch": 7.505957700327674, + "grad_norm": 0.014892578125, + "learning_rate": 0.023591185041191294, + "loss": 0.8129, + "num_input_tokens_seen": 29251680, + "step": 50395 + }, + { + "epoch": 7.506702412868632, + "grad_norm": 0.0216064453125, + "learning_rate": 0.023589586773581483, + "loss": 0.8159, + "num_input_tokens_seen": 29254528, + "step": 50400 + }, + { + "epoch": 7.507447125409592, + "grad_norm": 0.0213623046875, + "learning_rate": 0.023587988360859336, + "loss": 0.7986, + "num_input_tokens_seen": 29257408, + "step": 50405 + }, + { + "epoch": 7.508191837950551, + "grad_norm": 0.02392578125, + "learning_rate": 0.02358638980305185, + "loss": 0.8078, + "num_input_tokens_seen": 29260192, + "step": 50410 + }, + { + "epoch": 7.50893655049151, + "grad_norm": 0.03369140625, + "learning_rate": 0.023584791100186034, + "loss": 0.8218, + "num_input_tokens_seen": 29263168, + "step": 50415 + }, + { + "epoch": 7.509681263032469, + "grad_norm": 0.0223388671875, + "learning_rate": 0.023583192252288897, + "loss": 0.7918, + "num_input_tokens_seen": 29266208, + "step": 50420 + }, + { + "epoch": 7.510425975573429, + "grad_norm": 0.0150146484375, + "learning_rate": 0.023581593259387457, + "loss": 0.8058, + "num_input_tokens_seen": 29269376, + "step": 50425 + }, + { + "epoch": 7.511170688114388, + "grad_norm": 0.027099609375, + "learning_rate": 0.023579994121508713, + "loss": 0.7987, + "num_input_tokens_seen": 29271968, + "step": 50430 + }, + { + "epoch": 7.511915400655347, + "grad_norm": 0.0419921875, + "learning_rate": 0.02357839483867969, + "loss": 0.8097, + "num_input_tokens_seen": 29275264, + "step": 50435 + }, + { + "epoch": 7.512660113196306, + "grad_norm": 0.024658203125, + "learning_rate": 0.02357679541092741, + "loss": 0.8293, + "num_input_tokens_seen": 29278080, + "step": 50440 + }, + { + "epoch": 7.513404825737266, + "grad_norm": 0.0284423828125, + "learning_rate": 0.023575195838278887, + "loss": 0.7953, + "num_input_tokens_seen": 29280704, + "step": 50445 + }, + { + "epoch": 7.514149538278224, + "grad_norm": 0.0206298828125, + "learning_rate": 0.023573596120761146, + "loss": 0.8099, + "num_input_tokens_seen": 29283424, + "step": 50450 + }, + { + "epoch": 7.514894250819184, + "grad_norm": 0.024658203125, + "learning_rate": 0.02357199625840121, + "loss": 0.8149, + "num_input_tokens_seen": 29286432, + "step": 50455 + }, + { + "epoch": 7.515638963360143, + "grad_norm": 0.025146484375, + "learning_rate": 0.02357039625122611, + "loss": 0.7936, + "num_input_tokens_seen": 29289504, + "step": 50460 + }, + { + "epoch": 7.5163836759011025, + "grad_norm": 0.027099609375, + "learning_rate": 0.02356879609926287, + "loss": 0.8024, + "num_input_tokens_seen": 29292480, + "step": 50465 + }, + { + "epoch": 7.517128388442061, + "grad_norm": 0.02392578125, + "learning_rate": 0.023567195802538538, + "loss": 0.7986, + "num_input_tokens_seen": 29295488, + "step": 50470 + }, + { + "epoch": 7.517873100983021, + "grad_norm": 0.0220947265625, + "learning_rate": 0.023565595361080136, + "loss": 0.8009, + "num_input_tokens_seen": 29298208, + "step": 50475 + }, + { + "epoch": 7.51861781352398, + "grad_norm": 0.02392578125, + "learning_rate": 0.02356399477491471, + "loss": 0.8045, + "num_input_tokens_seen": 29301024, + "step": 50480 + }, + { + "epoch": 7.519362526064939, + "grad_norm": 0.01336669921875, + "learning_rate": 0.023562394044069297, + "loss": 0.7998, + "num_input_tokens_seen": 29303840, + "step": 50485 + }, + { + "epoch": 7.520107238605898, + "grad_norm": 0.0150146484375, + "learning_rate": 0.02356079316857093, + "loss": 0.8168, + "num_input_tokens_seen": 29306784, + "step": 50490 + }, + { + "epoch": 7.520851951146858, + "grad_norm": 0.0211181640625, + "learning_rate": 0.02355919214844667, + "loss": 0.8075, + "num_input_tokens_seen": 29309536, + "step": 50495 + }, + { + "epoch": 7.521596663687816, + "grad_norm": 0.0260009765625, + "learning_rate": 0.02355759098372356, + "loss": 0.8011, + "num_input_tokens_seen": 29312896, + "step": 50500 + }, + { + "epoch": 7.522341376228776, + "grad_norm": 0.0233154296875, + "learning_rate": 0.023555989674428644, + "loss": 0.7938, + "num_input_tokens_seen": 29315840, + "step": 50505 + }, + { + "epoch": 7.523086088769735, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02355438822058898, + "loss": 0.7943, + "num_input_tokens_seen": 29318880, + "step": 50510 + }, + { + "epoch": 7.5238308013106945, + "grad_norm": 0.03369140625, + "learning_rate": 0.02355278662223162, + "loss": 0.8115, + "num_input_tokens_seen": 29321568, + "step": 50515 + }, + { + "epoch": 7.524575513851653, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02355118487938362, + "loss": 0.8108, + "num_input_tokens_seen": 29324544, + "step": 50520 + }, + { + "epoch": 7.525320226392612, + "grad_norm": 0.021728515625, + "learning_rate": 0.02354958299207205, + "loss": 0.8103, + "num_input_tokens_seen": 29327232, + "step": 50525 + }, + { + "epoch": 7.526064938933572, + "grad_norm": 0.025146484375, + "learning_rate": 0.02354798096032396, + "loss": 0.804, + "num_input_tokens_seen": 29330144, + "step": 50530 + }, + { + "epoch": 7.526809651474531, + "grad_norm": 0.0291748046875, + "learning_rate": 0.023546378784166416, + "loss": 0.7936, + "num_input_tokens_seen": 29333152, + "step": 50535 + }, + { + "epoch": 7.52755436401549, + "grad_norm": 0.0255126953125, + "learning_rate": 0.02354477646362649, + "loss": 0.8089, + "num_input_tokens_seen": 29335872, + "step": 50540 + }, + { + "epoch": 7.528299076556449, + "grad_norm": 0.01287841796875, + "learning_rate": 0.023543173998731255, + "loss": 0.8155, + "num_input_tokens_seen": 29338720, + "step": 50545 + }, + { + "epoch": 7.5290437890974085, + "grad_norm": 0.0123291015625, + "learning_rate": 0.023541571389507777, + "loss": 0.7904, + "num_input_tokens_seen": 29341600, + "step": 50550 + }, + { + "epoch": 7.529788501638367, + "grad_norm": 0.0299072265625, + "learning_rate": 0.02353996863598313, + "loss": 0.8098, + "num_input_tokens_seen": 29344608, + "step": 50555 + }, + { + "epoch": 7.530533214179327, + "grad_norm": 0.023193359375, + "learning_rate": 0.023538365738184394, + "loss": 0.8044, + "num_input_tokens_seen": 29347680, + "step": 50560 + }, + { + "epoch": 7.531277926720286, + "grad_norm": 0.01409912109375, + "learning_rate": 0.023536762696138643, + "loss": 0.821, + "num_input_tokens_seen": 29350688, + "step": 50565 + }, + { + "epoch": 7.532022639261245, + "grad_norm": 0.0224609375, + "learning_rate": 0.02353515950987297, + "loss": 0.8001, + "num_input_tokens_seen": 29353632, + "step": 50570 + }, + { + "epoch": 7.532767351802204, + "grad_norm": 0.0220947265625, + "learning_rate": 0.023533556179414442, + "loss": 0.7939, + "num_input_tokens_seen": 29356448, + "step": 50575 + }, + { + "epoch": 7.533512064343164, + "grad_norm": 0.0155029296875, + "learning_rate": 0.023531952704790162, + "loss": 0.8005, + "num_input_tokens_seen": 29359488, + "step": 50580 + }, + { + "epoch": 7.534256776884122, + "grad_norm": 0.0257568359375, + "learning_rate": 0.023530349086027212, + "loss": 0.7997, + "num_input_tokens_seen": 29362176, + "step": 50585 + }, + { + "epoch": 7.535001489425082, + "grad_norm": 0.0244140625, + "learning_rate": 0.023528745323152682, + "loss": 0.8095, + "num_input_tokens_seen": 29364704, + "step": 50590 + }, + { + "epoch": 7.535746201966041, + "grad_norm": 0.01300048828125, + "learning_rate": 0.023527141416193668, + "loss": 0.7928, + "num_input_tokens_seen": 29367328, + "step": 50595 + }, + { + "epoch": 7.5364909145070005, + "grad_norm": 0.0245361328125, + "learning_rate": 0.02352553736517727, + "loss": 0.815, + "num_input_tokens_seen": 29370464, + "step": 50600 + }, + { + "epoch": 7.537235627047959, + "grad_norm": 0.0150146484375, + "learning_rate": 0.02352393317013058, + "loss": 0.8061, + "num_input_tokens_seen": 29373344, + "step": 50605 + }, + { + "epoch": 7.537980339588919, + "grad_norm": 0.0223388671875, + "learning_rate": 0.023522328831080706, + "loss": 0.7883, + "num_input_tokens_seen": 29376256, + "step": 50610 + }, + { + "epoch": 7.538725052129878, + "grad_norm": 0.0235595703125, + "learning_rate": 0.023520724348054745, + "loss": 0.7964, + "num_input_tokens_seen": 29378944, + "step": 50615 + }, + { + "epoch": 7.539469764670837, + "grad_norm": 0.02392578125, + "learning_rate": 0.023519119721079805, + "loss": 0.8106, + "num_input_tokens_seen": 29381888, + "step": 50620 + }, + { + "epoch": 7.540214477211796, + "grad_norm": 0.023681640625, + "learning_rate": 0.023517514950182995, + "loss": 0.7911, + "num_input_tokens_seen": 29384768, + "step": 50625 + }, + { + "epoch": 7.540959189752756, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02351591003539143, + "loss": 0.8002, + "num_input_tokens_seen": 29387648, + "step": 50630 + }, + { + "epoch": 7.5417039022937145, + "grad_norm": 0.033447265625, + "learning_rate": 0.023514304976732216, + "loss": 0.8055, + "num_input_tokens_seen": 29390816, + "step": 50635 + }, + { + "epoch": 7.542448614834674, + "grad_norm": 0.035888671875, + "learning_rate": 0.02351269977423248, + "loss": 0.8074, + "num_input_tokens_seen": 29393632, + "step": 50640 + }, + { + "epoch": 7.543193327375633, + "grad_norm": 0.0159912109375, + "learning_rate": 0.02351109442791933, + "loss": 0.8038, + "num_input_tokens_seen": 29396576, + "step": 50645 + }, + { + "epoch": 7.5439380399165925, + "grad_norm": 0.02685546875, + "learning_rate": 0.02350948893781989, + "loss": 0.7955, + "num_input_tokens_seen": 29399584, + "step": 50650 + }, + { + "epoch": 7.544682752457551, + "grad_norm": 0.03125, + "learning_rate": 0.023507883303961283, + "loss": 0.7997, + "num_input_tokens_seen": 29402336, + "step": 50655 + }, + { + "epoch": 7.545427464998511, + "grad_norm": 0.028564453125, + "learning_rate": 0.023506277526370634, + "loss": 0.7885, + "num_input_tokens_seen": 29405216, + "step": 50660 + }, + { + "epoch": 7.54617217753947, + "grad_norm": 0.0228271484375, + "learning_rate": 0.023504671605075068, + "loss": 0.7998, + "num_input_tokens_seen": 29407968, + "step": 50665 + }, + { + "epoch": 7.546916890080429, + "grad_norm": 0.023681640625, + "learning_rate": 0.02350306554010172, + "loss": 0.8044, + "num_input_tokens_seen": 29410976, + "step": 50670 + }, + { + "epoch": 7.547661602621388, + "grad_norm": 0.0240478515625, + "learning_rate": 0.02350145933147773, + "loss": 0.8002, + "num_input_tokens_seen": 29413568, + "step": 50675 + }, + { + "epoch": 7.548406315162348, + "grad_norm": 0.0130615234375, + "learning_rate": 0.023499852979230216, + "loss": 0.7831, + "num_input_tokens_seen": 29416224, + "step": 50680 + }, + { + "epoch": 7.5491510277033065, + "grad_norm": 0.0283203125, + "learning_rate": 0.02349824648338633, + "loss": 0.7807, + "num_input_tokens_seen": 29419328, + "step": 50685 + }, + { + "epoch": 7.549895740244265, + "grad_norm": 0.02880859375, + "learning_rate": 0.023496639843973204, + "loss": 0.7771, + "num_input_tokens_seen": 29422240, + "step": 50690 + }, + { + "epoch": 7.550640452785225, + "grad_norm": 0.0235595703125, + "learning_rate": 0.023495033061017988, + "loss": 0.8104, + "num_input_tokens_seen": 29425056, + "step": 50695 + }, + { + "epoch": 7.5513851653261845, + "grad_norm": 0.029541015625, + "learning_rate": 0.023493426134547815, + "loss": 0.8029, + "num_input_tokens_seen": 29427680, + "step": 50700 + }, + { + "epoch": 7.552129877867143, + "grad_norm": 0.0247802734375, + "learning_rate": 0.02349181906458984, + "loss": 0.8251, + "num_input_tokens_seen": 29430304, + "step": 50705 + }, + { + "epoch": 7.552874590408102, + "grad_norm": 0.0252685546875, + "learning_rate": 0.02349021185117122, + "loss": 0.8137, + "num_input_tokens_seen": 29433056, + "step": 50710 + }, + { + "epoch": 7.553619302949062, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0234886044943191, + "loss": 0.8137, + "num_input_tokens_seen": 29435776, + "step": 50715 + }, + { + "epoch": 7.554364015490021, + "grad_norm": 0.032958984375, + "learning_rate": 0.023486996994060625, + "loss": 0.8094, + "num_input_tokens_seen": 29438432, + "step": 50720 + }, + { + "epoch": 7.55510872803098, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02348538935042297, + "loss": 0.7893, + "num_input_tokens_seen": 29441568, + "step": 50725 + }, + { + "epoch": 7.555853440571939, + "grad_norm": 0.0235595703125, + "learning_rate": 0.02348378156343328, + "loss": 0.7926, + "num_input_tokens_seen": 29444320, + "step": 50730 + }, + { + "epoch": 7.5565981531128985, + "grad_norm": 0.02294921875, + "learning_rate": 0.023482173633118726, + "loss": 0.8136, + "num_input_tokens_seen": 29447104, + "step": 50735 + }, + { + "epoch": 7.557342865653857, + "grad_norm": 0.021484375, + "learning_rate": 0.02348056555950647, + "loss": 0.8052, + "num_input_tokens_seen": 29450176, + "step": 50740 + }, + { + "epoch": 7.558087578194817, + "grad_norm": 0.02001953125, + "learning_rate": 0.02347895734262368, + "loss": 0.7928, + "num_input_tokens_seen": 29452992, + "step": 50745 + }, + { + "epoch": 7.558832290735776, + "grad_norm": 0.0262451171875, + "learning_rate": 0.023477348982497523, + "loss": 0.7758, + "num_input_tokens_seen": 29455680, + "step": 50750 + }, + { + "epoch": 7.559577003276735, + "grad_norm": 0.0146484375, + "learning_rate": 0.02347574047915517, + "loss": 0.8131, + "num_input_tokens_seen": 29458400, + "step": 50755 + }, + { + "epoch": 7.560321715817694, + "grad_norm": 0.0235595703125, + "learning_rate": 0.02347413183262379, + "loss": 0.808, + "num_input_tokens_seen": 29461600, + "step": 50760 + }, + { + "epoch": 7.561066428358654, + "grad_norm": 0.018310546875, + "learning_rate": 0.02347252304293057, + "loss": 0.7849, + "num_input_tokens_seen": 29464448, + "step": 50765 + }, + { + "epoch": 7.5618111408996125, + "grad_norm": 0.040771484375, + "learning_rate": 0.02347091411010269, + "loss": 0.8484, + "num_input_tokens_seen": 29467424, + "step": 50770 + }, + { + "epoch": 7.562555853440572, + "grad_norm": 0.028564453125, + "learning_rate": 0.023469305034167318, + "loss": 0.7989, + "num_input_tokens_seen": 29469952, + "step": 50775 + }, + { + "epoch": 7.563300565981531, + "grad_norm": 0.021484375, + "learning_rate": 0.023467695815151648, + "loss": 0.7766, + "num_input_tokens_seen": 29473056, + "step": 50780 + }, + { + "epoch": 7.5640452785224905, + "grad_norm": 0.021240234375, + "learning_rate": 0.023466086453082863, + "loss": 0.8153, + "num_input_tokens_seen": 29475840, + "step": 50785 + }, + { + "epoch": 7.564789991063449, + "grad_norm": 0.029541015625, + "learning_rate": 0.02346447694798815, + "loss": 0.7995, + "num_input_tokens_seen": 29478784, + "step": 50790 + }, + { + "epoch": 7.565534703604409, + "grad_norm": 0.01513671875, + "learning_rate": 0.023462867299894705, + "loss": 0.8048, + "num_input_tokens_seen": 29481792, + "step": 50795 + }, + { + "epoch": 7.566279416145368, + "grad_norm": 0.0208740234375, + "learning_rate": 0.023461257508829717, + "loss": 0.8111, + "num_input_tokens_seen": 29484608, + "step": 50800 + }, + { + "epoch": 7.567024128686327, + "grad_norm": 0.028564453125, + "learning_rate": 0.023459647574820382, + "loss": 0.8073, + "num_input_tokens_seen": 29487456, + "step": 50805 + }, + { + "epoch": 7.567768841227286, + "grad_norm": 0.020751953125, + "learning_rate": 0.023458037497893904, + "loss": 0.7969, + "num_input_tokens_seen": 29490176, + "step": 50810 + }, + { + "epoch": 7.568513553768246, + "grad_norm": 0.0220947265625, + "learning_rate": 0.023456427278077473, + "loss": 0.8022, + "num_input_tokens_seen": 29493536, + "step": 50815 + }, + { + "epoch": 7.5692582663092045, + "grad_norm": 0.01483154296875, + "learning_rate": 0.023454816915398302, + "loss": 0.8005, + "num_input_tokens_seen": 29496480, + "step": 50820 + }, + { + "epoch": 7.570002978850164, + "grad_norm": 0.02001953125, + "learning_rate": 0.02345320640988359, + "loss": 0.8014, + "num_input_tokens_seen": 29499136, + "step": 50825 + }, + { + "epoch": 7.570747691391123, + "grad_norm": 0.017822265625, + "learning_rate": 0.023451595761560546, + "loss": 0.7897, + "num_input_tokens_seen": 29502208, + "step": 50830 + }, + { + "epoch": 7.571492403932083, + "grad_norm": 0.0179443359375, + "learning_rate": 0.023449984970456382, + "loss": 0.7897, + "num_input_tokens_seen": 29505216, + "step": 50835 + }, + { + "epoch": 7.572237116473041, + "grad_norm": 0.0252685546875, + "learning_rate": 0.02344837403659831, + "loss": 0.8152, + "num_input_tokens_seen": 29508288, + "step": 50840 + }, + { + "epoch": 7.572981829014001, + "grad_norm": 0.032958984375, + "learning_rate": 0.023446762960013547, + "loss": 0.8006, + "num_input_tokens_seen": 29511200, + "step": 50845 + }, + { + "epoch": 7.57372654155496, + "grad_norm": 0.03955078125, + "learning_rate": 0.02344515174072931, + "loss": 0.7987, + "num_input_tokens_seen": 29514112, + "step": 50850 + }, + { + "epoch": 7.5744712540959185, + "grad_norm": 0.016357421875, + "learning_rate": 0.023443540378772814, + "loss": 0.7891, + "num_input_tokens_seen": 29517056, + "step": 50855 + }, + { + "epoch": 7.575215966636878, + "grad_norm": 0.0286865234375, + "learning_rate": 0.023441928874171288, + "loss": 0.8188, + "num_input_tokens_seen": 29519968, + "step": 50860 + }, + { + "epoch": 7.575960679177838, + "grad_norm": 0.0242919921875, + "learning_rate": 0.02344031722695195, + "loss": 0.7995, + "num_input_tokens_seen": 29522784, + "step": 50865 + }, + { + "epoch": 7.5767053917187965, + "grad_norm": 0.027587890625, + "learning_rate": 0.02343870543714203, + "loss": 0.8025, + "num_input_tokens_seen": 29525440, + "step": 50870 + }, + { + "epoch": 7.577450104259755, + "grad_norm": 0.04052734375, + "learning_rate": 0.023437093504768764, + "loss": 0.8068, + "num_input_tokens_seen": 29528352, + "step": 50875 + }, + { + "epoch": 7.578194816800715, + "grad_norm": 0.0262451171875, + "learning_rate": 0.023435481429859375, + "loss": 0.8099, + "num_input_tokens_seen": 29531392, + "step": 50880 + }, + { + "epoch": 7.578939529341675, + "grad_norm": 0.0213623046875, + "learning_rate": 0.023433869212441102, + "loss": 0.7941, + "num_input_tokens_seen": 29534240, + "step": 50885 + }, + { + "epoch": 7.579684241882633, + "grad_norm": 0.031494140625, + "learning_rate": 0.02343225685254118, + "loss": 0.7888, + "num_input_tokens_seen": 29537472, + "step": 50890 + }, + { + "epoch": 7.580428954423592, + "grad_norm": 0.03515625, + "learning_rate": 0.02343064435018685, + "loss": 0.8158, + "num_input_tokens_seen": 29540224, + "step": 50895 + }, + { + "epoch": 7.581173666964552, + "grad_norm": 0.0279541015625, + "learning_rate": 0.023429031705405348, + "loss": 0.7966, + "num_input_tokens_seen": 29543200, + "step": 50900 + }, + { + "epoch": 7.5819183795055105, + "grad_norm": 0.02392578125, + "learning_rate": 0.023427418918223926, + "loss": 0.8065, + "num_input_tokens_seen": 29545664, + "step": 50905 + }, + { + "epoch": 7.58266309204647, + "grad_norm": 0.01806640625, + "learning_rate": 0.023425805988669824, + "loss": 0.802, + "num_input_tokens_seen": 29548576, + "step": 50910 + }, + { + "epoch": 7.583407804587429, + "grad_norm": 0.0264892578125, + "learning_rate": 0.023424192916770295, + "loss": 0.7867, + "num_input_tokens_seen": 29551648, + "step": 50915 + }, + { + "epoch": 7.584152517128389, + "grad_norm": 0.014404296875, + "learning_rate": 0.023422579702552588, + "loss": 0.8102, + "num_input_tokens_seen": 29554656, + "step": 50920 + }, + { + "epoch": 7.584897229669347, + "grad_norm": 0.039306640625, + "learning_rate": 0.02342096634604396, + "loss": 0.8283, + "num_input_tokens_seen": 29557440, + "step": 50925 + }, + { + "epoch": 7.585641942210307, + "grad_norm": 0.02734375, + "learning_rate": 0.023419352847271663, + "loss": 0.787, + "num_input_tokens_seen": 29560544, + "step": 50930 + }, + { + "epoch": 7.586386654751266, + "grad_norm": 0.021240234375, + "learning_rate": 0.02341773920626295, + "loss": 0.8114, + "num_input_tokens_seen": 29563328, + "step": 50935 + }, + { + "epoch": 7.587131367292225, + "grad_norm": 0.0213623046875, + "learning_rate": 0.0234161254230451, + "loss": 0.7933, + "num_input_tokens_seen": 29566464, + "step": 50940 + }, + { + "epoch": 7.587876079833184, + "grad_norm": 0.0166015625, + "learning_rate": 0.023414511497645358, + "loss": 0.7975, + "num_input_tokens_seen": 29569280, + "step": 50945 + }, + { + "epoch": 7.588620792374144, + "grad_norm": 0.04345703125, + "learning_rate": 0.023412897430090996, + "loss": 0.8073, + "num_input_tokens_seen": 29572256, + "step": 50950 + }, + { + "epoch": 7.5893655049151025, + "grad_norm": 0.024169921875, + "learning_rate": 0.023411283220409283, + "loss": 0.7995, + "num_input_tokens_seen": 29575200, + "step": 50955 + }, + { + "epoch": 7.590110217456062, + "grad_norm": 0.024658203125, + "learning_rate": 0.02340966886862749, + "loss": 0.796, + "num_input_tokens_seen": 29578176, + "step": 50960 + }, + { + "epoch": 7.590854929997021, + "grad_norm": 0.03369140625, + "learning_rate": 0.02340805437477289, + "loss": 0.8095, + "num_input_tokens_seen": 29580960, + "step": 50965 + }, + { + "epoch": 7.591599642537981, + "grad_norm": 0.033447265625, + "learning_rate": 0.023406439738872756, + "loss": 0.822, + "num_input_tokens_seen": 29583776, + "step": 50970 + }, + { + "epoch": 7.592344355078939, + "grad_norm": 0.02294921875, + "learning_rate": 0.023404824960954366, + "loss": 0.8108, + "num_input_tokens_seen": 29586720, + "step": 50975 + }, + { + "epoch": 7.593089067619899, + "grad_norm": 0.02197265625, + "learning_rate": 0.023403210041044998, + "loss": 0.8024, + "num_input_tokens_seen": 29589600, + "step": 50980 + }, + { + "epoch": 7.593833780160858, + "grad_norm": 0.03369140625, + "learning_rate": 0.023401594979171936, + "loss": 0.776, + "num_input_tokens_seen": 29592288, + "step": 50985 + }, + { + "epoch": 7.594578492701817, + "grad_norm": 0.044189453125, + "learning_rate": 0.023399979775362473, + "loss": 0.8334, + "num_input_tokens_seen": 29595456, + "step": 50990 + }, + { + "epoch": 7.595323205242776, + "grad_norm": 0.0162353515625, + "learning_rate": 0.02339836442964388, + "loss": 0.8055, + "num_input_tokens_seen": 29598624, + "step": 50995 + }, + { + "epoch": 7.596067917783736, + "grad_norm": 0.0191650390625, + "learning_rate": 0.023396748942043463, + "loss": 0.8157, + "num_input_tokens_seen": 29601472, + "step": 51000 + }, + { + "epoch": 7.596812630324695, + "grad_norm": 0.0166015625, + "learning_rate": 0.02339513331258851, + "loss": 0.8135, + "num_input_tokens_seen": 29604416, + "step": 51005 + }, + { + "epoch": 7.597557342865654, + "grad_norm": 0.0211181640625, + "learning_rate": 0.023393517541306304, + "loss": 0.7849, + "num_input_tokens_seen": 29607264, + "step": 51010 + }, + { + "epoch": 7.598302055406613, + "grad_norm": 0.0247802734375, + "learning_rate": 0.02339190162822415, + "loss": 0.8012, + "num_input_tokens_seen": 29609952, + "step": 51015 + }, + { + "epoch": 7.599046767947573, + "grad_norm": 0.0166015625, + "learning_rate": 0.02339028557336935, + "loss": 0.7829, + "num_input_tokens_seen": 29613152, + "step": 51020 + }, + { + "epoch": 7.599791480488531, + "grad_norm": 0.0205078125, + "learning_rate": 0.0233886693767692, + "loss": 0.7847, + "num_input_tokens_seen": 29616160, + "step": 51025 + }, + { + "epoch": 7.600536193029491, + "grad_norm": 0.028076171875, + "learning_rate": 0.02338705303845101, + "loss": 0.7941, + "num_input_tokens_seen": 29618976, + "step": 51030 + }, + { + "epoch": 7.60128090557045, + "grad_norm": 0.0257568359375, + "learning_rate": 0.023385436558442083, + "loss": 0.8035, + "num_input_tokens_seen": 29621952, + "step": 51035 + }, + { + "epoch": 7.6020256181114085, + "grad_norm": 0.01336669921875, + "learning_rate": 0.023383819936769728, + "loss": 0.8187, + "num_input_tokens_seen": 29624736, + "step": 51040 + }, + { + "epoch": 7.602770330652368, + "grad_norm": 0.0419921875, + "learning_rate": 0.023382203173461256, + "loss": 0.7887, + "num_input_tokens_seen": 29627840, + "step": 51045 + }, + { + "epoch": 7.603515043193328, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02338058626854398, + "loss": 0.7834, + "num_input_tokens_seen": 29630752, + "step": 51050 + }, + { + "epoch": 7.604259755734287, + "grad_norm": 0.035400390625, + "learning_rate": 0.02337896922204522, + "loss": 0.7952, + "num_input_tokens_seen": 29633600, + "step": 51055 + }, + { + "epoch": 7.605004468275245, + "grad_norm": 0.0247802734375, + "learning_rate": 0.023377352033992285, + "loss": 0.8228, + "num_input_tokens_seen": 29636640, + "step": 51060 + }, + { + "epoch": 7.605749180816205, + "grad_norm": 0.043212890625, + "learning_rate": 0.023375734704412503, + "loss": 0.8131, + "num_input_tokens_seen": 29639904, + "step": 51065 + }, + { + "epoch": 7.606493893357164, + "grad_norm": 0.02392578125, + "learning_rate": 0.023374117233333196, + "loss": 0.7956, + "num_input_tokens_seen": 29642592, + "step": 51070 + }, + { + "epoch": 7.607238605898123, + "grad_norm": 0.0230712890625, + "learning_rate": 0.02337249962078169, + "loss": 0.7928, + "num_input_tokens_seen": 29645472, + "step": 51075 + }, + { + "epoch": 7.607983318439082, + "grad_norm": 0.024658203125, + "learning_rate": 0.023370881866785313, + "loss": 0.8199, + "num_input_tokens_seen": 29648800, + "step": 51080 + }, + { + "epoch": 7.608728030980042, + "grad_norm": 0.01495361328125, + "learning_rate": 0.023369263971371398, + "loss": 0.83, + "num_input_tokens_seen": 29651712, + "step": 51085 + }, + { + "epoch": 7.609472743521001, + "grad_norm": 0.01611328125, + "learning_rate": 0.02336764593456727, + "loss": 0.8176, + "num_input_tokens_seen": 29654624, + "step": 51090 + }, + { + "epoch": 7.61021745606196, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02336602775640027, + "loss": 0.7925, + "num_input_tokens_seen": 29657312, + "step": 51095 + }, + { + "epoch": 7.610962168602919, + "grad_norm": 0.01531982421875, + "learning_rate": 0.023364409436897732, + "loss": 0.8043, + "num_input_tokens_seen": 29660064, + "step": 51100 + }, + { + "epoch": 7.611706881143879, + "grad_norm": 0.03125, + "learning_rate": 0.023362790976087, + "loss": 0.8018, + "num_input_tokens_seen": 29663072, + "step": 51105 + }, + { + "epoch": 7.612451593684837, + "grad_norm": 0.0201416015625, + "learning_rate": 0.023361172373995414, + "loss": 0.7951, + "num_input_tokens_seen": 29666240, + "step": 51110 + }, + { + "epoch": 7.613196306225797, + "grad_norm": 0.0213623046875, + "learning_rate": 0.023359553630650315, + "loss": 0.8142, + "num_input_tokens_seen": 29669152, + "step": 51115 + }, + { + "epoch": 7.613941018766756, + "grad_norm": 0.025634765625, + "learning_rate": 0.023357934746079057, + "loss": 0.7972, + "num_input_tokens_seen": 29671776, + "step": 51120 + }, + { + "epoch": 7.614685731307715, + "grad_norm": 0.0279541015625, + "learning_rate": 0.02335631572030899, + "loss": 0.829, + "num_input_tokens_seen": 29674496, + "step": 51125 + }, + { + "epoch": 7.615430443848674, + "grad_norm": 0.0308837890625, + "learning_rate": 0.023354696553367457, + "loss": 0.8019, + "num_input_tokens_seen": 29676896, + "step": 51130 + }, + { + "epoch": 7.616175156389634, + "grad_norm": 0.022216796875, + "learning_rate": 0.023353077245281816, + "loss": 0.805, + "num_input_tokens_seen": 29679712, + "step": 51135 + }, + { + "epoch": 7.616919868930593, + "grad_norm": 0.035888671875, + "learning_rate": 0.023351457796079424, + "loss": 0.8217, + "num_input_tokens_seen": 29682496, + "step": 51140 + }, + { + "epoch": 7.617664581471552, + "grad_norm": 0.0218505859375, + "learning_rate": 0.023349838205787645, + "loss": 0.8176, + "num_input_tokens_seen": 29685472, + "step": 51145 + }, + { + "epoch": 7.618409294012511, + "grad_norm": 0.0225830078125, + "learning_rate": 0.02334821847443383, + "loss": 0.8075, + "num_input_tokens_seen": 29688160, + "step": 51150 + }, + { + "epoch": 7.619154006553471, + "grad_norm": 0.0218505859375, + "learning_rate": 0.023346598602045354, + "loss": 0.8055, + "num_input_tokens_seen": 29690976, + "step": 51155 + }, + { + "epoch": 7.619898719094429, + "grad_norm": 0.01904296875, + "learning_rate": 0.023344978588649572, + "loss": 0.7955, + "num_input_tokens_seen": 29693888, + "step": 51160 + }, + { + "epoch": 7.620643431635389, + "grad_norm": 0.019775390625, + "learning_rate": 0.02334335843427387, + "loss": 0.8105, + "num_input_tokens_seen": 29696320, + "step": 51165 + }, + { + "epoch": 7.621388144176348, + "grad_norm": 0.0140380859375, + "learning_rate": 0.023341738138945595, + "loss": 0.7961, + "num_input_tokens_seen": 29699232, + "step": 51170 + }, + { + "epoch": 7.6221328567173074, + "grad_norm": 0.03955078125, + "learning_rate": 0.02334011770269214, + "loss": 0.813, + "num_input_tokens_seen": 29702176, + "step": 51175 + }, + { + "epoch": 7.622877569258266, + "grad_norm": 0.0167236328125, + "learning_rate": 0.023338497125540866, + "loss": 0.8282, + "num_input_tokens_seen": 29704832, + "step": 51180 + }, + { + "epoch": 7.623622281799226, + "grad_norm": 0.0179443359375, + "learning_rate": 0.023336876407519164, + "loss": 0.8036, + "num_input_tokens_seen": 29707808, + "step": 51185 + }, + { + "epoch": 7.624366994340185, + "grad_norm": 0.01397705078125, + "learning_rate": 0.023335255548654406, + "loss": 0.8197, + "num_input_tokens_seen": 29710688, + "step": 51190 + }, + { + "epoch": 7.625111706881144, + "grad_norm": 0.0186767578125, + "learning_rate": 0.023333634548973973, + "loss": 0.809, + "num_input_tokens_seen": 29713472, + "step": 51195 + }, + { + "epoch": 7.625856419422103, + "grad_norm": 0.030517578125, + "learning_rate": 0.02333201340850526, + "loss": 0.798, + "num_input_tokens_seen": 29716480, + "step": 51200 + }, + { + "epoch": 7.626601131963062, + "grad_norm": 0.02197265625, + "learning_rate": 0.023330392127275643, + "loss": 0.8155, + "num_input_tokens_seen": 29719232, + "step": 51205 + }, + { + "epoch": 7.627345844504021, + "grad_norm": 0.02392578125, + "learning_rate": 0.023328770705312522, + "loss": 0.8105, + "num_input_tokens_seen": 29722016, + "step": 51210 + }, + { + "epoch": 7.628090557044981, + "grad_norm": 0.0247802734375, + "learning_rate": 0.02332714914264329, + "loss": 0.7998, + "num_input_tokens_seen": 29724832, + "step": 51215 + }, + { + "epoch": 7.62883526958594, + "grad_norm": 0.022216796875, + "learning_rate": 0.023325527439295328, + "loss": 0.8061, + "num_input_tokens_seen": 29727680, + "step": 51220 + }, + { + "epoch": 7.629579982126899, + "grad_norm": 0.028564453125, + "learning_rate": 0.023323905595296044, + "loss": 0.8067, + "num_input_tokens_seen": 29730624, + "step": 51225 + }, + { + "epoch": 7.630324694667858, + "grad_norm": 0.0223388671875, + "learning_rate": 0.023322283610672834, + "loss": 0.8055, + "num_input_tokens_seen": 29733536, + "step": 51230 + }, + { + "epoch": 7.631069407208818, + "grad_norm": 0.0361328125, + "learning_rate": 0.023320661485453104, + "loss": 0.8099, + "num_input_tokens_seen": 29736608, + "step": 51235 + }, + { + "epoch": 7.631814119749777, + "grad_norm": 0.0240478515625, + "learning_rate": 0.023319039219664252, + "loss": 0.7925, + "num_input_tokens_seen": 29739648, + "step": 51240 + }, + { + "epoch": 7.632558832290735, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02331741681333369, + "loss": 0.8043, + "num_input_tokens_seen": 29742528, + "step": 51245 + }, + { + "epoch": 7.633303544831695, + "grad_norm": 0.0157470703125, + "learning_rate": 0.02331579426648882, + "loss": 0.7981, + "num_input_tokens_seen": 29745504, + "step": 51250 + }, + { + "epoch": 7.634048257372654, + "grad_norm": 0.0230712890625, + "learning_rate": 0.023314171579157064, + "loss": 0.7994, + "num_input_tokens_seen": 29748384, + "step": 51255 + }, + { + "epoch": 7.6347929699136134, + "grad_norm": 0.0206298828125, + "learning_rate": 0.023312548751365826, + "loss": 0.819, + "num_input_tokens_seen": 29751328, + "step": 51260 + }, + { + "epoch": 7.635537682454572, + "grad_norm": 0.022216796875, + "learning_rate": 0.023310925783142523, + "loss": 0.8096, + "num_input_tokens_seen": 29754176, + "step": 51265 + }, + { + "epoch": 7.636282394995532, + "grad_norm": 0.0233154296875, + "learning_rate": 0.02330930267451458, + "loss": 0.8095, + "num_input_tokens_seen": 29757024, + "step": 51270 + }, + { + "epoch": 7.637027107536491, + "grad_norm": 0.021728515625, + "learning_rate": 0.02330767942550941, + "loss": 0.7916, + "num_input_tokens_seen": 29760128, + "step": 51275 + }, + { + "epoch": 7.63777182007745, + "grad_norm": 0.01361083984375, + "learning_rate": 0.02330605603615444, + "loss": 0.8053, + "num_input_tokens_seen": 29762752, + "step": 51280 + }, + { + "epoch": 7.638516532618409, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0233044325064771, + "loss": 0.7937, + "num_input_tokens_seen": 29765984, + "step": 51285 + }, + { + "epoch": 7.639261245159369, + "grad_norm": 0.0234375, + "learning_rate": 0.02330280883650481, + "loss": 0.8, + "num_input_tokens_seen": 29768768, + "step": 51290 + }, + { + "epoch": 7.640005957700327, + "grad_norm": 0.028076171875, + "learning_rate": 0.023301185026265003, + "loss": 0.7977, + "num_input_tokens_seen": 29771520, + "step": 51295 + }, + { + "epoch": 7.640750670241287, + "grad_norm": 0.019287109375, + "learning_rate": 0.02329956107578511, + "loss": 0.7901, + "num_input_tokens_seen": 29774592, + "step": 51300 + }, + { + "epoch": 7.641495382782246, + "grad_norm": 0.0244140625, + "learning_rate": 0.023297936985092572, + "loss": 0.8041, + "num_input_tokens_seen": 29777344, + "step": 51305 + }, + { + "epoch": 7.6422400953232055, + "grad_norm": 0.0159912109375, + "learning_rate": 0.02329631275421482, + "loss": 0.8001, + "num_input_tokens_seen": 29780256, + "step": 51310 + }, + { + "epoch": 7.642984807864164, + "grad_norm": 0.02294921875, + "learning_rate": 0.023294688383179295, + "loss": 0.804, + "num_input_tokens_seen": 29783296, + "step": 51315 + }, + { + "epoch": 7.643729520405124, + "grad_norm": 0.0294189453125, + "learning_rate": 0.023293063872013443, + "loss": 0.811, + "num_input_tokens_seen": 29786240, + "step": 51320 + }, + { + "epoch": 7.644474232946083, + "grad_norm": 0.0390625, + "learning_rate": 0.0232914392207447, + "loss": 0.8008, + "num_input_tokens_seen": 29789120, + "step": 51325 + }, + { + "epoch": 7.645218945487042, + "grad_norm": 0.023193359375, + "learning_rate": 0.023289814429400525, + "loss": 0.7979, + "num_input_tokens_seen": 29791840, + "step": 51330 + }, + { + "epoch": 7.645963658028001, + "grad_norm": 0.0228271484375, + "learning_rate": 0.023288189498008358, + "loss": 0.8065, + "num_input_tokens_seen": 29794624, + "step": 51335 + }, + { + "epoch": 7.646708370568961, + "grad_norm": 0.0167236328125, + "learning_rate": 0.023286564426595654, + "loss": 0.8076, + "num_input_tokens_seen": 29797376, + "step": 51340 + }, + { + "epoch": 7.6474530831099194, + "grad_norm": 0.02685546875, + "learning_rate": 0.02328493921518986, + "loss": 0.8059, + "num_input_tokens_seen": 29800256, + "step": 51345 + }, + { + "epoch": 7.648197795650879, + "grad_norm": 0.019775390625, + "learning_rate": 0.023283313863818444, + "loss": 0.8065, + "num_input_tokens_seen": 29803360, + "step": 51350 + }, + { + "epoch": 7.648942508191838, + "grad_norm": 0.0184326171875, + "learning_rate": 0.023281688372508862, + "loss": 0.8093, + "num_input_tokens_seen": 29806720, + "step": 51355 + }, + { + "epoch": 7.6496872207327975, + "grad_norm": 0.0234375, + "learning_rate": 0.023280062741288566, + "loss": 0.7756, + "num_input_tokens_seen": 29809408, + "step": 51360 + }, + { + "epoch": 7.650431933273756, + "grad_norm": 0.019287109375, + "learning_rate": 0.02327843697018503, + "loss": 0.8124, + "num_input_tokens_seen": 29812352, + "step": 51365 + }, + { + "epoch": 7.651176645814716, + "grad_norm": 0.0279541015625, + "learning_rate": 0.023276811059225713, + "loss": 0.7867, + "num_input_tokens_seen": 29815424, + "step": 51370 + }, + { + "epoch": 7.651921358355675, + "grad_norm": 0.0250244140625, + "learning_rate": 0.023275185008438084, + "loss": 0.8, + "num_input_tokens_seen": 29818016, + "step": 51375 + }, + { + "epoch": 7.652666070896634, + "grad_norm": 0.02294921875, + "learning_rate": 0.023273558817849614, + "loss": 0.8038, + "num_input_tokens_seen": 29821024, + "step": 51380 + }, + { + "epoch": 7.653410783437593, + "grad_norm": 0.01214599609375, + "learning_rate": 0.02327193248748778, + "loss": 0.8018, + "num_input_tokens_seen": 29824128, + "step": 51385 + }, + { + "epoch": 7.654155495978552, + "grad_norm": 0.033447265625, + "learning_rate": 0.023270306017380053, + "loss": 0.8114, + "num_input_tokens_seen": 29827008, + "step": 51390 + }, + { + "epoch": 7.6549002085195115, + "grad_norm": 0.014892578125, + "learning_rate": 0.02326867940755391, + "loss": 0.8065, + "num_input_tokens_seen": 29829920, + "step": 51395 + }, + { + "epoch": 7.655644921060471, + "grad_norm": 0.0113525390625, + "learning_rate": 0.023267052658036837, + "loss": 0.8106, + "num_input_tokens_seen": 29832800, + "step": 51400 + }, + { + "epoch": 7.65638963360143, + "grad_norm": 0.01361083984375, + "learning_rate": 0.023265425768856308, + "loss": 0.803, + "num_input_tokens_seen": 29835584, + "step": 51405 + }, + { + "epoch": 7.657134346142389, + "grad_norm": 0.030029296875, + "learning_rate": 0.02326379874003981, + "loss": 0.8246, + "num_input_tokens_seen": 29838784, + "step": 51410 + }, + { + "epoch": 7.657879058683348, + "grad_norm": 0.0262451171875, + "learning_rate": 0.023262171571614833, + "loss": 0.8126, + "num_input_tokens_seen": 29841760, + "step": 51415 + }, + { + "epoch": 7.658623771224307, + "grad_norm": 0.0238037109375, + "learning_rate": 0.023260544263608865, + "loss": 0.8277, + "num_input_tokens_seen": 29844608, + "step": 51420 + }, + { + "epoch": 7.659368483765267, + "grad_norm": 0.0274658203125, + "learning_rate": 0.023258916816049396, + "loss": 0.7953, + "num_input_tokens_seen": 29847488, + "step": 51425 + }, + { + "epoch": 7.6601131963062254, + "grad_norm": 0.01611328125, + "learning_rate": 0.02325728922896392, + "loss": 0.7896, + "num_input_tokens_seen": 29850560, + "step": 51430 + }, + { + "epoch": 7.660857908847185, + "grad_norm": 0.020751953125, + "learning_rate": 0.02325566150237994, + "loss": 0.8145, + "num_input_tokens_seen": 29853632, + "step": 51435 + }, + { + "epoch": 7.661602621388144, + "grad_norm": 0.01806640625, + "learning_rate": 0.023254033636324944, + "loss": 0.8015, + "num_input_tokens_seen": 29856512, + "step": 51440 + }, + { + "epoch": 7.6623473339291035, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02325240563082644, + "loss": 0.7881, + "num_input_tokens_seen": 29859424, + "step": 51445 + }, + { + "epoch": 7.663092046470062, + "grad_norm": 0.019287109375, + "learning_rate": 0.023250777485911936, + "loss": 0.7979, + "num_input_tokens_seen": 29862176, + "step": 51450 + }, + { + "epoch": 7.663836759011022, + "grad_norm": 0.0203857421875, + "learning_rate": 0.023249149201608924, + "loss": 0.8077, + "num_input_tokens_seen": 29865120, + "step": 51455 + }, + { + "epoch": 7.664581471551981, + "grad_norm": 0.0262451171875, + "learning_rate": 0.023247520777944924, + "loss": 0.7853, + "num_input_tokens_seen": 29868096, + "step": 51460 + }, + { + "epoch": 7.66532618409294, + "grad_norm": 0.0235595703125, + "learning_rate": 0.02324589221494744, + "loss": 0.7948, + "num_input_tokens_seen": 29870880, + "step": 51465 + }, + { + "epoch": 7.666070896633899, + "grad_norm": 0.0284423828125, + "learning_rate": 0.02324426351264399, + "loss": 0.7936, + "num_input_tokens_seen": 29873632, + "step": 51470 + }, + { + "epoch": 7.666815609174859, + "grad_norm": 0.0250244140625, + "learning_rate": 0.02324263467106209, + "loss": 0.819, + "num_input_tokens_seen": 29876448, + "step": 51475 + }, + { + "epoch": 7.6675603217158175, + "grad_norm": 0.0299072265625, + "learning_rate": 0.02324100569022925, + "loss": 0.8094, + "num_input_tokens_seen": 29879424, + "step": 51480 + }, + { + "epoch": 7.668305034256777, + "grad_norm": 0.029296875, + "learning_rate": 0.023239376570172998, + "loss": 0.8074, + "num_input_tokens_seen": 29882336, + "step": 51485 + }, + { + "epoch": 7.669049746797736, + "grad_norm": 0.02587890625, + "learning_rate": 0.023237747310920855, + "loss": 0.7883, + "num_input_tokens_seen": 29885088, + "step": 51490 + }, + { + "epoch": 7.6697944593386955, + "grad_norm": 0.0245361328125, + "learning_rate": 0.02323611791250034, + "loss": 0.7986, + "num_input_tokens_seen": 29888032, + "step": 51495 + }, + { + "epoch": 7.670539171879654, + "grad_norm": 0.0269775390625, + "learning_rate": 0.02323448837493898, + "loss": 0.7997, + "num_input_tokens_seen": 29890880, + "step": 51500 + }, + { + "epoch": 7.671283884420614, + "grad_norm": 0.024169921875, + "learning_rate": 0.023232858698264313, + "loss": 0.8057, + "num_input_tokens_seen": 29893952, + "step": 51505 + }, + { + "epoch": 7.672028596961573, + "grad_norm": 0.0216064453125, + "learning_rate": 0.023231228882503865, + "loss": 0.8056, + "num_input_tokens_seen": 29896960, + "step": 51510 + }, + { + "epoch": 7.672773309502532, + "grad_norm": 0.0189208984375, + "learning_rate": 0.023229598927685172, + "loss": 0.8059, + "num_input_tokens_seen": 29900288, + "step": 51515 + }, + { + "epoch": 7.673518022043491, + "grad_norm": 0.03662109375, + "learning_rate": 0.02322796883383577, + "loss": 0.8194, + "num_input_tokens_seen": 29903520, + "step": 51520 + }, + { + "epoch": 7.674262734584451, + "grad_norm": 0.0185546875, + "learning_rate": 0.02322633860098319, + "loss": 0.8254, + "num_input_tokens_seen": 29906368, + "step": 51525 + }, + { + "epoch": 7.6750074471254095, + "grad_norm": 0.0230712890625, + "learning_rate": 0.023224708229154987, + "loss": 0.7901, + "num_input_tokens_seen": 29909376, + "step": 51530 + }, + { + "epoch": 7.675752159666369, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0232230777183787, + "loss": 0.8105, + "num_input_tokens_seen": 29912544, + "step": 51535 + }, + { + "epoch": 7.676496872207328, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02322144706868187, + "loss": 0.8023, + "num_input_tokens_seen": 29915168, + "step": 51540 + }, + { + "epoch": 7.6772415847482876, + "grad_norm": 0.018798828125, + "learning_rate": 0.02321981628009204, + "loss": 0.7946, + "num_input_tokens_seen": 29918272, + "step": 51545 + }, + { + "epoch": 7.677986297289246, + "grad_norm": 0.0264892578125, + "learning_rate": 0.023218185352636776, + "loss": 0.8306, + "num_input_tokens_seen": 29921440, + "step": 51550 + }, + { + "epoch": 7.678731009830205, + "grad_norm": 0.0157470703125, + "learning_rate": 0.02321655428634362, + "loss": 0.7955, + "num_input_tokens_seen": 29924352, + "step": 51555 + }, + { + "epoch": 7.679475722371165, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02321492308124013, + "loss": 0.8074, + "num_input_tokens_seen": 29927488, + "step": 51560 + }, + { + "epoch": 7.680220434912124, + "grad_norm": 0.0284423828125, + "learning_rate": 0.02321329173735387, + "loss": 0.8061, + "num_input_tokens_seen": 29930272, + "step": 51565 + }, + { + "epoch": 7.680965147453083, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02321166025471239, + "loss": 0.8007, + "num_input_tokens_seen": 29933152, + "step": 51570 + }, + { + "epoch": 7.681709859994042, + "grad_norm": 0.0240478515625, + "learning_rate": 0.023210028633343253, + "loss": 0.7957, + "num_input_tokens_seen": 29935936, + "step": 51575 + }, + { + "epoch": 7.6824545725350015, + "grad_norm": 0.026611328125, + "learning_rate": 0.02320839687327403, + "loss": 0.7871, + "num_input_tokens_seen": 29938688, + "step": 51580 + }, + { + "epoch": 7.683199285075961, + "grad_norm": 0.0284423828125, + "learning_rate": 0.02320676497453228, + "loss": 0.7906, + "num_input_tokens_seen": 29941472, + "step": 51585 + }, + { + "epoch": 7.68394399761692, + "grad_norm": 0.020751953125, + "learning_rate": 0.02320513293714558, + "loss": 0.8095, + "num_input_tokens_seen": 29944224, + "step": 51590 + }, + { + "epoch": 7.684688710157879, + "grad_norm": 0.0159912109375, + "learning_rate": 0.023203500761141502, + "loss": 0.8057, + "num_input_tokens_seen": 29947296, + "step": 51595 + }, + { + "epoch": 7.685433422698838, + "grad_norm": 0.0174560546875, + "learning_rate": 0.023201868446547612, + "loss": 0.8059, + "num_input_tokens_seen": 29950336, + "step": 51600 + }, + { + "epoch": 7.686178135239797, + "grad_norm": 0.0181884765625, + "learning_rate": 0.02320023599339149, + "loss": 0.8032, + "num_input_tokens_seen": 29953184, + "step": 51605 + }, + { + "epoch": 7.686922847780757, + "grad_norm": 0.035400390625, + "learning_rate": 0.023198603401700716, + "loss": 0.8065, + "num_input_tokens_seen": 29956000, + "step": 51610 + }, + { + "epoch": 7.6876675603217155, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02319697067150287, + "loss": 0.7958, + "num_input_tokens_seen": 29958944, + "step": 51615 + }, + { + "epoch": 7.688412272862675, + "grad_norm": 0.028564453125, + "learning_rate": 0.02319533780282554, + "loss": 0.817, + "num_input_tokens_seen": 29962016, + "step": 51620 + }, + { + "epoch": 7.689156985403634, + "grad_norm": 0.021240234375, + "learning_rate": 0.0231937047956963, + "loss": 0.8029, + "num_input_tokens_seen": 29964928, + "step": 51625 + }, + { + "epoch": 7.6899016979445936, + "grad_norm": 0.0218505859375, + "learning_rate": 0.023192071650142747, + "loss": 0.8039, + "num_input_tokens_seen": 29968000, + "step": 51630 + }, + { + "epoch": 7.690646410485552, + "grad_norm": 0.029541015625, + "learning_rate": 0.023190438366192476, + "loss": 0.791, + "num_input_tokens_seen": 29970912, + "step": 51635 + }, + { + "epoch": 7.691391123026512, + "grad_norm": 0.03173828125, + "learning_rate": 0.023188804943873067, + "loss": 0.8054, + "num_input_tokens_seen": 29973824, + "step": 51640 + }, + { + "epoch": 7.692135835567471, + "grad_norm": 0.0220947265625, + "learning_rate": 0.023187171383212118, + "loss": 0.7913, + "num_input_tokens_seen": 29976736, + "step": 51645 + }, + { + "epoch": 7.69288054810843, + "grad_norm": 0.0341796875, + "learning_rate": 0.02318553768423724, + "loss": 0.8083, + "num_input_tokens_seen": 29979712, + "step": 51650 + }, + { + "epoch": 7.693625260649389, + "grad_norm": 0.01708984375, + "learning_rate": 0.023183903846976016, + "loss": 0.798, + "num_input_tokens_seen": 29982720, + "step": 51655 + }, + { + "epoch": 7.694369973190349, + "grad_norm": 0.026611328125, + "learning_rate": 0.02318226987145605, + "loss": 0.8116, + "num_input_tokens_seen": 29985632, + "step": 51660 + }, + { + "epoch": 7.6951146857313075, + "grad_norm": 0.01806640625, + "learning_rate": 0.02318063575770496, + "loss": 0.7911, + "num_input_tokens_seen": 29989120, + "step": 51665 + }, + { + "epoch": 7.695859398272267, + "grad_norm": 0.0208740234375, + "learning_rate": 0.023179001505750335, + "loss": 0.8079, + "num_input_tokens_seen": 29991968, + "step": 51670 + }, + { + "epoch": 7.696604110813226, + "grad_norm": 0.033203125, + "learning_rate": 0.023177367115619796, + "loss": 0.8153, + "num_input_tokens_seen": 29994720, + "step": 51675 + }, + { + "epoch": 7.697348823354186, + "grad_norm": 0.0218505859375, + "learning_rate": 0.023175732587340948, + "loss": 0.8117, + "num_input_tokens_seen": 29997728, + "step": 51680 + }, + { + "epoch": 7.698093535895144, + "grad_norm": 0.021484375, + "learning_rate": 0.023174097920941415, + "loss": 0.8159, + "num_input_tokens_seen": 30000480, + "step": 51685 + }, + { + "epoch": 7.698838248436104, + "grad_norm": 0.01611328125, + "learning_rate": 0.023172463116448803, + "loss": 0.8093, + "num_input_tokens_seen": 30003648, + "step": 51690 + }, + { + "epoch": 7.699582960977063, + "grad_norm": 0.0250244140625, + "learning_rate": 0.023170828173890726, + "loss": 0.811, + "num_input_tokens_seen": 30006432, + "step": 51695 + }, + { + "epoch": 7.700327673518022, + "grad_norm": 0.0302734375, + "learning_rate": 0.023169193093294814, + "loss": 0.8112, + "num_input_tokens_seen": 30009248, + "step": 51700 + }, + { + "epoch": 7.701072386058981, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02316755787468869, + "loss": 0.7903, + "num_input_tokens_seen": 30011968, + "step": 51705 + }, + { + "epoch": 7.701817098599941, + "grad_norm": 0.0289306640625, + "learning_rate": 0.023165922518099978, + "loss": 0.8225, + "num_input_tokens_seen": 30014816, + "step": 51710 + }, + { + "epoch": 7.7025618111408996, + "grad_norm": 0.016357421875, + "learning_rate": 0.023164287023556298, + "loss": 0.8021, + "num_input_tokens_seen": 30017696, + "step": 51715 + }, + { + "epoch": 7.703306523681858, + "grad_norm": 0.0306396484375, + "learning_rate": 0.023162651391085292, + "loss": 0.797, + "num_input_tokens_seen": 30020608, + "step": 51720 + }, + { + "epoch": 7.704051236222818, + "grad_norm": 0.029052734375, + "learning_rate": 0.023161015620714584, + "loss": 0.8026, + "num_input_tokens_seen": 30023744, + "step": 51725 + }, + { + "epoch": 7.704795948763778, + "grad_norm": 0.0162353515625, + "learning_rate": 0.02315937971247182, + "loss": 0.8006, + "num_input_tokens_seen": 30026880, + "step": 51730 + }, + { + "epoch": 7.705540661304736, + "grad_norm": 0.023193359375, + "learning_rate": 0.02315774366638462, + "loss": 0.8118, + "num_input_tokens_seen": 30029888, + "step": 51735 + }, + { + "epoch": 7.706285373845695, + "grad_norm": 0.017333984375, + "learning_rate": 0.023156107482480634, + "loss": 0.7998, + "num_input_tokens_seen": 30033024, + "step": 51740 + }, + { + "epoch": 7.707030086386655, + "grad_norm": 0.01470947265625, + "learning_rate": 0.023154471160787497, + "loss": 0.8213, + "num_input_tokens_seen": 30035744, + "step": 51745 + }, + { + "epoch": 7.707774798927614, + "grad_norm": 0.01953125, + "learning_rate": 0.023152834701332864, + "loss": 0.7949, + "num_input_tokens_seen": 30038496, + "step": 51750 + }, + { + "epoch": 7.708519511468573, + "grad_norm": 0.02392578125, + "learning_rate": 0.02315119810414437, + "loss": 0.7892, + "num_input_tokens_seen": 30041376, + "step": 51755 + }, + { + "epoch": 7.709264224009532, + "grad_norm": 0.0296630859375, + "learning_rate": 0.02314956136924967, + "loss": 0.7993, + "num_input_tokens_seen": 30044224, + "step": 51760 + }, + { + "epoch": 7.710008936550492, + "grad_norm": 0.0196533203125, + "learning_rate": 0.023147924496676422, + "loss": 0.8025, + "num_input_tokens_seen": 30046816, + "step": 51765 + }, + { + "epoch": 7.71075364909145, + "grad_norm": 0.025634765625, + "learning_rate": 0.02314628748645226, + "loss": 0.7979, + "num_input_tokens_seen": 30049600, + "step": 51770 + }, + { + "epoch": 7.71149836163241, + "grad_norm": 0.017822265625, + "learning_rate": 0.023144650338604855, + "loss": 0.7918, + "num_input_tokens_seen": 30052736, + "step": 51775 + }, + { + "epoch": 7.712243074173369, + "grad_norm": 0.0311279296875, + "learning_rate": 0.02314301305316186, + "loss": 0.7996, + "num_input_tokens_seen": 30055520, + "step": 51780 + }, + { + "epoch": 7.712987786714328, + "grad_norm": 0.025146484375, + "learning_rate": 0.023141375630150937, + "loss": 0.8031, + "num_input_tokens_seen": 30058528, + "step": 51785 + }, + { + "epoch": 7.713732499255287, + "grad_norm": 0.0225830078125, + "learning_rate": 0.023139738069599743, + "loss": 0.8012, + "num_input_tokens_seen": 30061312, + "step": 51790 + }, + { + "epoch": 7.714477211796247, + "grad_norm": 0.0198974609375, + "learning_rate": 0.023138100371535953, + "loss": 0.8213, + "num_input_tokens_seen": 30064320, + "step": 51795 + }, + { + "epoch": 7.7152219243372056, + "grad_norm": 0.0208740234375, + "learning_rate": 0.02313646253598722, + "loss": 0.792, + "num_input_tokens_seen": 30067168, + "step": 51800 + }, + { + "epoch": 7.715966636878165, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02313482456298123, + "loss": 0.8118, + "num_input_tokens_seen": 30070144, + "step": 51805 + }, + { + "epoch": 7.716711349419124, + "grad_norm": 0.034423828125, + "learning_rate": 0.02313318645254564, + "loss": 0.7691, + "num_input_tokens_seen": 30073088, + "step": 51810 + }, + { + "epoch": 7.717456061960084, + "grad_norm": 0.0255126953125, + "learning_rate": 0.023131548204708138, + "loss": 0.8079, + "num_input_tokens_seen": 30075872, + "step": 51815 + }, + { + "epoch": 7.718200774501042, + "grad_norm": 0.0260009765625, + "learning_rate": 0.02312990981949639, + "loss": 0.8064, + "num_input_tokens_seen": 30078816, + "step": 51820 + }, + { + "epoch": 7.718945487042002, + "grad_norm": 0.029541015625, + "learning_rate": 0.023128271296938077, + "loss": 0.7826, + "num_input_tokens_seen": 30081472, + "step": 51825 + }, + { + "epoch": 7.719690199582961, + "grad_norm": 0.02685546875, + "learning_rate": 0.023126632637060888, + "loss": 0.7916, + "num_input_tokens_seen": 30084352, + "step": 51830 + }, + { + "epoch": 7.72043491212392, + "grad_norm": 0.019775390625, + "learning_rate": 0.023124993839892492, + "loss": 0.7978, + "num_input_tokens_seen": 30087296, + "step": 51835 + }, + { + "epoch": 7.721179624664879, + "grad_norm": 0.0306396484375, + "learning_rate": 0.02312335490546059, + "loss": 0.8062, + "num_input_tokens_seen": 30090304, + "step": 51840 + }, + { + "epoch": 7.721924337205839, + "grad_norm": 0.0240478515625, + "learning_rate": 0.02312171583379286, + "loss": 0.8035, + "num_input_tokens_seen": 30092928, + "step": 51845 + }, + { + "epoch": 7.722669049746798, + "grad_norm": 0.035888671875, + "learning_rate": 0.023120076624916996, + "loss": 0.8111, + "num_input_tokens_seen": 30096032, + "step": 51850 + }, + { + "epoch": 7.723413762287757, + "grad_norm": 0.025146484375, + "learning_rate": 0.02311843727886069, + "loss": 0.8334, + "num_input_tokens_seen": 30098592, + "step": 51855 + }, + { + "epoch": 7.724158474828716, + "grad_norm": 0.01904296875, + "learning_rate": 0.023116797795651636, + "loss": 0.8009, + "num_input_tokens_seen": 30101632, + "step": 51860 + }, + { + "epoch": 7.724903187369676, + "grad_norm": 0.0296630859375, + "learning_rate": 0.023115158175317535, + "loss": 0.8014, + "num_input_tokens_seen": 30104608, + "step": 51865 + }, + { + "epoch": 7.725647899910634, + "grad_norm": 0.0224609375, + "learning_rate": 0.023113518417886083, + "loss": 0.8006, + "num_input_tokens_seen": 30107200, + "step": 51870 + }, + { + "epoch": 7.726392612451594, + "grad_norm": 0.017578125, + "learning_rate": 0.023111878523384984, + "loss": 0.7984, + "num_input_tokens_seen": 30110112, + "step": 51875 + }, + { + "epoch": 7.727137324992553, + "grad_norm": 0.031982421875, + "learning_rate": 0.02311023849184194, + "loss": 0.8035, + "num_input_tokens_seen": 30112928, + "step": 51880 + }, + { + "epoch": 7.727882037533512, + "grad_norm": 0.016845703125, + "learning_rate": 0.023108598323284664, + "loss": 0.7826, + "num_input_tokens_seen": 30116096, + "step": 51885 + }, + { + "epoch": 7.728626750074471, + "grad_norm": 0.02490234375, + "learning_rate": 0.023106958017740858, + "loss": 0.8138, + "num_input_tokens_seen": 30119552, + "step": 51890 + }, + { + "epoch": 7.729371462615431, + "grad_norm": 0.0255126953125, + "learning_rate": 0.023105317575238232, + "loss": 0.7994, + "num_input_tokens_seen": 30122336, + "step": 51895 + }, + { + "epoch": 7.73011617515639, + "grad_norm": 0.0283203125, + "learning_rate": 0.02310367699580451, + "loss": 0.8094, + "num_input_tokens_seen": 30125536, + "step": 51900 + }, + { + "epoch": 7.730860887697348, + "grad_norm": 0.032958984375, + "learning_rate": 0.023102036279467397, + "loss": 0.8004, + "num_input_tokens_seen": 30128352, + "step": 51905 + }, + { + "epoch": 7.731605600238308, + "grad_norm": 0.0211181640625, + "learning_rate": 0.02310039542625462, + "loss": 0.7869, + "num_input_tokens_seen": 30131072, + "step": 51910 + }, + { + "epoch": 7.732350312779268, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02309875443619389, + "loss": 0.7908, + "num_input_tokens_seen": 30133856, + "step": 51915 + }, + { + "epoch": 7.733095025320226, + "grad_norm": 0.0303955078125, + "learning_rate": 0.02309711330931294, + "loss": 0.8134, + "num_input_tokens_seen": 30136864, + "step": 51920 + }, + { + "epoch": 7.733839737861185, + "grad_norm": 0.02880859375, + "learning_rate": 0.023095472045639484, + "loss": 0.8066, + "num_input_tokens_seen": 30139808, + "step": 51925 + }, + { + "epoch": 7.734584450402145, + "grad_norm": 0.029052734375, + "learning_rate": 0.02309383064520126, + "loss": 0.8019, + "num_input_tokens_seen": 30142624, + "step": 51930 + }, + { + "epoch": 7.735329162943104, + "grad_norm": 0.0213623046875, + "learning_rate": 0.023092189108025993, + "loss": 0.8028, + "num_input_tokens_seen": 30145472, + "step": 51935 + }, + { + "epoch": 7.736073875484063, + "grad_norm": 0.037109375, + "learning_rate": 0.023090547434141413, + "loss": 0.817, + "num_input_tokens_seen": 30148032, + "step": 51940 + }, + { + "epoch": 7.736818588025022, + "grad_norm": 0.01544189453125, + "learning_rate": 0.02308890562357526, + "loss": 0.7976, + "num_input_tokens_seen": 30150848, + "step": 51945 + }, + { + "epoch": 7.737563300565982, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02308726367635527, + "loss": 0.8063, + "num_input_tokens_seen": 30153728, + "step": 51950 + }, + { + "epoch": 7.73830801310694, + "grad_norm": 0.0218505859375, + "learning_rate": 0.023085621592509174, + "loss": 0.7879, + "num_input_tokens_seen": 30156512, + "step": 51955 + }, + { + "epoch": 7.7390527256479, + "grad_norm": 0.013427734375, + "learning_rate": 0.02308397937206472, + "loss": 0.8068, + "num_input_tokens_seen": 30159360, + "step": 51960 + }, + { + "epoch": 7.739797438188859, + "grad_norm": 0.037841796875, + "learning_rate": 0.023082337015049654, + "loss": 0.8175, + "num_input_tokens_seen": 30162208, + "step": 51965 + }, + { + "epoch": 7.740542150729818, + "grad_norm": 0.02099609375, + "learning_rate": 0.023080694521491713, + "loss": 0.8123, + "num_input_tokens_seen": 30165216, + "step": 51970 + }, + { + "epoch": 7.741286863270777, + "grad_norm": 0.030517578125, + "learning_rate": 0.023079051891418657, + "loss": 0.7945, + "num_input_tokens_seen": 30168256, + "step": 51975 + }, + { + "epoch": 7.742031575811737, + "grad_norm": 0.031982421875, + "learning_rate": 0.023077409124858235, + "loss": 0.8003, + "num_input_tokens_seen": 30171104, + "step": 51980 + }, + { + "epoch": 7.742776288352696, + "grad_norm": 0.0257568359375, + "learning_rate": 0.023075766221838188, + "loss": 0.8023, + "num_input_tokens_seen": 30174112, + "step": 51985 + }, + { + "epoch": 7.743521000893655, + "grad_norm": 0.0213623046875, + "learning_rate": 0.023074123182386275, + "loss": 0.7979, + "num_input_tokens_seen": 30176928, + "step": 51990 + }, + { + "epoch": 7.744265713434614, + "grad_norm": 0.03857421875, + "learning_rate": 0.02307248000653026, + "loss": 0.8126, + "num_input_tokens_seen": 30179872, + "step": 51995 + }, + { + "epoch": 7.745010425975574, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0230708366942979, + "loss": 0.8123, + "num_input_tokens_seen": 30182912, + "step": 52000 + }, + { + "epoch": 7.745755138516532, + "grad_norm": 0.028564453125, + "learning_rate": 0.023069193245716962, + "loss": 0.8017, + "num_input_tokens_seen": 30186272, + "step": 52005 + }, + { + "epoch": 7.746499851057492, + "grad_norm": 0.0244140625, + "learning_rate": 0.0230675496608152, + "loss": 0.8245, + "num_input_tokens_seen": 30189152, + "step": 52010 + }, + { + "epoch": 7.747244563598451, + "grad_norm": 0.0252685546875, + "learning_rate": 0.02306590593962039, + "loss": 0.7928, + "num_input_tokens_seen": 30192128, + "step": 52015 + }, + { + "epoch": 7.7479892761394105, + "grad_norm": 0.03369140625, + "learning_rate": 0.02306426208216029, + "loss": 0.7995, + "num_input_tokens_seen": 30194848, + "step": 52020 + }, + { + "epoch": 7.748733988680369, + "grad_norm": 0.02392578125, + "learning_rate": 0.02306261808846268, + "loss": 0.7929, + "num_input_tokens_seen": 30197760, + "step": 52025 + }, + { + "epoch": 7.749478701221329, + "grad_norm": 0.020751953125, + "learning_rate": 0.023060973958555338, + "loss": 0.7959, + "num_input_tokens_seen": 30200704, + "step": 52030 + }, + { + "epoch": 7.750223413762288, + "grad_norm": 0.024169921875, + "learning_rate": 0.02305932969246603, + "loss": 0.8069, + "num_input_tokens_seen": 30203904, + "step": 52035 + }, + { + "epoch": 7.750968126303247, + "grad_norm": 0.0299072265625, + "learning_rate": 0.02305768529022254, + "loss": 0.7856, + "num_input_tokens_seen": 30207104, + "step": 52040 + }, + { + "epoch": 7.751712838844206, + "grad_norm": 0.0242919921875, + "learning_rate": 0.023056040751852643, + "loss": 0.7975, + "num_input_tokens_seen": 30209888, + "step": 52045 + }, + { + "epoch": 7.752457551385166, + "grad_norm": 0.0235595703125, + "learning_rate": 0.023054396077384125, + "loss": 0.8193, + "num_input_tokens_seen": 30212672, + "step": 52050 + }, + { + "epoch": 7.753202263926124, + "grad_norm": 0.0238037109375, + "learning_rate": 0.023052751266844775, + "loss": 0.8013, + "num_input_tokens_seen": 30215616, + "step": 52055 + }, + { + "epoch": 7.753946976467084, + "grad_norm": 0.026123046875, + "learning_rate": 0.023051106320262374, + "loss": 0.7935, + "num_input_tokens_seen": 30219456, + "step": 52060 + }, + { + "epoch": 7.754691689008043, + "grad_norm": 0.034912109375, + "learning_rate": 0.023049461237664715, + "loss": 0.7978, + "num_input_tokens_seen": 30222656, + "step": 52065 + }, + { + "epoch": 7.755436401549002, + "grad_norm": 0.0166015625, + "learning_rate": 0.02304781601907959, + "loss": 0.8234, + "num_input_tokens_seen": 30225504, + "step": 52070 + }, + { + "epoch": 7.756181114089961, + "grad_norm": 0.0147705078125, + "learning_rate": 0.023046170664534796, + "loss": 0.8154, + "num_input_tokens_seen": 30228256, + "step": 52075 + }, + { + "epoch": 7.756925826630921, + "grad_norm": 0.0223388671875, + "learning_rate": 0.023044525174058122, + "loss": 0.7985, + "num_input_tokens_seen": 30231200, + "step": 52080 + }, + { + "epoch": 7.75767053917188, + "grad_norm": 0.038330078125, + "learning_rate": 0.02304287954767737, + "loss": 0.7943, + "num_input_tokens_seen": 30234272, + "step": 52085 + }, + { + "epoch": 7.758415251712838, + "grad_norm": 0.0245361328125, + "learning_rate": 0.023041233785420345, + "loss": 0.8041, + "num_input_tokens_seen": 30236992, + "step": 52090 + }, + { + "epoch": 7.759159964253798, + "grad_norm": 0.46875, + "learning_rate": 0.02303958788731485, + "loss": 0.7793, + "num_input_tokens_seen": 30240064, + "step": 52095 + }, + { + "epoch": 7.759904676794758, + "grad_norm": 0.016845703125, + "learning_rate": 0.02303794185338869, + "loss": 0.7804, + "num_input_tokens_seen": 30243424, + "step": 52100 + }, + { + "epoch": 7.7606493893357165, + "grad_norm": 0.023681640625, + "learning_rate": 0.02303629568366966, + "loss": 0.7959, + "num_input_tokens_seen": 30246048, + "step": 52105 + }, + { + "epoch": 7.761394101876675, + "grad_norm": 0.050537109375, + "learning_rate": 0.023034649378185598, + "loss": 0.8089, + "num_input_tokens_seen": 30248832, + "step": 52110 + }, + { + "epoch": 7.762138814417635, + "grad_norm": 0.02197265625, + "learning_rate": 0.02303300293696429, + "loss": 0.8183, + "num_input_tokens_seen": 30251552, + "step": 52115 + }, + { + "epoch": 7.762883526958594, + "grad_norm": 0.043212890625, + "learning_rate": 0.02303135636003356, + "loss": 0.8257, + "num_input_tokens_seen": 30254304, + "step": 52120 + }, + { + "epoch": 7.763628239499553, + "grad_norm": 0.019287109375, + "learning_rate": 0.023029709647421234, + "loss": 0.8107, + "num_input_tokens_seen": 30257312, + "step": 52125 + }, + { + "epoch": 7.764372952040512, + "grad_norm": 0.033447265625, + "learning_rate": 0.023028062799155122, + "loss": 0.8115, + "num_input_tokens_seen": 30260160, + "step": 52130 + }, + { + "epoch": 7.765117664581472, + "grad_norm": 0.06298828125, + "learning_rate": 0.023026415815263053, + "loss": 0.8286, + "num_input_tokens_seen": 30263200, + "step": 52135 + }, + { + "epoch": 7.76586237712243, + "grad_norm": 0.0191650390625, + "learning_rate": 0.023024768695772843, + "loss": 0.7944, + "num_input_tokens_seen": 30266048, + "step": 52140 + }, + { + "epoch": 7.76660708966339, + "grad_norm": 0.033935546875, + "learning_rate": 0.02302312144071232, + "loss": 0.8002, + "num_input_tokens_seen": 30269024, + "step": 52145 + }, + { + "epoch": 7.767351802204349, + "grad_norm": 0.03515625, + "learning_rate": 0.023021474050109316, + "loss": 0.8043, + "num_input_tokens_seen": 30271904, + "step": 52150 + }, + { + "epoch": 7.7680965147453085, + "grad_norm": 0.0235595703125, + "learning_rate": 0.02301982652399166, + "loss": 0.7745, + "num_input_tokens_seen": 30274752, + "step": 52155 + }, + { + "epoch": 7.768841227286267, + "grad_norm": 0.0206298828125, + "learning_rate": 0.023018178862387186, + "loss": 0.8124, + "num_input_tokens_seen": 30277696, + "step": 52160 + }, + { + "epoch": 7.769585939827227, + "grad_norm": 0.03173828125, + "learning_rate": 0.02301653106532373, + "loss": 0.8295, + "num_input_tokens_seen": 30280832, + "step": 52165 + }, + { + "epoch": 7.770330652368186, + "grad_norm": 0.0242919921875, + "learning_rate": 0.02301488313282913, + "loss": 0.7899, + "num_input_tokens_seen": 30283872, + "step": 52170 + }, + { + "epoch": 7.771075364909145, + "grad_norm": 0.01171875, + "learning_rate": 0.023013235064931224, + "loss": 0.8286, + "num_input_tokens_seen": 30286944, + "step": 52175 + }, + { + "epoch": 7.771820077450104, + "grad_norm": 0.0281982421875, + "learning_rate": 0.02301158686165786, + "loss": 0.8054, + "num_input_tokens_seen": 30289856, + "step": 52180 + }, + { + "epoch": 7.772564789991064, + "grad_norm": 0.0255126953125, + "learning_rate": 0.02300993852303687, + "loss": 0.8166, + "num_input_tokens_seen": 30292736, + "step": 52185 + }, + { + "epoch": 7.7733095025320225, + "grad_norm": 0.0201416015625, + "learning_rate": 0.023008290049096117, + "loss": 0.8119, + "num_input_tokens_seen": 30295520, + "step": 52190 + }, + { + "epoch": 7.774054215072982, + "grad_norm": 0.02978515625, + "learning_rate": 0.023006641439863442, + "loss": 0.7912, + "num_input_tokens_seen": 30298976, + "step": 52195 + }, + { + "epoch": 7.774798927613941, + "grad_norm": 0.0162353515625, + "learning_rate": 0.023004992695366693, + "loss": 0.8113, + "num_input_tokens_seen": 30301792, + "step": 52200 + }, + { + "epoch": 7.7755436401549005, + "grad_norm": 0.0135498046875, + "learning_rate": 0.023003343815633733, + "loss": 0.802, + "num_input_tokens_seen": 30304416, + "step": 52205 + }, + { + "epoch": 7.776288352695859, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02300169480069241, + "loss": 0.8023, + "num_input_tokens_seen": 30307552, + "step": 52210 + }, + { + "epoch": 7.777033065236819, + "grad_norm": 0.026123046875, + "learning_rate": 0.02300004565057059, + "loss": 0.8089, + "num_input_tokens_seen": 30310560, + "step": 52215 + }, + { + "epoch": 7.777777777777778, + "grad_norm": 0.0269775390625, + "learning_rate": 0.022998396365296127, + "loss": 0.823, + "num_input_tokens_seen": 30313472, + "step": 52220 + }, + { + "epoch": 7.778522490318737, + "grad_norm": 0.01287841796875, + "learning_rate": 0.022996746944896885, + "loss": 0.8019, + "num_input_tokens_seen": 30316480, + "step": 52225 + }, + { + "epoch": 7.779267202859696, + "grad_norm": 0.0302734375, + "learning_rate": 0.022995097389400736, + "loss": 0.8095, + "num_input_tokens_seen": 30319776, + "step": 52230 + }, + { + "epoch": 7.780011915400656, + "grad_norm": 0.0211181640625, + "learning_rate": 0.022993447698835535, + "loss": 0.8113, + "num_input_tokens_seen": 30322752, + "step": 52235 + }, + { + "epoch": 7.7807566279416145, + "grad_norm": 0.0244140625, + "learning_rate": 0.022991797873229164, + "loss": 0.7991, + "num_input_tokens_seen": 30325472, + "step": 52240 + }, + { + "epoch": 7.781501340482574, + "grad_norm": 0.0140380859375, + "learning_rate": 0.022990147912609494, + "loss": 0.8045, + "num_input_tokens_seen": 30328320, + "step": 52245 + }, + { + "epoch": 7.782246053023533, + "grad_norm": 0.034423828125, + "learning_rate": 0.022988497817004388, + "loss": 0.8085, + "num_input_tokens_seen": 30331168, + "step": 52250 + }, + { + "epoch": 7.782990765564492, + "grad_norm": 0.0228271484375, + "learning_rate": 0.022986847586441737, + "loss": 0.8017, + "num_input_tokens_seen": 30333856, + "step": 52255 + }, + { + "epoch": 7.783735478105451, + "grad_norm": 0.0274658203125, + "learning_rate": 0.02298519722094941, + "loss": 0.7811, + "num_input_tokens_seen": 30336640, + "step": 52260 + }, + { + "epoch": 7.784480190646411, + "grad_norm": 0.0235595703125, + "learning_rate": 0.022983546720555293, + "loss": 0.784, + "num_input_tokens_seen": 30339968, + "step": 52265 + }, + { + "epoch": 7.78522490318737, + "grad_norm": 0.01953125, + "learning_rate": 0.022981896085287265, + "loss": 0.8017, + "num_input_tokens_seen": 30342688, + "step": 52270 + }, + { + "epoch": 7.7859696157283285, + "grad_norm": 0.01544189453125, + "learning_rate": 0.02298024531517322, + "loss": 0.7751, + "num_input_tokens_seen": 30345856, + "step": 52275 + }, + { + "epoch": 7.786714328269288, + "grad_norm": 0.01312255859375, + "learning_rate": 0.022978594410241037, + "loss": 0.7995, + "num_input_tokens_seen": 30348896, + "step": 52280 + }, + { + "epoch": 7.787459040810247, + "grad_norm": 0.0189208984375, + "learning_rate": 0.022976943370518615, + "loss": 0.7966, + "num_input_tokens_seen": 30351872, + "step": 52285 + }, + { + "epoch": 7.7882037533512065, + "grad_norm": 0.0218505859375, + "learning_rate": 0.022975292196033836, + "loss": 0.7827, + "num_input_tokens_seen": 30354784, + "step": 52290 + }, + { + "epoch": 7.788948465892165, + "grad_norm": 0.0242919921875, + "learning_rate": 0.022973640886814608, + "loss": 0.7974, + "num_input_tokens_seen": 30357600, + "step": 52295 + }, + { + "epoch": 7.789693178433125, + "grad_norm": 0.0223388671875, + "learning_rate": 0.022971989442888818, + "loss": 0.8035, + "num_input_tokens_seen": 30360384, + "step": 52300 + }, + { + "epoch": 7.790437890974084, + "grad_norm": 0.0198974609375, + "learning_rate": 0.022970337864284367, + "loss": 0.7926, + "num_input_tokens_seen": 30363008, + "step": 52305 + }, + { + "epoch": 7.791182603515043, + "grad_norm": 0.0242919921875, + "learning_rate": 0.022968686151029163, + "loss": 0.7975, + "num_input_tokens_seen": 30366016, + "step": 52310 + }, + { + "epoch": 7.791927316056002, + "grad_norm": 0.021484375, + "learning_rate": 0.022967034303151102, + "loss": 0.787, + "num_input_tokens_seen": 30368928, + "step": 52315 + }, + { + "epoch": 7.792672028596962, + "grad_norm": 0.0296630859375, + "learning_rate": 0.022965382320678095, + "loss": 0.7823, + "num_input_tokens_seen": 30372224, + "step": 52320 + }, + { + "epoch": 7.7934167411379205, + "grad_norm": 0.04150390625, + "learning_rate": 0.022963730203638046, + "loss": 0.8224, + "num_input_tokens_seen": 30374880, + "step": 52325 + }, + { + "epoch": 7.79416145367888, + "grad_norm": 0.0284423828125, + "learning_rate": 0.022962077952058874, + "loss": 0.7983, + "num_input_tokens_seen": 30377824, + "step": 52330 + }, + { + "epoch": 7.794906166219839, + "grad_norm": 0.0164794921875, + "learning_rate": 0.022960425565968487, + "loss": 0.7831, + "num_input_tokens_seen": 30380896, + "step": 52335 + }, + { + "epoch": 7.7956508787607985, + "grad_norm": 0.0234375, + "learning_rate": 0.0229587730453948, + "loss": 0.8135, + "num_input_tokens_seen": 30383616, + "step": 52340 + }, + { + "epoch": 7.796395591301757, + "grad_norm": 0.0272216796875, + "learning_rate": 0.02295712039036573, + "loss": 0.8284, + "num_input_tokens_seen": 30386624, + "step": 52345 + }, + { + "epoch": 7.797140303842717, + "grad_norm": 0.03076171875, + "learning_rate": 0.022955467600909194, + "loss": 0.7862, + "num_input_tokens_seen": 30389728, + "step": 52350 + }, + { + "epoch": 7.797885016383676, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02295381467705313, + "loss": 0.8092, + "num_input_tokens_seen": 30392608, + "step": 52355 + }, + { + "epoch": 7.798629728924635, + "grad_norm": 0.0263671875, + "learning_rate": 0.022952161618825438, + "loss": 0.8059, + "num_input_tokens_seen": 30395712, + "step": 52360 + }, + { + "epoch": 7.799374441465594, + "grad_norm": 0.02099609375, + "learning_rate": 0.022950508426254065, + "loss": 0.784, + "num_input_tokens_seen": 30398560, + "step": 52365 + }, + { + "epoch": 7.800119154006554, + "grad_norm": 0.028564453125, + "learning_rate": 0.022948855099366932, + "loss": 0.7691, + "num_input_tokens_seen": 30401280, + "step": 52370 + }, + { + "epoch": 7.8008638665475125, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02294720163819197, + "loss": 0.7903, + "num_input_tokens_seen": 30403840, + "step": 52375 + }, + { + "epoch": 7.801608579088472, + "grad_norm": 0.0244140625, + "learning_rate": 0.022945548042757117, + "loss": 0.7863, + "num_input_tokens_seen": 30406848, + "step": 52380 + }, + { + "epoch": 7.802353291629431, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0229438943130903, + "loss": 0.7978, + "num_input_tokens_seen": 30409600, + "step": 52385 + }, + { + "epoch": 7.803098004170391, + "grad_norm": 0.017578125, + "learning_rate": 0.022942240449219466, + "loss": 0.8065, + "num_input_tokens_seen": 30412416, + "step": 52390 + }, + { + "epoch": 7.803842716711349, + "grad_norm": 0.024658203125, + "learning_rate": 0.02294058645117255, + "loss": 0.7912, + "num_input_tokens_seen": 30415200, + "step": 52395 + }, + { + "epoch": 7.804587429252309, + "grad_norm": 0.0419921875, + "learning_rate": 0.022938932318977492, + "loss": 0.8125, + "num_input_tokens_seen": 30418080, + "step": 52400 + }, + { + "epoch": 7.805332141793268, + "grad_norm": 0.026123046875, + "learning_rate": 0.022937278052662248, + "loss": 0.8, + "num_input_tokens_seen": 30420896, + "step": 52405 + }, + { + "epoch": 7.806076854334227, + "grad_norm": 0.0263671875, + "learning_rate": 0.022935623652254756, + "loss": 0.8135, + "num_input_tokens_seen": 30423904, + "step": 52410 + }, + { + "epoch": 7.806821566875186, + "grad_norm": 0.03369140625, + "learning_rate": 0.02293396911778297, + "loss": 0.8067, + "num_input_tokens_seen": 30426720, + "step": 52415 + }, + { + "epoch": 7.807566279416145, + "grad_norm": 0.0164794921875, + "learning_rate": 0.02293231444927484, + "loss": 0.785, + "num_input_tokens_seen": 30429504, + "step": 52420 + }, + { + "epoch": 7.8083109919571045, + "grad_norm": 0.033447265625, + "learning_rate": 0.022930659646758316, + "loss": 0.796, + "num_input_tokens_seen": 30432640, + "step": 52425 + }, + { + "epoch": 7.809055704498064, + "grad_norm": 0.0322265625, + "learning_rate": 0.022929004710261356, + "loss": 0.7857, + "num_input_tokens_seen": 30435520, + "step": 52430 + }, + { + "epoch": 7.809800417039023, + "grad_norm": 0.03466796875, + "learning_rate": 0.022927349639811922, + "loss": 0.803, + "num_input_tokens_seen": 30438272, + "step": 52435 + }, + { + "epoch": 7.810545129579982, + "grad_norm": 0.0223388671875, + "learning_rate": 0.022925694435437977, + "loss": 0.8008, + "num_input_tokens_seen": 30441312, + "step": 52440 + }, + { + "epoch": 7.811289842120941, + "grad_norm": 0.0235595703125, + "learning_rate": 0.022924039097167476, + "loss": 0.8128, + "num_input_tokens_seen": 30444384, + "step": 52445 + }, + { + "epoch": 7.812034554661901, + "grad_norm": 0.023681640625, + "learning_rate": 0.022922383625028384, + "loss": 0.7844, + "num_input_tokens_seen": 30447296, + "step": 52450 + }, + { + "epoch": 7.81277926720286, + "grad_norm": 0.02734375, + "learning_rate": 0.022920728019048677, + "loss": 0.7839, + "num_input_tokens_seen": 30450624, + "step": 52455 + }, + { + "epoch": 7.8135239797438185, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02291907227925632, + "loss": 0.7739, + "num_input_tokens_seen": 30453248, + "step": 52460 + }, + { + "epoch": 7.814268692284778, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02291741640567929, + "loss": 0.7574, + "num_input_tokens_seen": 30456096, + "step": 52465 + }, + { + "epoch": 7.815013404825737, + "grad_norm": 0.038330078125, + "learning_rate": 0.02291576039834555, + "loss": 0.8414, + "num_input_tokens_seen": 30459008, + "step": 52470 + }, + { + "epoch": 7.815758117366697, + "grad_norm": 0.0185546875, + "learning_rate": 0.02291410425728309, + "loss": 0.7804, + "num_input_tokens_seen": 30461824, + "step": 52475 + }, + { + "epoch": 7.816502829907655, + "grad_norm": 0.0269775390625, + "learning_rate": 0.022912447982519873, + "loss": 0.7818, + "num_input_tokens_seen": 30464704, + "step": 52480 + }, + { + "epoch": 7.817247542448615, + "grad_norm": 0.0240478515625, + "learning_rate": 0.022910791574083895, + "loss": 0.8036, + "num_input_tokens_seen": 30467584, + "step": 52485 + }, + { + "epoch": 7.817992254989574, + "grad_norm": 0.031982421875, + "learning_rate": 0.022909135032003133, + "loss": 0.8203, + "num_input_tokens_seen": 30470560, + "step": 52490 + }, + { + "epoch": 7.818736967530533, + "grad_norm": 0.0291748046875, + "learning_rate": 0.022907478356305574, + "loss": 0.775, + "num_input_tokens_seen": 30473568, + "step": 52495 + }, + { + "epoch": 7.819481680071492, + "grad_norm": 0.018798828125, + "learning_rate": 0.0229058215470192, + "loss": 0.7684, + "num_input_tokens_seen": 30476384, + "step": 52500 + }, + { + "epoch": 7.820226392612452, + "grad_norm": 0.025634765625, + "learning_rate": 0.022904164604172016, + "loss": 0.827, + "num_input_tokens_seen": 30479360, + "step": 52505 + }, + { + "epoch": 7.8209711051534105, + "grad_norm": 0.02783203125, + "learning_rate": 0.022902507527791997, + "loss": 0.8111, + "num_input_tokens_seen": 30482400, + "step": 52510 + }, + { + "epoch": 7.82171581769437, + "grad_norm": 0.0184326171875, + "learning_rate": 0.02290085031790715, + "loss": 0.7772, + "num_input_tokens_seen": 30485344, + "step": 52515 + }, + { + "epoch": 7.822460530235329, + "grad_norm": 0.0206298828125, + "learning_rate": 0.022899192974545457, + "loss": 0.8095, + "num_input_tokens_seen": 30488384, + "step": 52520 + }, + { + "epoch": 7.823205242776289, + "grad_norm": 0.02001953125, + "learning_rate": 0.022897535497734933, + "loss": 0.7898, + "num_input_tokens_seen": 30491424, + "step": 52525 + }, + { + "epoch": 7.823949955317247, + "grad_norm": 0.0255126953125, + "learning_rate": 0.022895877887503573, + "loss": 0.7552, + "num_input_tokens_seen": 30494304, + "step": 52530 + }, + { + "epoch": 7.824694667858207, + "grad_norm": 0.033935546875, + "learning_rate": 0.022894220143879384, + "loss": 0.7861, + "num_input_tokens_seen": 30497312, + "step": 52535 + }, + { + "epoch": 7.825439380399166, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02289256226689037, + "loss": 0.7711, + "num_input_tokens_seen": 30500160, + "step": 52540 + }, + { + "epoch": 7.826184092940125, + "grad_norm": 0.03466796875, + "learning_rate": 0.022890904256564537, + "loss": 0.8249, + "num_input_tokens_seen": 30502944, + "step": 52545 + }, + { + "epoch": 7.826928805481084, + "grad_norm": 0.026123046875, + "learning_rate": 0.022889246112929893, + "loss": 0.8088, + "num_input_tokens_seen": 30505824, + "step": 52550 + }, + { + "epoch": 7.827673518022044, + "grad_norm": 0.034912109375, + "learning_rate": 0.022887587836014456, + "loss": 0.7998, + "num_input_tokens_seen": 30508832, + "step": 52555 + }, + { + "epoch": 7.828418230563003, + "grad_norm": 0.0322265625, + "learning_rate": 0.022885929425846236, + "loss": 0.8014, + "num_input_tokens_seen": 30511776, + "step": 52560 + }, + { + "epoch": 7.829162943103962, + "grad_norm": 0.034912109375, + "learning_rate": 0.022884270882453255, + "loss": 0.7989, + "num_input_tokens_seen": 30514592, + "step": 52565 + }, + { + "epoch": 7.829907655644921, + "grad_norm": 0.0289306640625, + "learning_rate": 0.022882612205863535, + "loss": 0.7747, + "num_input_tokens_seen": 30517504, + "step": 52570 + }, + { + "epoch": 7.830652368185881, + "grad_norm": 0.026123046875, + "learning_rate": 0.02288095339610509, + "loss": 0.82, + "num_input_tokens_seen": 30520096, + "step": 52575 + }, + { + "epoch": 7.831397080726839, + "grad_norm": 0.033203125, + "learning_rate": 0.02287929445320595, + "loss": 0.7811, + "num_input_tokens_seen": 30523264, + "step": 52580 + }, + { + "epoch": 7.832141793267798, + "grad_norm": 0.017578125, + "learning_rate": 0.022877635377194137, + "loss": 0.7777, + "num_input_tokens_seen": 30526048, + "step": 52585 + }, + { + "epoch": 7.832886505808758, + "grad_norm": 0.025634765625, + "learning_rate": 0.022875976168097674, + "loss": 0.8235, + "num_input_tokens_seen": 30529088, + "step": 52590 + }, + { + "epoch": 7.833631218349717, + "grad_norm": 0.025146484375, + "learning_rate": 0.022874316825944603, + "loss": 0.7733, + "num_input_tokens_seen": 30531936, + "step": 52595 + }, + { + "epoch": 7.834375930890676, + "grad_norm": 0.022216796875, + "learning_rate": 0.022872657350762955, + "loss": 0.8009, + "num_input_tokens_seen": 30534912, + "step": 52600 + }, + { + "epoch": 7.835120643431635, + "grad_norm": 0.0233154296875, + "learning_rate": 0.02287099774258076, + "loss": 0.8553, + "num_input_tokens_seen": 30538944, + "step": 52605 + }, + { + "epoch": 7.835865355972595, + "grad_norm": 0.0245361328125, + "learning_rate": 0.02286933800142606, + "loss": 0.7957, + "num_input_tokens_seen": 30541888, + "step": 52610 + }, + { + "epoch": 7.836610068513554, + "grad_norm": 0.0242919921875, + "learning_rate": 0.02286767812732689, + "loss": 0.8074, + "num_input_tokens_seen": 30544576, + "step": 52615 + }, + { + "epoch": 7.837354781054513, + "grad_norm": 0.0218505859375, + "learning_rate": 0.022866018120311297, + "loss": 0.7749, + "num_input_tokens_seen": 30547360, + "step": 52620 + }, + { + "epoch": 7.838099493595472, + "grad_norm": 0.024658203125, + "learning_rate": 0.022864357980407323, + "loss": 0.8331, + "num_input_tokens_seen": 30550816, + "step": 52625 + }, + { + "epoch": 7.838844206136431, + "grad_norm": 0.022705078125, + "learning_rate": 0.022862697707643008, + "loss": 0.8259, + "num_input_tokens_seen": 30553536, + "step": 52630 + }, + { + "epoch": 7.83958891867739, + "grad_norm": 0.036865234375, + "learning_rate": 0.02286103730204641, + "loss": 0.8143, + "num_input_tokens_seen": 30556032, + "step": 52635 + }, + { + "epoch": 7.84033363121835, + "grad_norm": 0.01495361328125, + "learning_rate": 0.02285937676364558, + "loss": 0.8142, + "num_input_tokens_seen": 30558784, + "step": 52640 + }, + { + "epoch": 7.841078343759309, + "grad_norm": 0.0250244140625, + "learning_rate": 0.02285771609246856, + "loss": 0.8204, + "num_input_tokens_seen": 30561920, + "step": 52645 + }, + { + "epoch": 7.841823056300268, + "grad_norm": 0.01953125, + "learning_rate": 0.02285605528854342, + "loss": 0.8011, + "num_input_tokens_seen": 30564800, + "step": 52650 + }, + { + "epoch": 7.842567768841227, + "grad_norm": 0.0194091796875, + "learning_rate": 0.022854394351898205, + "loss": 0.796, + "num_input_tokens_seen": 30567968, + "step": 52655 + }, + { + "epoch": 7.843312481382187, + "grad_norm": 0.029296875, + "learning_rate": 0.022852733282560985, + "loss": 0.7698, + "num_input_tokens_seen": 30570592, + "step": 52660 + }, + { + "epoch": 7.844057193923145, + "grad_norm": 0.019287109375, + "learning_rate": 0.022851072080559816, + "loss": 0.812, + "num_input_tokens_seen": 30573440, + "step": 52665 + }, + { + "epoch": 7.844801906464105, + "grad_norm": 0.021484375, + "learning_rate": 0.02284941074592276, + "loss": 0.7763, + "num_input_tokens_seen": 30576128, + "step": 52670 + }, + { + "epoch": 7.845546619005064, + "grad_norm": 0.0194091796875, + "learning_rate": 0.022847749278677894, + "loss": 0.8267, + "num_input_tokens_seen": 30579072, + "step": 52675 + }, + { + "epoch": 7.846291331546023, + "grad_norm": 0.021240234375, + "learning_rate": 0.022846087678853277, + "loss": 0.816, + "num_input_tokens_seen": 30582624, + "step": 52680 + }, + { + "epoch": 7.847036044086982, + "grad_norm": 0.0240478515625, + "learning_rate": 0.022844425946476978, + "loss": 0.7793, + "num_input_tokens_seen": 30585344, + "step": 52685 + }, + { + "epoch": 7.847780756627942, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02284276408157708, + "loss": 0.7982, + "num_input_tokens_seen": 30587968, + "step": 52690 + }, + { + "epoch": 7.848525469168901, + "grad_norm": 0.02197265625, + "learning_rate": 0.022841102084181653, + "loss": 0.8185, + "num_input_tokens_seen": 30590976, + "step": 52695 + }, + { + "epoch": 7.84927018170986, + "grad_norm": 0.019287109375, + "learning_rate": 0.022839439954318776, + "loss": 0.8189, + "num_input_tokens_seen": 30593888, + "step": 52700 + }, + { + "epoch": 7.850014894250819, + "grad_norm": 0.0283203125, + "learning_rate": 0.022837777692016534, + "loss": 0.7797, + "num_input_tokens_seen": 30596480, + "step": 52705 + }, + { + "epoch": 7.850759606791779, + "grad_norm": 0.032958984375, + "learning_rate": 0.022836115297303, + "loss": 0.8072, + "num_input_tokens_seen": 30599744, + "step": 52710 + }, + { + "epoch": 7.851504319332737, + "grad_norm": 0.0184326171875, + "learning_rate": 0.022834452770206258, + "loss": 0.7833, + "num_input_tokens_seen": 30602656, + "step": 52715 + }, + { + "epoch": 7.852249031873697, + "grad_norm": 0.0255126953125, + "learning_rate": 0.022832790110754397, + "loss": 0.7872, + "num_input_tokens_seen": 30605280, + "step": 52720 + }, + { + "epoch": 7.852993744414656, + "grad_norm": 0.0213623046875, + "learning_rate": 0.022831127318975516, + "loss": 0.7995, + "num_input_tokens_seen": 30608608, + "step": 52725 + }, + { + "epoch": 7.8537384569556155, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02282946439489769, + "loss": 0.8197, + "num_input_tokens_seen": 30612096, + "step": 52730 + }, + { + "epoch": 7.854483169496574, + "grad_norm": 0.0125732421875, + "learning_rate": 0.022827801338549034, + "loss": 0.7893, + "num_input_tokens_seen": 30614848, + "step": 52735 + }, + { + "epoch": 7.855227882037534, + "grad_norm": 0.0184326171875, + "learning_rate": 0.022826138149957622, + "loss": 0.7906, + "num_input_tokens_seen": 30617632, + "step": 52740 + }, + { + "epoch": 7.855972594578493, + "grad_norm": 0.031982421875, + "learning_rate": 0.022824474829151557, + "loss": 0.8177, + "num_input_tokens_seen": 30620480, + "step": 52745 + }, + { + "epoch": 7.856717307119452, + "grad_norm": 0.028564453125, + "learning_rate": 0.022822811376158943, + "loss": 0.7845, + "num_input_tokens_seen": 30623392, + "step": 52750 + }, + { + "epoch": 7.857462019660411, + "grad_norm": 0.028564453125, + "learning_rate": 0.022821147791007884, + "loss": 0.7869, + "num_input_tokens_seen": 30626144, + "step": 52755 + }, + { + "epoch": 7.858206732201371, + "grad_norm": 0.029296875, + "learning_rate": 0.02281948407372648, + "loss": 0.8352, + "num_input_tokens_seen": 30628928, + "step": 52760 + }, + { + "epoch": 7.858951444742329, + "grad_norm": 0.026611328125, + "learning_rate": 0.022817820224342842, + "loss": 0.8156, + "num_input_tokens_seen": 30632000, + "step": 52765 + }, + { + "epoch": 7.859696157283288, + "grad_norm": 0.02880859375, + "learning_rate": 0.02281615624288508, + "loss": 0.8319, + "num_input_tokens_seen": 30634944, + "step": 52770 + }, + { + "epoch": 7.860440869824248, + "grad_norm": 0.026611328125, + "learning_rate": 0.022814492129381296, + "loss": 0.8269, + "num_input_tokens_seen": 30637472, + "step": 52775 + }, + { + "epoch": 7.8611855823652075, + "grad_norm": 0.03125, + "learning_rate": 0.022812827883859618, + "loss": 0.7861, + "num_input_tokens_seen": 30640512, + "step": 52780 + }, + { + "epoch": 7.861930294906166, + "grad_norm": 0.0185546875, + "learning_rate": 0.022811163506348146, + "loss": 0.7887, + "num_input_tokens_seen": 30643264, + "step": 52785 + }, + { + "epoch": 7.862675007447125, + "grad_norm": 0.0255126953125, + "learning_rate": 0.022809498996875007, + "loss": 0.802, + "num_input_tokens_seen": 30646080, + "step": 52790 + }, + { + "epoch": 7.863419719988085, + "grad_norm": 0.025390625, + "learning_rate": 0.02280783435546832, + "loss": 0.7781, + "num_input_tokens_seen": 30648928, + "step": 52795 + }, + { + "epoch": 7.864164432529043, + "grad_norm": 0.0235595703125, + "learning_rate": 0.02280616958215621, + "loss": 0.8111, + "num_input_tokens_seen": 30652000, + "step": 52800 + }, + { + "epoch": 7.864909145070003, + "grad_norm": 0.02685546875, + "learning_rate": 0.022804504676966795, + "loss": 0.7946, + "num_input_tokens_seen": 30654752, + "step": 52805 + }, + { + "epoch": 7.865653857610962, + "grad_norm": 0.0224609375, + "learning_rate": 0.022802839639928206, + "loss": 0.7985, + "num_input_tokens_seen": 30657568, + "step": 52810 + }, + { + "epoch": 7.8663985701519215, + "grad_norm": 0.02099609375, + "learning_rate": 0.022801174471068573, + "loss": 0.7982, + "num_input_tokens_seen": 30660288, + "step": 52815 + }, + { + "epoch": 7.86714328269288, + "grad_norm": 0.020263671875, + "learning_rate": 0.02279950917041603, + "loss": 0.7971, + "num_input_tokens_seen": 30663296, + "step": 52820 + }, + { + "epoch": 7.86788799523384, + "grad_norm": 0.02685546875, + "learning_rate": 0.022797843737998702, + "loss": 0.8458, + "num_input_tokens_seen": 30666144, + "step": 52825 + }, + { + "epoch": 7.868632707774799, + "grad_norm": 0.021240234375, + "learning_rate": 0.022796178173844724, + "loss": 0.7943, + "num_input_tokens_seen": 30669120, + "step": 52830 + }, + { + "epoch": 7.869377420315758, + "grad_norm": 0.0150146484375, + "learning_rate": 0.022794512477982243, + "loss": 0.8023, + "num_input_tokens_seen": 30672608, + "step": 52835 + }, + { + "epoch": 7.870122132856717, + "grad_norm": 0.0247802734375, + "learning_rate": 0.022792846650439402, + "loss": 0.7754, + "num_input_tokens_seen": 30675360, + "step": 52840 + }, + { + "epoch": 7.870866845397677, + "grad_norm": 0.024658203125, + "learning_rate": 0.022791180691244334, + "loss": 0.8006, + "num_input_tokens_seen": 30678304, + "step": 52845 + }, + { + "epoch": 7.871611557938635, + "grad_norm": 0.029541015625, + "learning_rate": 0.022789514600425186, + "loss": 0.8079, + "num_input_tokens_seen": 30681440, + "step": 52850 + }, + { + "epoch": 7.872356270479595, + "grad_norm": 0.03466796875, + "learning_rate": 0.022787848378010105, + "loss": 0.8006, + "num_input_tokens_seen": 30684480, + "step": 52855 + }, + { + "epoch": 7.873100983020554, + "grad_norm": 0.0284423828125, + "learning_rate": 0.02278618202402724, + "loss": 0.8077, + "num_input_tokens_seen": 30687456, + "step": 52860 + }, + { + "epoch": 7.8738456955615135, + "grad_norm": 0.0220947265625, + "learning_rate": 0.022784515538504746, + "loss": 0.7944, + "num_input_tokens_seen": 30690560, + "step": 52865 + }, + { + "epoch": 7.874590408102472, + "grad_norm": 0.043701171875, + "learning_rate": 0.022782848921470776, + "loss": 0.8273, + "num_input_tokens_seen": 30693408, + "step": 52870 + }, + { + "epoch": 7.875335120643432, + "grad_norm": 0.02001953125, + "learning_rate": 0.02278118217295348, + "loss": 0.8082, + "num_input_tokens_seen": 30696352, + "step": 52875 + }, + { + "epoch": 7.876079833184391, + "grad_norm": 0.025146484375, + "learning_rate": 0.022779515292981018, + "loss": 0.8284, + "num_input_tokens_seen": 30699456, + "step": 52880 + }, + { + "epoch": 7.87682454572535, + "grad_norm": 0.0255126953125, + "learning_rate": 0.02277784828158155, + "loss": 0.7799, + "num_input_tokens_seen": 30702368, + "step": 52885 + }, + { + "epoch": 7.877569258266309, + "grad_norm": 0.0299072265625, + "learning_rate": 0.02277618113878324, + "loss": 0.786, + "num_input_tokens_seen": 30705024, + "step": 52890 + }, + { + "epoch": 7.878313970807269, + "grad_norm": 0.020263671875, + "learning_rate": 0.022774513864614262, + "loss": 0.7987, + "num_input_tokens_seen": 30707936, + "step": 52895 + }, + { + "epoch": 7.8790586833482275, + "grad_norm": 0.01904296875, + "learning_rate": 0.02277284645910277, + "loss": 0.814, + "num_input_tokens_seen": 30710848, + "step": 52900 + }, + { + "epoch": 7.879803395889187, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02277117892227694, + "loss": 0.8148, + "num_input_tokens_seen": 30713664, + "step": 52905 + }, + { + "epoch": 7.880548108430146, + "grad_norm": 0.0225830078125, + "learning_rate": 0.02276951125416493, + "loss": 0.7962, + "num_input_tokens_seen": 30716512, + "step": 52910 + }, + { + "epoch": 7.8812928209711055, + "grad_norm": 0.0208740234375, + "learning_rate": 0.02276784345479493, + "loss": 0.797, + "num_input_tokens_seen": 30719520, + "step": 52915 + }, + { + "epoch": 7.882037533512064, + "grad_norm": 0.023681640625, + "learning_rate": 0.022766175524195107, + "loss": 0.819, + "num_input_tokens_seen": 30722752, + "step": 52920 + }, + { + "epoch": 7.882782246053024, + "grad_norm": 0.0242919921875, + "learning_rate": 0.02276450746239365, + "loss": 0.8159, + "num_input_tokens_seen": 30725824, + "step": 52925 + }, + { + "epoch": 7.883526958593983, + "grad_norm": 0.0211181640625, + "learning_rate": 0.022762839269418723, + "loss": 0.7855, + "num_input_tokens_seen": 30728672, + "step": 52930 + }, + { + "epoch": 7.884271671134941, + "grad_norm": 0.03662109375, + "learning_rate": 0.02276117094529852, + "loss": 0.8175, + "num_input_tokens_seen": 30731776, + "step": 52935 + }, + { + "epoch": 7.885016383675901, + "grad_norm": 0.030517578125, + "learning_rate": 0.022759502490061225, + "loss": 0.8074, + "num_input_tokens_seen": 30734880, + "step": 52940 + }, + { + "epoch": 7.885761096216861, + "grad_norm": 0.035400390625, + "learning_rate": 0.022757833903735016, + "loss": 0.8163, + "num_input_tokens_seen": 30737632, + "step": 52945 + }, + { + "epoch": 7.8865058087578195, + "grad_norm": 0.0255126953125, + "learning_rate": 0.022756165186348088, + "loss": 0.7968, + "num_input_tokens_seen": 30740256, + "step": 52950 + }, + { + "epoch": 7.887250521298778, + "grad_norm": 0.0220947265625, + "learning_rate": 0.022754496337928636, + "loss": 0.8206, + "num_input_tokens_seen": 30743008, + "step": 52955 + }, + { + "epoch": 7.887995233839738, + "grad_norm": 0.0306396484375, + "learning_rate": 0.022752827358504844, + "loss": 0.8044, + "num_input_tokens_seen": 30745664, + "step": 52960 + }, + { + "epoch": 7.8887399463806975, + "grad_norm": 0.020751953125, + "learning_rate": 0.022751158248104916, + "loss": 0.811, + "num_input_tokens_seen": 30748416, + "step": 52965 + }, + { + "epoch": 7.889484658921656, + "grad_norm": 0.0302734375, + "learning_rate": 0.022749489006757048, + "loss": 0.8084, + "num_input_tokens_seen": 30751776, + "step": 52970 + }, + { + "epoch": 7.890229371462615, + "grad_norm": 0.0211181640625, + "learning_rate": 0.02274781963448944, + "loss": 0.8034, + "num_input_tokens_seen": 30754720, + "step": 52975 + }, + { + "epoch": 7.890974084003575, + "grad_norm": 0.0185546875, + "learning_rate": 0.022746150131330294, + "loss": 0.8073, + "num_input_tokens_seen": 30757504, + "step": 52980 + }, + { + "epoch": 7.8917187965445335, + "grad_norm": 0.02099609375, + "learning_rate": 0.022744480497307814, + "loss": 0.8119, + "num_input_tokens_seen": 30760416, + "step": 52985 + }, + { + "epoch": 7.892463509085493, + "grad_norm": 0.020263671875, + "learning_rate": 0.022742810732450207, + "loss": 0.8078, + "num_input_tokens_seen": 30763264, + "step": 52990 + }, + { + "epoch": 7.893208221626452, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02274114083678568, + "loss": 0.8156, + "num_input_tokens_seen": 30765952, + "step": 52995 + }, + { + "epoch": 7.8939529341674115, + "grad_norm": 0.0283203125, + "learning_rate": 0.022739470810342447, + "loss": 0.8156, + "num_input_tokens_seen": 30768864, + "step": 53000 + }, + { + "epoch": 7.89469764670837, + "grad_norm": 0.032958984375, + "learning_rate": 0.022737800653148725, + "loss": 0.795, + "num_input_tokens_seen": 30771680, + "step": 53005 + }, + { + "epoch": 7.89544235924933, + "grad_norm": 0.015625, + "learning_rate": 0.022736130365232723, + "loss": 0.8082, + "num_input_tokens_seen": 30774720, + "step": 53010 + }, + { + "epoch": 7.896187071790289, + "grad_norm": 0.01611328125, + "learning_rate": 0.02273445994662266, + "loss": 0.8006, + "num_input_tokens_seen": 30777696, + "step": 53015 + }, + { + "epoch": 7.896931784331248, + "grad_norm": 0.022216796875, + "learning_rate": 0.022732789397346756, + "loss": 0.7889, + "num_input_tokens_seen": 30780448, + "step": 53020 + }, + { + "epoch": 7.897676496872207, + "grad_norm": 0.0260009765625, + "learning_rate": 0.02273111871743324, + "loss": 0.8206, + "num_input_tokens_seen": 30783104, + "step": 53025 + }, + { + "epoch": 7.898421209413167, + "grad_norm": 0.0289306640625, + "learning_rate": 0.022729447906910324, + "loss": 0.807, + "num_input_tokens_seen": 30786176, + "step": 53030 + }, + { + "epoch": 7.8991659219541255, + "grad_norm": 0.02001953125, + "learning_rate": 0.022727776965806246, + "loss": 0.7968, + "num_input_tokens_seen": 30788992, + "step": 53035 + }, + { + "epoch": 7.899910634495085, + "grad_norm": 0.0235595703125, + "learning_rate": 0.02272610589414923, + "loss": 0.8034, + "num_input_tokens_seen": 30792000, + "step": 53040 + }, + { + "epoch": 7.900655347036044, + "grad_norm": 0.01483154296875, + "learning_rate": 0.022724434691967502, + "loss": 0.818, + "num_input_tokens_seen": 30794880, + "step": 53045 + }, + { + "epoch": 7.9014000595770035, + "grad_norm": 0.0225830078125, + "learning_rate": 0.022722763359289307, + "loss": 0.7987, + "num_input_tokens_seen": 30797856, + "step": 53050 + }, + { + "epoch": 7.902144772117962, + "grad_norm": 0.0302734375, + "learning_rate": 0.02272109189614287, + "loss": 0.7899, + "num_input_tokens_seen": 30800608, + "step": 53055 + }, + { + "epoch": 7.902889484658922, + "grad_norm": 0.0145263671875, + "learning_rate": 0.022719420302556435, + "loss": 0.8121, + "num_input_tokens_seen": 30803616, + "step": 53060 + }, + { + "epoch": 7.903634197199881, + "grad_norm": 0.025634765625, + "learning_rate": 0.022717748578558242, + "loss": 0.7982, + "num_input_tokens_seen": 30806528, + "step": 53065 + }, + { + "epoch": 7.90437890974084, + "grad_norm": 0.0220947265625, + "learning_rate": 0.022716076724176526, + "loss": 0.8096, + "num_input_tokens_seen": 30809344, + "step": 53070 + }, + { + "epoch": 7.905123622281799, + "grad_norm": 0.0302734375, + "learning_rate": 0.022714404739439538, + "loss": 0.811, + "num_input_tokens_seen": 30812224, + "step": 53075 + }, + { + "epoch": 7.905868334822759, + "grad_norm": 0.0186767578125, + "learning_rate": 0.022712732624375523, + "loss": 0.822, + "num_input_tokens_seen": 30815296, + "step": 53080 + }, + { + "epoch": 7.9066130473637175, + "grad_norm": 0.0322265625, + "learning_rate": 0.02271106037901273, + "loss": 0.7898, + "num_input_tokens_seen": 30818240, + "step": 53085 + }, + { + "epoch": 7.907357759904677, + "grad_norm": 0.024658203125, + "learning_rate": 0.022709388003379405, + "loss": 0.8233, + "num_input_tokens_seen": 30821216, + "step": 53090 + }, + { + "epoch": 7.908102472445636, + "grad_norm": 0.0322265625, + "learning_rate": 0.02270771549750381, + "loss": 0.7794, + "num_input_tokens_seen": 30823872, + "step": 53095 + }, + { + "epoch": 7.908847184986596, + "grad_norm": 0.023193359375, + "learning_rate": 0.022706042861414193, + "loss": 0.7972, + "num_input_tokens_seen": 30826720, + "step": 53100 + }, + { + "epoch": 7.909591897527554, + "grad_norm": 0.022216796875, + "learning_rate": 0.02270437009513882, + "loss": 0.7943, + "num_input_tokens_seen": 30829536, + "step": 53105 + }, + { + "epoch": 7.910336610068514, + "grad_norm": 0.0291748046875, + "learning_rate": 0.022702697198705935, + "loss": 0.8157, + "num_input_tokens_seen": 30832352, + "step": 53110 + }, + { + "epoch": 7.911081322609473, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02270102417214381, + "loss": 0.7909, + "num_input_tokens_seen": 30835296, + "step": 53115 + }, + { + "epoch": 7.9118260351504315, + "grad_norm": 0.0283203125, + "learning_rate": 0.022699351015480713, + "loss": 0.7952, + "num_input_tokens_seen": 30838240, + "step": 53120 + }, + { + "epoch": 7.912570747691391, + "grad_norm": 0.021484375, + "learning_rate": 0.0226976777287449, + "loss": 0.808, + "num_input_tokens_seen": 30841120, + "step": 53125 + }, + { + "epoch": 7.913315460232351, + "grad_norm": 0.0242919921875, + "learning_rate": 0.022696004311964654, + "loss": 0.8162, + "num_input_tokens_seen": 30843904, + "step": 53130 + }, + { + "epoch": 7.9140601727733095, + "grad_norm": 0.0228271484375, + "learning_rate": 0.022694330765168233, + "loss": 0.812, + "num_input_tokens_seen": 30846816, + "step": 53135 + }, + { + "epoch": 7.914804885314268, + "grad_norm": 0.0234375, + "learning_rate": 0.022692657088383916, + "loss": 0.8062, + "num_input_tokens_seen": 30849696, + "step": 53140 + }, + { + "epoch": 7.915549597855228, + "grad_norm": 0.021728515625, + "learning_rate": 0.022690983281639977, + "loss": 0.8184, + "num_input_tokens_seen": 30852480, + "step": 53145 + }, + { + "epoch": 7.916294310396187, + "grad_norm": 0.016357421875, + "learning_rate": 0.02268930934496469, + "loss": 0.8043, + "num_input_tokens_seen": 30855264, + "step": 53150 + }, + { + "epoch": 7.917039022937146, + "grad_norm": 0.0157470703125, + "learning_rate": 0.022687635278386332, + "loss": 0.7912, + "num_input_tokens_seen": 30857888, + "step": 53155 + }, + { + "epoch": 7.917783735478105, + "grad_norm": 0.01458740234375, + "learning_rate": 0.022685961081933194, + "loss": 0.818, + "num_input_tokens_seen": 30860832, + "step": 53160 + }, + { + "epoch": 7.918528448019065, + "grad_norm": 0.0279541015625, + "learning_rate": 0.02268428675563355, + "loss": 0.813, + "num_input_tokens_seen": 30863584, + "step": 53165 + }, + { + "epoch": 7.9192731605600235, + "grad_norm": 0.0167236328125, + "learning_rate": 0.022682612299515702, + "loss": 0.7896, + "num_input_tokens_seen": 30866720, + "step": 53170 + }, + { + "epoch": 7.920017873100983, + "grad_norm": 0.025146484375, + "learning_rate": 0.022680937713607918, + "loss": 0.8246, + "num_input_tokens_seen": 30869600, + "step": 53175 + }, + { + "epoch": 7.920762585641942, + "grad_norm": 0.0166015625, + "learning_rate": 0.0226792629979385, + "loss": 0.8027, + "num_input_tokens_seen": 30872480, + "step": 53180 + }, + { + "epoch": 7.921507298182902, + "grad_norm": 0.017822265625, + "learning_rate": 0.02267758815253574, + "loss": 0.7984, + "num_input_tokens_seen": 30875168, + "step": 53185 + }, + { + "epoch": 7.92225201072386, + "grad_norm": 0.0157470703125, + "learning_rate": 0.022675913177427936, + "loss": 0.8036, + "num_input_tokens_seen": 30878176, + "step": 53190 + }, + { + "epoch": 7.92299672326482, + "grad_norm": 0.0247802734375, + "learning_rate": 0.022674238072643373, + "loss": 0.8013, + "num_input_tokens_seen": 30881408, + "step": 53195 + }, + { + "epoch": 7.923741435805779, + "grad_norm": 0.0291748046875, + "learning_rate": 0.02267256283821036, + "loss": 0.8295, + "num_input_tokens_seen": 30884224, + "step": 53200 + }, + { + "epoch": 7.924486148346738, + "grad_norm": 0.0238037109375, + "learning_rate": 0.022670887474157197, + "loss": 0.7992, + "num_input_tokens_seen": 30887072, + "step": 53205 + }, + { + "epoch": 7.925230860887697, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02266921198051218, + "loss": 0.7947, + "num_input_tokens_seen": 30889824, + "step": 53210 + }, + { + "epoch": 7.925975573428657, + "grad_norm": 0.015869140625, + "learning_rate": 0.022667536357303626, + "loss": 0.8045, + "num_input_tokens_seen": 30892736, + "step": 53215 + }, + { + "epoch": 7.9267202859696155, + "grad_norm": 0.0308837890625, + "learning_rate": 0.022665860604559836, + "loss": 0.8094, + "num_input_tokens_seen": 30895744, + "step": 53220 + }, + { + "epoch": 7.927464998510575, + "grad_norm": 0.020751953125, + "learning_rate": 0.02266418472230913, + "loss": 0.8011, + "num_input_tokens_seen": 30898528, + "step": 53225 + }, + { + "epoch": 7.928209711051534, + "grad_norm": 0.019287109375, + "learning_rate": 0.022662508710579808, + "loss": 0.8144, + "num_input_tokens_seen": 30901440, + "step": 53230 + }, + { + "epoch": 7.928954423592494, + "grad_norm": 0.031494140625, + "learning_rate": 0.02266083256940019, + "loss": 0.7955, + "num_input_tokens_seen": 30904384, + "step": 53235 + }, + { + "epoch": 7.929699136133452, + "grad_norm": 0.038330078125, + "learning_rate": 0.02265915629879859, + "loss": 0.815, + "num_input_tokens_seen": 30907264, + "step": 53240 + }, + { + "epoch": 7.930443848674412, + "grad_norm": 0.0164794921875, + "learning_rate": 0.022657479898803325, + "loss": 0.8016, + "num_input_tokens_seen": 30910304, + "step": 53245 + }, + { + "epoch": 7.931188561215371, + "grad_norm": 0.0286865234375, + "learning_rate": 0.022655803369442725, + "loss": 0.8143, + "num_input_tokens_seen": 30913120, + "step": 53250 + }, + { + "epoch": 7.93193327375633, + "grad_norm": 0.0213623046875, + "learning_rate": 0.022654126710745105, + "loss": 0.7973, + "num_input_tokens_seen": 30916096, + "step": 53255 + }, + { + "epoch": 7.932677986297289, + "grad_norm": 0.0225830078125, + "learning_rate": 0.022652449922738793, + "loss": 0.7959, + "num_input_tokens_seen": 30919200, + "step": 53260 + }, + { + "epoch": 7.933422698838249, + "grad_norm": 0.0242919921875, + "learning_rate": 0.022650773005452124, + "loss": 0.8072, + "num_input_tokens_seen": 30922048, + "step": 53265 + }, + { + "epoch": 7.934167411379208, + "grad_norm": 0.0166015625, + "learning_rate": 0.02264909595891342, + "loss": 0.7975, + "num_input_tokens_seen": 30925120, + "step": 53270 + }, + { + "epoch": 7.934912123920167, + "grad_norm": 0.01806640625, + "learning_rate": 0.022647418783151012, + "loss": 0.7979, + "num_input_tokens_seen": 30927872, + "step": 53275 + }, + { + "epoch": 7.935656836461126, + "grad_norm": 0.0250244140625, + "learning_rate": 0.022645741478193235, + "loss": 0.8026, + "num_input_tokens_seen": 30930784, + "step": 53280 + }, + { + "epoch": 7.936401549002085, + "grad_norm": 0.03173828125, + "learning_rate": 0.022644064044068424, + "loss": 0.7988, + "num_input_tokens_seen": 30933760, + "step": 53285 + }, + { + "epoch": 7.937146261543044, + "grad_norm": 0.0245361328125, + "learning_rate": 0.022642386480804924, + "loss": 0.7847, + "num_input_tokens_seen": 30937088, + "step": 53290 + }, + { + "epoch": 7.937890974084004, + "grad_norm": 0.021240234375, + "learning_rate": 0.022640708788431066, + "loss": 0.8011, + "num_input_tokens_seen": 30939744, + "step": 53295 + }, + { + "epoch": 7.938635686624963, + "grad_norm": 0.0137939453125, + "learning_rate": 0.022639030966975206, + "loss": 0.8083, + "num_input_tokens_seen": 30942528, + "step": 53300 + }, + { + "epoch": 7.9393803991659215, + "grad_norm": 0.03466796875, + "learning_rate": 0.022637353016465682, + "loss": 0.8, + "num_input_tokens_seen": 30945664, + "step": 53305 + }, + { + "epoch": 7.940125111706881, + "grad_norm": 0.0194091796875, + "learning_rate": 0.022635674936930836, + "loss": 0.8055, + "num_input_tokens_seen": 30948416, + "step": 53310 + }, + { + "epoch": 7.940869824247841, + "grad_norm": 0.02197265625, + "learning_rate": 0.02263399672839902, + "loss": 0.8107, + "num_input_tokens_seen": 30951264, + "step": 53315 + }, + { + "epoch": 7.9416145367888, + "grad_norm": 0.01239013671875, + "learning_rate": 0.022632318390898593, + "loss": 0.7808, + "num_input_tokens_seen": 30954176, + "step": 53320 + }, + { + "epoch": 7.942359249329758, + "grad_norm": 0.0244140625, + "learning_rate": 0.0226306399244579, + "loss": 0.8031, + "num_input_tokens_seen": 30957088, + "step": 53325 + }, + { + "epoch": 7.943103961870718, + "grad_norm": 0.01556396484375, + "learning_rate": 0.022628961329105305, + "loss": 0.8165, + "num_input_tokens_seen": 30959936, + "step": 53330 + }, + { + "epoch": 7.943848674411677, + "grad_norm": 0.020263671875, + "learning_rate": 0.022627282604869164, + "loss": 0.7896, + "num_input_tokens_seen": 30962752, + "step": 53335 + }, + { + "epoch": 7.944593386952636, + "grad_norm": 0.015869140625, + "learning_rate": 0.022625603751777832, + "loss": 0.7935, + "num_input_tokens_seen": 30965920, + "step": 53340 + }, + { + "epoch": 7.945338099493595, + "grad_norm": 0.01190185546875, + "learning_rate": 0.02262392476985968, + "loss": 0.8063, + "num_input_tokens_seen": 30968704, + "step": 53345 + }, + { + "epoch": 7.946082812034555, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02262224565914306, + "loss": 0.7989, + "num_input_tokens_seen": 30971424, + "step": 53350 + }, + { + "epoch": 7.946827524575514, + "grad_norm": 0.023681640625, + "learning_rate": 0.02262056641965635, + "loss": 0.8104, + "num_input_tokens_seen": 30974176, + "step": 53355 + }, + { + "epoch": 7.947572237116473, + "grad_norm": 0.0225830078125, + "learning_rate": 0.022618887051427915, + "loss": 0.7959, + "num_input_tokens_seen": 30977120, + "step": 53360 + }, + { + "epoch": 7.948316949657432, + "grad_norm": 0.031982421875, + "learning_rate": 0.02261720755448613, + "loss": 0.809, + "num_input_tokens_seen": 30979936, + "step": 53365 + }, + { + "epoch": 7.949061662198392, + "grad_norm": 0.03466796875, + "learning_rate": 0.02261552792885936, + "loss": 0.8244, + "num_input_tokens_seen": 30982944, + "step": 53370 + }, + { + "epoch": 7.94980637473935, + "grad_norm": 0.019775390625, + "learning_rate": 0.022613848174575988, + "loss": 0.7859, + "num_input_tokens_seen": 30985888, + "step": 53375 + }, + { + "epoch": 7.95055108728031, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02261216829166439, + "loss": 0.809, + "num_input_tokens_seen": 30989056, + "step": 53380 + }, + { + "epoch": 7.951295799821269, + "grad_norm": 0.0218505859375, + "learning_rate": 0.022610488280152944, + "loss": 0.803, + "num_input_tokens_seen": 30991968, + "step": 53385 + }, + { + "epoch": 7.952040512362228, + "grad_norm": 0.01239013671875, + "learning_rate": 0.022608808140070033, + "loss": 0.7974, + "num_input_tokens_seen": 30994816, + "step": 53390 + }, + { + "epoch": 7.952785224903187, + "grad_norm": 0.03662109375, + "learning_rate": 0.02260712787144404, + "loss": 0.8171, + "num_input_tokens_seen": 30997760, + "step": 53395 + }, + { + "epoch": 7.953529937444147, + "grad_norm": 0.0220947265625, + "learning_rate": 0.022605447474303358, + "loss": 0.7938, + "num_input_tokens_seen": 31000640, + "step": 53400 + }, + { + "epoch": 7.954274649985106, + "grad_norm": 0.01434326171875, + "learning_rate": 0.02260376694867637, + "loss": 0.794, + "num_input_tokens_seen": 31003808, + "step": 53405 + }, + { + "epoch": 7.955019362526065, + "grad_norm": 0.020263671875, + "learning_rate": 0.022602086294591466, + "loss": 0.8027, + "num_input_tokens_seen": 31006720, + "step": 53410 + }, + { + "epoch": 7.955764075067024, + "grad_norm": 0.03466796875, + "learning_rate": 0.022600405512077038, + "loss": 0.8203, + "num_input_tokens_seen": 31009568, + "step": 53415 + }, + { + "epoch": 7.956508787607984, + "grad_norm": 0.028076171875, + "learning_rate": 0.02259872460116149, + "loss": 0.7768, + "num_input_tokens_seen": 31012288, + "step": 53420 + }, + { + "epoch": 7.957253500148942, + "grad_norm": 0.0191650390625, + "learning_rate": 0.022597043561873212, + "loss": 0.8024, + "num_input_tokens_seen": 31015296, + "step": 53425 + }, + { + "epoch": 7.957998212689902, + "grad_norm": 0.02978515625, + "learning_rate": 0.0225953623942406, + "loss": 0.8109, + "num_input_tokens_seen": 31018048, + "step": 53430 + }, + { + "epoch": 7.958742925230861, + "grad_norm": 0.0255126953125, + "learning_rate": 0.022593681098292063, + "loss": 0.819, + "num_input_tokens_seen": 31020960, + "step": 53435 + }, + { + "epoch": 7.9594876377718204, + "grad_norm": 0.01513671875, + "learning_rate": 0.022591999674055997, + "loss": 0.7901, + "num_input_tokens_seen": 31024064, + "step": 53440 + }, + { + "epoch": 7.960232350312779, + "grad_norm": 0.0250244140625, + "learning_rate": 0.022590318121560815, + "loss": 0.7992, + "num_input_tokens_seen": 31026688, + "step": 53445 + }, + { + "epoch": 7.960977062853738, + "grad_norm": 0.0203857421875, + "learning_rate": 0.022588636440834923, + "loss": 0.7853, + "num_input_tokens_seen": 31029536, + "step": 53450 + }, + { + "epoch": 7.961721775394698, + "grad_norm": 0.0185546875, + "learning_rate": 0.022586954631906735, + "loss": 0.8076, + "num_input_tokens_seen": 31032160, + "step": 53455 + }, + { + "epoch": 7.962466487935657, + "grad_norm": 0.021240234375, + "learning_rate": 0.022585272694804655, + "loss": 0.7885, + "num_input_tokens_seen": 31035200, + "step": 53460 + }, + { + "epoch": 7.963211200476616, + "grad_norm": 0.0140380859375, + "learning_rate": 0.022583590629557105, + "loss": 0.7942, + "num_input_tokens_seen": 31038240, + "step": 53465 + }, + { + "epoch": 7.963955913017575, + "grad_norm": 0.0294189453125, + "learning_rate": 0.022581908436192497, + "loss": 0.8236, + "num_input_tokens_seen": 31041504, + "step": 53470 + }, + { + "epoch": 7.964700625558534, + "grad_norm": 0.0208740234375, + "learning_rate": 0.02258022611473925, + "loss": 0.778, + "num_input_tokens_seen": 31044448, + "step": 53475 + }, + { + "epoch": 7.965445338099494, + "grad_norm": 0.0240478515625, + "learning_rate": 0.02257854366522579, + "loss": 0.8001, + "num_input_tokens_seen": 31047904, + "step": 53480 + }, + { + "epoch": 7.966190050640453, + "grad_norm": 0.02734375, + "learning_rate": 0.022576861087680537, + "loss": 0.8239, + "num_input_tokens_seen": 31050784, + "step": 53485 + }, + { + "epoch": 7.966934763181412, + "grad_norm": 0.01129150390625, + "learning_rate": 0.022575178382131918, + "loss": 0.8022, + "num_input_tokens_seen": 31053536, + "step": 53490 + }, + { + "epoch": 7.967679475722371, + "grad_norm": 0.02197265625, + "learning_rate": 0.022573495548608352, + "loss": 0.7992, + "num_input_tokens_seen": 31056672, + "step": 53495 + }, + { + "epoch": 7.96842418826333, + "grad_norm": 0.048095703125, + "learning_rate": 0.022571812587138286, + "loss": 0.7722, + "num_input_tokens_seen": 31059808, + "step": 53500 + }, + { + "epoch": 7.96916890080429, + "grad_norm": 0.0250244140625, + "learning_rate": 0.02257012949775014, + "loss": 0.7794, + "num_input_tokens_seen": 31062656, + "step": 53505 + }, + { + "epoch": 7.969913613345248, + "grad_norm": 0.0133056640625, + "learning_rate": 0.022568446280472346, + "loss": 0.8215, + "num_input_tokens_seen": 31065312, + "step": 53510 + }, + { + "epoch": 7.970658325886208, + "grad_norm": 0.032470703125, + "learning_rate": 0.022566762935333346, + "loss": 0.7942, + "num_input_tokens_seen": 31068736, + "step": 53515 + }, + { + "epoch": 7.971403038427167, + "grad_norm": 0.0274658203125, + "learning_rate": 0.022565079462361574, + "loss": 0.7997, + "num_input_tokens_seen": 31071456, + "step": 53520 + }, + { + "epoch": 7.9721477509681264, + "grad_norm": 0.02880859375, + "learning_rate": 0.022563395861585477, + "loss": 0.7738, + "num_input_tokens_seen": 31074080, + "step": 53525 + }, + { + "epoch": 7.972892463509085, + "grad_norm": 0.021240234375, + "learning_rate": 0.022561712133033495, + "loss": 0.8083, + "num_input_tokens_seen": 31076960, + "step": 53530 + }, + { + "epoch": 7.973637176050045, + "grad_norm": 0.0174560546875, + "learning_rate": 0.022560028276734066, + "loss": 0.834, + "num_input_tokens_seen": 31080000, + "step": 53535 + }, + { + "epoch": 7.974381888591004, + "grad_norm": 0.028564453125, + "learning_rate": 0.02255834429271565, + "loss": 0.7977, + "num_input_tokens_seen": 31083104, + "step": 53540 + }, + { + "epoch": 7.975126601131963, + "grad_norm": 0.0228271484375, + "learning_rate": 0.022556660181006688, + "loss": 0.8106, + "num_input_tokens_seen": 31085888, + "step": 53545 + }, + { + "epoch": 7.975871313672922, + "grad_norm": 0.0196533203125, + "learning_rate": 0.022554975941635625, + "loss": 0.7837, + "num_input_tokens_seen": 31088576, + "step": 53550 + }, + { + "epoch": 7.976616026213882, + "grad_norm": 0.0291748046875, + "learning_rate": 0.02255329157463093, + "loss": 0.7958, + "num_input_tokens_seen": 31091424, + "step": 53555 + }, + { + "epoch": 7.97736073875484, + "grad_norm": 0.0250244140625, + "learning_rate": 0.022551607080021047, + "loss": 0.8287, + "num_input_tokens_seen": 31094432, + "step": 53560 + }, + { + "epoch": 7.9781054512958, + "grad_norm": 0.026611328125, + "learning_rate": 0.022549922457834437, + "loss": 0.7935, + "num_input_tokens_seen": 31097280, + "step": 53565 + }, + { + "epoch": 7.978850163836759, + "grad_norm": 0.0191650390625, + "learning_rate": 0.022548237708099564, + "loss": 0.7999, + "num_input_tokens_seen": 31099904, + "step": 53570 + }, + { + "epoch": 7.9795948763777185, + "grad_norm": 0.01458740234375, + "learning_rate": 0.022546552830844883, + "loss": 0.8044, + "num_input_tokens_seen": 31102976, + "step": 53575 + }, + { + "epoch": 7.980339588918677, + "grad_norm": 0.032470703125, + "learning_rate": 0.022544867826098863, + "loss": 0.8233, + "num_input_tokens_seen": 31105632, + "step": 53580 + }, + { + "epoch": 7.981084301459637, + "grad_norm": 0.029541015625, + "learning_rate": 0.02254318269388997, + "loss": 0.788, + "num_input_tokens_seen": 31108480, + "step": 53585 + }, + { + "epoch": 7.981829014000596, + "grad_norm": 0.020263671875, + "learning_rate": 0.022541497434246674, + "loss": 0.7997, + "num_input_tokens_seen": 31111328, + "step": 53590 + }, + { + "epoch": 7.982573726541555, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02253981204719744, + "loss": 0.7988, + "num_input_tokens_seen": 31113888, + "step": 53595 + }, + { + "epoch": 7.983318439082514, + "grad_norm": 0.0308837890625, + "learning_rate": 0.02253812653277075, + "loss": 0.7774, + "num_input_tokens_seen": 31117056, + "step": 53600 + }, + { + "epoch": 7.984063151623474, + "grad_norm": 0.016357421875, + "learning_rate": 0.022536440890995067, + "loss": 0.8037, + "num_input_tokens_seen": 31119968, + "step": 53605 + }, + { + "epoch": 7.9848078641644324, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02253475512189888, + "loss": 0.8146, + "num_input_tokens_seen": 31122816, + "step": 53610 + }, + { + "epoch": 7.985552576705392, + "grad_norm": 0.033203125, + "learning_rate": 0.022533069225510658, + "loss": 0.8037, + "num_input_tokens_seen": 31125728, + "step": 53615 + }, + { + "epoch": 7.986297289246351, + "grad_norm": 0.019775390625, + "learning_rate": 0.022531383201858886, + "loss": 0.7921, + "num_input_tokens_seen": 31128704, + "step": 53620 + }, + { + "epoch": 7.9870420017873105, + "grad_norm": 0.01251220703125, + "learning_rate": 0.022529697050972058, + "loss": 0.777, + "num_input_tokens_seen": 31131424, + "step": 53625 + }, + { + "epoch": 7.987786714328269, + "grad_norm": 0.0260009765625, + "learning_rate": 0.022528010772878644, + "loss": 0.7994, + "num_input_tokens_seen": 31134176, + "step": 53630 + }, + { + "epoch": 7.988531426869228, + "grad_norm": 0.0361328125, + "learning_rate": 0.022526324367607146, + "loss": 0.8114, + "num_input_tokens_seen": 31136960, + "step": 53635 + }, + { + "epoch": 7.989276139410188, + "grad_norm": 0.0201416015625, + "learning_rate": 0.022524637835186045, + "loss": 0.7852, + "num_input_tokens_seen": 31139968, + "step": 53640 + }, + { + "epoch": 7.990020851951147, + "grad_norm": 0.018798828125, + "learning_rate": 0.022522951175643832, + "loss": 0.8006, + "num_input_tokens_seen": 31142784, + "step": 53645 + }, + { + "epoch": 7.990765564492106, + "grad_norm": 0.02490234375, + "learning_rate": 0.022521264389009005, + "loss": 0.7964, + "num_input_tokens_seen": 31145568, + "step": 53650 + }, + { + "epoch": 7.991510277033065, + "grad_norm": 0.021484375, + "learning_rate": 0.02251957747531006, + "loss": 0.8141, + "num_input_tokens_seen": 31148608, + "step": 53655 + }, + { + "epoch": 7.9922549895740245, + "grad_norm": 0.0203857421875, + "learning_rate": 0.022517890434575496, + "loss": 0.7894, + "num_input_tokens_seen": 31151424, + "step": 53660 + }, + { + "epoch": 7.992999702114983, + "grad_norm": 0.0233154296875, + "learning_rate": 0.02251620326683382, + "loss": 0.7948, + "num_input_tokens_seen": 31154368, + "step": 53665 + }, + { + "epoch": 7.993744414655943, + "grad_norm": 0.047119140625, + "learning_rate": 0.02251451597211353, + "loss": 0.8293, + "num_input_tokens_seen": 31157440, + "step": 53670 + }, + { + "epoch": 7.994489127196902, + "grad_norm": 0.035400390625, + "learning_rate": 0.022512828550443127, + "loss": 0.792, + "num_input_tokens_seen": 31160384, + "step": 53675 + }, + { + "epoch": 7.995233839737861, + "grad_norm": 0.0303955078125, + "learning_rate": 0.02251114100185111, + "loss": 0.8049, + "num_input_tokens_seen": 31163008, + "step": 53680 + }, + { + "epoch": 7.99597855227882, + "grad_norm": 0.0157470703125, + "learning_rate": 0.022509453326366013, + "loss": 0.8149, + "num_input_tokens_seen": 31166048, + "step": 53685 + }, + { + "epoch": 7.99672326481978, + "grad_norm": 0.0162353515625, + "learning_rate": 0.02250776552401633, + "loss": 0.8177, + "num_input_tokens_seen": 31169152, + "step": 53690 + }, + { + "epoch": 7.9974679773607384, + "grad_norm": 0.02392578125, + "learning_rate": 0.022506077594830583, + "loss": 0.7925, + "num_input_tokens_seen": 31171936, + "step": 53695 + }, + { + "epoch": 7.998212689901698, + "grad_norm": 0.022216796875, + "learning_rate": 0.02250438953883728, + "loss": 0.8188, + "num_input_tokens_seen": 31174496, + "step": 53700 + }, + { + "epoch": 7.998957402442657, + "grad_norm": 0.0308837890625, + "learning_rate": 0.022502701356064945, + "loss": 0.8215, + "num_input_tokens_seen": 31177248, + "step": 53705 + }, + { + "epoch": 7.9997021149836165, + "grad_norm": 0.0361328125, + "learning_rate": 0.022501013046542093, + "loss": 0.7873, + "num_input_tokens_seen": 31180224, + "step": 53710 + }, + { + "epoch": 8.0, + "eval_loss": 0.8020382523536682, + "eval_runtime": 70.7731, + "eval_samples_per_second": 42.163, + "eval_steps_per_second": 10.541, + "num_input_tokens_seen": 31180904, + "step": 53712 + }, + { + "epoch": 8.000446827524575, + "grad_norm": 0.01611328125, + "learning_rate": 0.022499324610297248, + "loss": 0.804, + "num_input_tokens_seen": 31182472, + "step": 53715 + }, + { + "epoch": 8.001191540065534, + "grad_norm": 0.022705078125, + "learning_rate": 0.02249763604735894, + "loss": 0.8004, + "num_input_tokens_seen": 31185448, + "step": 53720 + }, + { + "epoch": 8.001936252606495, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02249594735775569, + "loss": 0.7829, + "num_input_tokens_seen": 31188168, + "step": 53725 + }, + { + "epoch": 8.002680965147453, + "grad_norm": 0.0233154296875, + "learning_rate": 0.02249425854151603, + "loss": 0.7923, + "num_input_tokens_seen": 31190984, + "step": 53730 + }, + { + "epoch": 8.003425677688412, + "grad_norm": 0.0291748046875, + "learning_rate": 0.022492569598668487, + "loss": 0.7797, + "num_input_tokens_seen": 31193960, + "step": 53735 + }, + { + "epoch": 8.00417039022937, + "grad_norm": 0.03515625, + "learning_rate": 0.02249088052924159, + "loss": 0.8292, + "num_input_tokens_seen": 31197192, + "step": 53740 + }, + { + "epoch": 8.004915102770331, + "grad_norm": 0.0255126953125, + "learning_rate": 0.022489191333263887, + "loss": 0.7918, + "num_input_tokens_seen": 31199944, + "step": 53745 + }, + { + "epoch": 8.00565981531129, + "grad_norm": 0.0242919921875, + "learning_rate": 0.022487502010763907, + "loss": 0.8241, + "num_input_tokens_seen": 31202888, + "step": 53750 + }, + { + "epoch": 8.006404527852249, + "grad_norm": 0.0400390625, + "learning_rate": 0.022485812561770187, + "loss": 0.8301, + "num_input_tokens_seen": 31205960, + "step": 53755 + }, + { + "epoch": 8.007149240393208, + "grad_norm": 0.0113525390625, + "learning_rate": 0.02248412298631127, + "loss": 0.7769, + "num_input_tokens_seen": 31208968, + "step": 53760 + }, + { + "epoch": 8.007893952934168, + "grad_norm": 0.03271484375, + "learning_rate": 0.02248243328441571, + "loss": 0.8259, + "num_input_tokens_seen": 31211880, + "step": 53765 + }, + { + "epoch": 8.008638665475127, + "grad_norm": 0.0228271484375, + "learning_rate": 0.02248074345611204, + "loss": 0.8146, + "num_input_tokens_seen": 31214760, + "step": 53770 + }, + { + "epoch": 8.009383378016086, + "grad_norm": 0.0169677734375, + "learning_rate": 0.022479053501428814, + "loss": 0.8088, + "num_input_tokens_seen": 31217704, + "step": 53775 + }, + { + "epoch": 8.010128090557044, + "grad_norm": 0.02490234375, + "learning_rate": 0.02247736342039458, + "loss": 0.7821, + "num_input_tokens_seen": 31220520, + "step": 53780 + }, + { + "epoch": 8.010872803098005, + "grad_norm": 0.02392578125, + "learning_rate": 0.022475673213037888, + "loss": 0.7828, + "num_input_tokens_seen": 31223432, + "step": 53785 + }, + { + "epoch": 8.011617515638964, + "grad_norm": 0.0194091796875, + "learning_rate": 0.0224739828793873, + "loss": 0.8364, + "num_input_tokens_seen": 31226152, + "step": 53790 + }, + { + "epoch": 8.012362228179922, + "grad_norm": 0.021484375, + "learning_rate": 0.022472292419471362, + "loss": 0.8121, + "num_input_tokens_seen": 31228904, + "step": 53795 + }, + { + "epoch": 8.013106940720881, + "grad_norm": 0.020751953125, + "learning_rate": 0.022470601833318643, + "loss": 0.8066, + "num_input_tokens_seen": 31231816, + "step": 53800 + }, + { + "epoch": 8.013851653261842, + "grad_norm": 0.0242919921875, + "learning_rate": 0.022468911120957696, + "loss": 0.7875, + "num_input_tokens_seen": 31234952, + "step": 53805 + }, + { + "epoch": 8.0145963658028, + "grad_norm": 0.014892578125, + "learning_rate": 0.022467220282417086, + "loss": 0.8233, + "num_input_tokens_seen": 31238088, + "step": 53810 + }, + { + "epoch": 8.01534107834376, + "grad_norm": 0.02001953125, + "learning_rate": 0.02246552931772538, + "loss": 0.7931, + "num_input_tokens_seen": 31240936, + "step": 53815 + }, + { + "epoch": 8.016085790884718, + "grad_norm": 0.01446533203125, + "learning_rate": 0.022463838226911143, + "loss": 0.8051, + "num_input_tokens_seen": 31243848, + "step": 53820 + }, + { + "epoch": 8.016830503425677, + "grad_norm": 0.01348876953125, + "learning_rate": 0.022462147010002945, + "loss": 0.7965, + "num_input_tokens_seen": 31246920, + "step": 53825 + }, + { + "epoch": 8.017575215966637, + "grad_norm": 0.0235595703125, + "learning_rate": 0.02246045566702936, + "loss": 0.7851, + "num_input_tokens_seen": 31249768, + "step": 53830 + }, + { + "epoch": 8.018319928507596, + "grad_norm": 0.034912109375, + "learning_rate": 0.022458764198018955, + "loss": 0.8002, + "num_input_tokens_seen": 31252328, + "step": 53835 + }, + { + "epoch": 8.019064641048555, + "grad_norm": 0.0234375, + "learning_rate": 0.022457072603000314, + "loss": 0.7882, + "num_input_tokens_seen": 31255432, + "step": 53840 + }, + { + "epoch": 8.019809353589514, + "grad_norm": 0.022216796875, + "learning_rate": 0.02245538088200201, + "loss": 0.8293, + "num_input_tokens_seen": 31258504, + "step": 53845 + }, + { + "epoch": 8.020554066130474, + "grad_norm": 0.027587890625, + "learning_rate": 0.02245368903505262, + "loss": 0.7924, + "num_input_tokens_seen": 31261480, + "step": 53850 + }, + { + "epoch": 8.021298778671433, + "grad_norm": 0.0274658203125, + "learning_rate": 0.022451997062180727, + "loss": 0.825, + "num_input_tokens_seen": 31264328, + "step": 53855 + }, + { + "epoch": 8.022043491212392, + "grad_norm": 0.02685546875, + "learning_rate": 0.022450304963414925, + "loss": 0.8087, + "num_input_tokens_seen": 31267368, + "step": 53860 + }, + { + "epoch": 8.02278820375335, + "grad_norm": 0.0205078125, + "learning_rate": 0.022448612738783787, + "loss": 0.7949, + "num_input_tokens_seen": 31270344, + "step": 53865 + }, + { + "epoch": 8.023532916294311, + "grad_norm": 0.013427734375, + "learning_rate": 0.022446920388315913, + "loss": 0.7906, + "num_input_tokens_seen": 31273192, + "step": 53870 + }, + { + "epoch": 8.02427762883527, + "grad_norm": 0.029052734375, + "learning_rate": 0.022445227912039883, + "loss": 0.818, + "num_input_tokens_seen": 31275720, + "step": 53875 + }, + { + "epoch": 8.025022341376228, + "grad_norm": 0.05615234375, + "learning_rate": 0.022443535309984292, + "loss": 0.8266, + "num_input_tokens_seen": 31278600, + "step": 53880 + }, + { + "epoch": 8.025767053917187, + "grad_norm": 0.0228271484375, + "learning_rate": 0.02244184258217774, + "loss": 0.8403, + "num_input_tokens_seen": 31281288, + "step": 53885 + }, + { + "epoch": 8.026511766458148, + "grad_norm": 0.0284423828125, + "learning_rate": 0.02244014972864882, + "loss": 0.8197, + "num_input_tokens_seen": 31284488, + "step": 53890 + }, + { + "epoch": 8.027256478999107, + "grad_norm": 0.041015625, + "learning_rate": 0.022438456749426138, + "loss": 0.8115, + "num_input_tokens_seen": 31287624, + "step": 53895 + }, + { + "epoch": 8.028001191540065, + "grad_norm": 0.0228271484375, + "learning_rate": 0.022436763644538282, + "loss": 0.7939, + "num_input_tokens_seen": 31290760, + "step": 53900 + }, + { + "epoch": 8.028745904081024, + "grad_norm": 0.022216796875, + "learning_rate": 0.022435070414013866, + "loss": 0.8076, + "num_input_tokens_seen": 31293768, + "step": 53905 + }, + { + "epoch": 8.029490616621985, + "grad_norm": 0.02978515625, + "learning_rate": 0.0224333770578815, + "loss": 0.8106, + "num_input_tokens_seen": 31296680, + "step": 53910 + }, + { + "epoch": 8.030235329162943, + "grad_norm": 0.026123046875, + "learning_rate": 0.02243168357616977, + "loss": 0.8269, + "num_input_tokens_seen": 31299368, + "step": 53915 + }, + { + "epoch": 8.030980041703902, + "grad_norm": 0.0201416015625, + "learning_rate": 0.022429989968907302, + "loss": 0.7927, + "num_input_tokens_seen": 31302632, + "step": 53920 + }, + { + "epoch": 8.03172475424486, + "grad_norm": 0.0341796875, + "learning_rate": 0.02242829623612271, + "loss": 0.8106, + "num_input_tokens_seen": 31305608, + "step": 53925 + }, + { + "epoch": 8.032469466785821, + "grad_norm": 0.0146484375, + "learning_rate": 0.022426602377844603, + "loss": 0.807, + "num_input_tokens_seen": 31308360, + "step": 53930 + }, + { + "epoch": 8.03321417932678, + "grad_norm": 0.0235595703125, + "learning_rate": 0.022424908394101595, + "loss": 0.7911, + "num_input_tokens_seen": 31311144, + "step": 53935 + }, + { + "epoch": 8.033958891867739, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0224232142849223, + "loss": 0.8078, + "num_input_tokens_seen": 31313960, + "step": 53940 + }, + { + "epoch": 8.034703604408698, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02242152005033535, + "loss": 0.8005, + "num_input_tokens_seen": 31317160, + "step": 53945 + }, + { + "epoch": 8.035448316949658, + "grad_norm": 0.0303955078125, + "learning_rate": 0.022419825690369367, + "loss": 0.8128, + "num_input_tokens_seen": 31320424, + "step": 53950 + }, + { + "epoch": 8.036193029490617, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02241813120505296, + "loss": 0.816, + "num_input_tokens_seen": 31323112, + "step": 53955 + }, + { + "epoch": 8.036937742031576, + "grad_norm": 0.0224609375, + "learning_rate": 0.02241643659441477, + "loss": 0.7907, + "num_input_tokens_seen": 31325864, + "step": 53960 + }, + { + "epoch": 8.037682454572534, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02241474185848342, + "loss": 0.805, + "num_input_tokens_seen": 31329032, + "step": 53965 + }, + { + "epoch": 8.038427167113495, + "grad_norm": 0.028076171875, + "learning_rate": 0.022413046997287547, + "loss": 0.8096, + "num_input_tokens_seen": 31332296, + "step": 53970 + }, + { + "epoch": 8.039171879654454, + "grad_norm": 0.0257568359375, + "learning_rate": 0.022411352010855777, + "loss": 0.8151, + "num_input_tokens_seen": 31335272, + "step": 53975 + }, + { + "epoch": 8.039916592195413, + "grad_norm": 0.022216796875, + "learning_rate": 0.022409656899216746, + "loss": 0.8094, + "num_input_tokens_seen": 31338184, + "step": 53980 + }, + { + "epoch": 8.040661304736371, + "grad_norm": 0.0277099609375, + "learning_rate": 0.022407961662399088, + "loss": 0.808, + "num_input_tokens_seen": 31341064, + "step": 53985 + }, + { + "epoch": 8.041406017277332, + "grad_norm": 0.029296875, + "learning_rate": 0.022406266300431455, + "loss": 0.812, + "num_input_tokens_seen": 31344104, + "step": 53990 + }, + { + "epoch": 8.04215072981829, + "grad_norm": 0.041015625, + "learning_rate": 0.022404570813342475, + "loss": 0.8031, + "num_input_tokens_seen": 31347112, + "step": 53995 + }, + { + "epoch": 8.04289544235925, + "grad_norm": 0.044921875, + "learning_rate": 0.022402875201160802, + "loss": 0.8078, + "num_input_tokens_seen": 31350088, + "step": 54000 + }, + { + "epoch": 8.043640154900208, + "grad_norm": 0.03125, + "learning_rate": 0.022401179463915077, + "loss": 0.798, + "num_input_tokens_seen": 31352872, + "step": 54005 + }, + { + "epoch": 8.044384867441167, + "grad_norm": 0.0242919921875, + "learning_rate": 0.022399483601633942, + "loss": 0.8036, + "num_input_tokens_seen": 31355720, + "step": 54010 + }, + { + "epoch": 8.045129579982127, + "grad_norm": 0.0230712890625, + "learning_rate": 0.02239778761434605, + "loss": 0.8098, + "num_input_tokens_seen": 31358600, + "step": 54015 + }, + { + "epoch": 8.045874292523086, + "grad_norm": 0.025634765625, + "learning_rate": 0.022396091502080058, + "loss": 0.7994, + "num_input_tokens_seen": 31361480, + "step": 54020 + }, + { + "epoch": 8.046619005064045, + "grad_norm": 0.036865234375, + "learning_rate": 0.02239439526486462, + "loss": 0.8116, + "num_input_tokens_seen": 31364648, + "step": 54025 + }, + { + "epoch": 8.047363717605004, + "grad_norm": 0.0238037109375, + "learning_rate": 0.022392698902728388, + "loss": 0.8047, + "num_input_tokens_seen": 31367272, + "step": 54030 + }, + { + "epoch": 8.048108430145964, + "grad_norm": 0.0289306640625, + "learning_rate": 0.02239100241570002, + "loss": 0.8024, + "num_input_tokens_seen": 31370472, + "step": 54035 + }, + { + "epoch": 8.048853142686923, + "grad_norm": 0.02392578125, + "learning_rate": 0.02238930580380818, + "loss": 0.8101, + "num_input_tokens_seen": 31373608, + "step": 54040 + }, + { + "epoch": 8.049597855227882, + "grad_norm": 0.028564453125, + "learning_rate": 0.022387609067081527, + "loss": 0.8121, + "num_input_tokens_seen": 31376552, + "step": 54045 + }, + { + "epoch": 8.05034256776884, + "grad_norm": 0.0240478515625, + "learning_rate": 0.022385912205548725, + "loss": 0.7968, + "num_input_tokens_seen": 31379304, + "step": 54050 + }, + { + "epoch": 8.051087280309801, + "grad_norm": 0.0240478515625, + "learning_rate": 0.022384215219238446, + "loss": 0.8131, + "num_input_tokens_seen": 31382152, + "step": 54055 + }, + { + "epoch": 8.05183199285076, + "grad_norm": 0.016845703125, + "learning_rate": 0.02238251810817936, + "loss": 0.7913, + "num_input_tokens_seen": 31385224, + "step": 54060 + }, + { + "epoch": 8.052576705391719, + "grad_norm": 0.0284423828125, + "learning_rate": 0.022380820872400127, + "loss": 0.8063, + "num_input_tokens_seen": 31388168, + "step": 54065 + }, + { + "epoch": 8.053321417932677, + "grad_norm": 0.03271484375, + "learning_rate": 0.02237912351192943, + "loss": 0.7928, + "num_input_tokens_seen": 31390888, + "step": 54070 + }, + { + "epoch": 8.054066130473638, + "grad_norm": 0.0233154296875, + "learning_rate": 0.02237742602679594, + "loss": 0.8025, + "num_input_tokens_seen": 31393640, + "step": 54075 + }, + { + "epoch": 8.054810843014597, + "grad_norm": 0.0289306640625, + "learning_rate": 0.022375728417028333, + "loss": 0.7929, + "num_input_tokens_seen": 31396840, + "step": 54080 + }, + { + "epoch": 8.055555555555555, + "grad_norm": 0.0267333984375, + "learning_rate": 0.022374030682655296, + "loss": 0.7971, + "num_input_tokens_seen": 31399688, + "step": 54085 + }, + { + "epoch": 8.056300268096514, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0223723328237055, + "loss": 0.8073, + "num_input_tokens_seen": 31402376, + "step": 54090 + }, + { + "epoch": 8.057044980637475, + "grad_norm": 0.02734375, + "learning_rate": 0.02237063484020764, + "loss": 0.7998, + "num_input_tokens_seen": 31405640, + "step": 54095 + }, + { + "epoch": 8.057789693178433, + "grad_norm": 0.0157470703125, + "learning_rate": 0.02236893673219039, + "loss": 0.796, + "num_input_tokens_seen": 31408712, + "step": 54100 + }, + { + "epoch": 8.058534405719392, + "grad_norm": 0.0133056640625, + "learning_rate": 0.022367238499682442, + "loss": 0.7983, + "num_input_tokens_seen": 31411496, + "step": 54105 + }, + { + "epoch": 8.059279118260351, + "grad_norm": 0.02880859375, + "learning_rate": 0.022365540142712494, + "loss": 0.798, + "num_input_tokens_seen": 31414440, + "step": 54110 + }, + { + "epoch": 8.060023830801311, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02236384166130923, + "loss": 0.7928, + "num_input_tokens_seen": 31417224, + "step": 54115 + }, + { + "epoch": 8.06076854334227, + "grad_norm": 0.0301513671875, + "learning_rate": 0.02236214305550134, + "loss": 0.8182, + "num_input_tokens_seen": 31420168, + "step": 54120 + }, + { + "epoch": 8.061513255883229, + "grad_norm": 0.0157470703125, + "learning_rate": 0.02236044432531753, + "loss": 0.8093, + "num_input_tokens_seen": 31423112, + "step": 54125 + }, + { + "epoch": 8.062257968424188, + "grad_norm": 0.0291748046875, + "learning_rate": 0.022358745470786496, + "loss": 0.8014, + "num_input_tokens_seen": 31426408, + "step": 54130 + }, + { + "epoch": 8.063002680965148, + "grad_norm": 0.02197265625, + "learning_rate": 0.02235704649193693, + "loss": 0.8011, + "num_input_tokens_seen": 31429384, + "step": 54135 + }, + { + "epoch": 8.063747393506107, + "grad_norm": 0.0302734375, + "learning_rate": 0.022355347388797544, + "loss": 0.7948, + "num_input_tokens_seen": 31432328, + "step": 54140 + }, + { + "epoch": 8.064492106047066, + "grad_norm": 0.031982421875, + "learning_rate": 0.02235364816139704, + "loss": 0.8358, + "num_input_tokens_seen": 31435400, + "step": 54145 + }, + { + "epoch": 8.065236818588025, + "grad_norm": 0.025634765625, + "learning_rate": 0.022351948809764124, + "loss": 0.8067, + "num_input_tokens_seen": 31438216, + "step": 54150 + }, + { + "epoch": 8.065981531128985, + "grad_norm": 0.0225830078125, + "learning_rate": 0.022350249333927506, + "loss": 0.7856, + "num_input_tokens_seen": 31441128, + "step": 54155 + }, + { + "epoch": 8.066726243669944, + "grad_norm": 0.033935546875, + "learning_rate": 0.022348549733915892, + "loss": 0.7974, + "num_input_tokens_seen": 31444136, + "step": 54160 + }, + { + "epoch": 8.067470956210903, + "grad_norm": 0.018310546875, + "learning_rate": 0.022346850009758005, + "loss": 0.8158, + "num_input_tokens_seen": 31447432, + "step": 54165 + }, + { + "epoch": 8.068215668751861, + "grad_norm": 0.026123046875, + "learning_rate": 0.02234515016148255, + "loss": 0.7832, + "num_input_tokens_seen": 31450440, + "step": 54170 + }, + { + "epoch": 8.06896038129282, + "grad_norm": 0.0260009765625, + "learning_rate": 0.022343450189118246, + "loss": 0.7759, + "num_input_tokens_seen": 31453288, + "step": 54175 + }, + { + "epoch": 8.06970509383378, + "grad_norm": 0.023681640625, + "learning_rate": 0.022341750092693822, + "loss": 0.8169, + "num_input_tokens_seen": 31456136, + "step": 54180 + }, + { + "epoch": 8.07044980637474, + "grad_norm": 0.014404296875, + "learning_rate": 0.022340049872237987, + "loss": 0.8222, + "num_input_tokens_seen": 31459048, + "step": 54185 + }, + { + "epoch": 8.071194518915698, + "grad_norm": 0.04248046875, + "learning_rate": 0.02233834952777947, + "loss": 0.817, + "num_input_tokens_seen": 31461928, + "step": 54190 + }, + { + "epoch": 8.071939231456657, + "grad_norm": 0.026611328125, + "learning_rate": 0.022336649059346996, + "loss": 0.8173, + "num_input_tokens_seen": 31464872, + "step": 54195 + }, + { + "epoch": 8.072683943997617, + "grad_norm": 0.0274658203125, + "learning_rate": 0.022334948466969293, + "loss": 0.7753, + "num_input_tokens_seen": 31467464, + "step": 54200 + }, + { + "epoch": 8.073428656538576, + "grad_norm": 0.0198974609375, + "learning_rate": 0.022333247750675092, + "loss": 0.7913, + "num_input_tokens_seen": 31470344, + "step": 54205 + }, + { + "epoch": 8.074173369079535, + "grad_norm": 0.0224609375, + "learning_rate": 0.022331546910493123, + "loss": 0.8002, + "num_input_tokens_seen": 31473352, + "step": 54210 + }, + { + "epoch": 8.074918081620494, + "grad_norm": 0.0234375, + "learning_rate": 0.02232984594645212, + "loss": 0.8202, + "num_input_tokens_seen": 31476328, + "step": 54215 + }, + { + "epoch": 8.075662794161454, + "grad_norm": 0.03466796875, + "learning_rate": 0.022328144858580815, + "loss": 0.7896, + "num_input_tokens_seen": 31479656, + "step": 54220 + }, + { + "epoch": 8.076407506702413, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02232644364690796, + "loss": 0.8159, + "num_input_tokens_seen": 31482536, + "step": 54225 + }, + { + "epoch": 8.077152219243372, + "grad_norm": 0.0311279296875, + "learning_rate": 0.022324742311462278, + "loss": 0.8148, + "num_input_tokens_seen": 31485480, + "step": 54230 + }, + { + "epoch": 8.07789693178433, + "grad_norm": 0.023681640625, + "learning_rate": 0.022323040852272522, + "loss": 0.8129, + "num_input_tokens_seen": 31488392, + "step": 54235 + }, + { + "epoch": 8.078641644325291, + "grad_norm": 0.024658203125, + "learning_rate": 0.022321339269367434, + "loss": 0.8021, + "num_input_tokens_seen": 31491304, + "step": 54240 + }, + { + "epoch": 8.07938635686625, + "grad_norm": 0.0225830078125, + "learning_rate": 0.02231963756277576, + "loss": 0.7996, + "num_input_tokens_seen": 31494280, + "step": 54245 + }, + { + "epoch": 8.080131069407209, + "grad_norm": 0.0211181640625, + "learning_rate": 0.022317935732526245, + "loss": 0.7853, + "num_input_tokens_seen": 31497128, + "step": 54250 + }, + { + "epoch": 8.080875781948167, + "grad_norm": 0.03759765625, + "learning_rate": 0.022316233778647648, + "loss": 0.8027, + "num_input_tokens_seen": 31499848, + "step": 54255 + }, + { + "epoch": 8.081620494489128, + "grad_norm": 0.0240478515625, + "learning_rate": 0.022314531701168715, + "loss": 0.855, + "num_input_tokens_seen": 31502760, + "step": 54260 + }, + { + "epoch": 8.082365207030087, + "grad_norm": 0.021484375, + "learning_rate": 0.022312829500118207, + "loss": 0.7939, + "num_input_tokens_seen": 31505480, + "step": 54265 + }, + { + "epoch": 8.083109919571045, + "grad_norm": 0.026611328125, + "learning_rate": 0.022311127175524872, + "loss": 0.817, + "num_input_tokens_seen": 31508104, + "step": 54270 + }, + { + "epoch": 8.083854632112004, + "grad_norm": 0.03271484375, + "learning_rate": 0.022309424727417475, + "loss": 0.7977, + "num_input_tokens_seen": 31511272, + "step": 54275 + }, + { + "epoch": 8.084599344652965, + "grad_norm": 0.0224609375, + "learning_rate": 0.02230772215582478, + "loss": 0.8093, + "num_input_tokens_seen": 31514216, + "step": 54280 + }, + { + "epoch": 8.085344057193923, + "grad_norm": 0.01544189453125, + "learning_rate": 0.022306019460775543, + "loss": 0.8148, + "num_input_tokens_seen": 31517096, + "step": 54285 + }, + { + "epoch": 8.086088769734882, + "grad_norm": 0.0361328125, + "learning_rate": 0.02230431664229853, + "loss": 0.7919, + "num_input_tokens_seen": 31519912, + "step": 54290 + }, + { + "epoch": 8.086833482275841, + "grad_norm": 0.025146484375, + "learning_rate": 0.022302613700422518, + "loss": 0.7994, + "num_input_tokens_seen": 31522824, + "step": 54295 + }, + { + "epoch": 8.087578194816802, + "grad_norm": 0.035888671875, + "learning_rate": 0.022300910635176266, + "loss": 0.8156, + "num_input_tokens_seen": 31525960, + "step": 54300 + }, + { + "epoch": 8.08832290735776, + "grad_norm": 0.021240234375, + "learning_rate": 0.02229920744658855, + "loss": 0.8071, + "num_input_tokens_seen": 31528936, + "step": 54305 + }, + { + "epoch": 8.089067619898719, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02229750413468814, + "loss": 0.8145, + "num_input_tokens_seen": 31531656, + "step": 54310 + }, + { + "epoch": 8.089812332439678, + "grad_norm": 0.0238037109375, + "learning_rate": 0.022295800699503814, + "loss": 0.7879, + "num_input_tokens_seen": 31534440, + "step": 54315 + }, + { + "epoch": 8.090557044980638, + "grad_norm": 0.0174560546875, + "learning_rate": 0.022294097141064353, + "loss": 0.8027, + "num_input_tokens_seen": 31537128, + "step": 54320 + }, + { + "epoch": 8.091301757521597, + "grad_norm": 0.0301513671875, + "learning_rate": 0.022292393459398534, + "loss": 0.791, + "num_input_tokens_seen": 31539848, + "step": 54325 + }, + { + "epoch": 8.092046470062556, + "grad_norm": 0.0223388671875, + "learning_rate": 0.022290689654535144, + "loss": 0.797, + "num_input_tokens_seen": 31542792, + "step": 54330 + }, + { + "epoch": 8.092791182603515, + "grad_norm": 0.028564453125, + "learning_rate": 0.022288985726502956, + "loss": 0.8037, + "num_input_tokens_seen": 31545640, + "step": 54335 + }, + { + "epoch": 8.093535895144473, + "grad_norm": 0.0213623046875, + "learning_rate": 0.022287281675330763, + "loss": 0.8013, + "num_input_tokens_seen": 31548392, + "step": 54340 + }, + { + "epoch": 8.094280607685434, + "grad_norm": 0.0216064453125, + "learning_rate": 0.022285577501047354, + "loss": 0.8077, + "num_input_tokens_seen": 31551240, + "step": 54345 + }, + { + "epoch": 8.095025320226393, + "grad_norm": 0.0220947265625, + "learning_rate": 0.022283873203681512, + "loss": 0.8082, + "num_input_tokens_seen": 31554472, + "step": 54350 + }, + { + "epoch": 8.095770032767351, + "grad_norm": 0.034912109375, + "learning_rate": 0.022282168783262044, + "loss": 0.8071, + "num_input_tokens_seen": 31557224, + "step": 54355 + }, + { + "epoch": 8.09651474530831, + "grad_norm": 0.0234375, + "learning_rate": 0.02228046423981773, + "loss": 0.8058, + "num_input_tokens_seen": 31560168, + "step": 54360 + }, + { + "epoch": 8.09725945784927, + "grad_norm": 0.0218505859375, + "learning_rate": 0.022278759573377376, + "loss": 0.8055, + "num_input_tokens_seen": 31562920, + "step": 54365 + }, + { + "epoch": 8.09800417039023, + "grad_norm": 0.0257568359375, + "learning_rate": 0.022277054783969776, + "loss": 0.7852, + "num_input_tokens_seen": 31566056, + "step": 54370 + }, + { + "epoch": 8.098748882931188, + "grad_norm": 0.02294921875, + "learning_rate": 0.02227534987162373, + "loss": 0.7831, + "num_input_tokens_seen": 31568744, + "step": 54375 + }, + { + "epoch": 8.099493595472147, + "grad_norm": 0.0233154296875, + "learning_rate": 0.02227364483636804, + "loss": 0.7902, + "num_input_tokens_seen": 31571688, + "step": 54380 + }, + { + "epoch": 8.100238308013108, + "grad_norm": 0.0220947265625, + "learning_rate": 0.022271939678231517, + "loss": 0.825, + "num_input_tokens_seen": 31574632, + "step": 54385 + }, + { + "epoch": 8.100983020554066, + "grad_norm": 0.0302734375, + "learning_rate": 0.022270234397242964, + "loss": 0.7977, + "num_input_tokens_seen": 31577672, + "step": 54390 + }, + { + "epoch": 8.101727733095025, + "grad_norm": 0.0283203125, + "learning_rate": 0.022268528993431187, + "loss": 0.8093, + "num_input_tokens_seen": 31580616, + "step": 54395 + }, + { + "epoch": 8.102472445635984, + "grad_norm": 0.022216796875, + "learning_rate": 0.022266823466825004, + "loss": 0.8093, + "num_input_tokens_seen": 31583368, + "step": 54400 + }, + { + "epoch": 8.103217158176944, + "grad_norm": 0.0284423828125, + "learning_rate": 0.02226511781745322, + "loss": 0.7852, + "num_input_tokens_seen": 31586280, + "step": 54405 + }, + { + "epoch": 8.103961870717903, + "grad_norm": 0.0291748046875, + "learning_rate": 0.022263412045344655, + "loss": 0.7831, + "num_input_tokens_seen": 31589032, + "step": 54410 + }, + { + "epoch": 8.104706583258862, + "grad_norm": 0.0240478515625, + "learning_rate": 0.02226170615052813, + "loss": 0.8008, + "num_input_tokens_seen": 31591848, + "step": 54415 + }, + { + "epoch": 8.10545129579982, + "grad_norm": 0.0206298828125, + "learning_rate": 0.022260000133032455, + "loss": 0.7819, + "num_input_tokens_seen": 31594888, + "step": 54420 + }, + { + "epoch": 8.106196008340781, + "grad_norm": 0.01556396484375, + "learning_rate": 0.022258293992886462, + "loss": 0.7859, + "num_input_tokens_seen": 31598216, + "step": 54425 + }, + { + "epoch": 8.10694072088174, + "grad_norm": 0.029052734375, + "learning_rate": 0.022256587730118967, + "loss": 0.7876, + "num_input_tokens_seen": 31601128, + "step": 54430 + }, + { + "epoch": 8.107685433422699, + "grad_norm": 0.031005859375, + "learning_rate": 0.022254881344758795, + "loss": 0.8056, + "num_input_tokens_seen": 31603912, + "step": 54435 + }, + { + "epoch": 8.108430145963657, + "grad_norm": 0.0303955078125, + "learning_rate": 0.022253174836834776, + "loss": 0.7892, + "num_input_tokens_seen": 31606856, + "step": 54440 + }, + { + "epoch": 8.109174858504618, + "grad_norm": 0.0235595703125, + "learning_rate": 0.02225146820637574, + "loss": 0.7823, + "num_input_tokens_seen": 31609544, + "step": 54445 + }, + { + "epoch": 8.109919571045577, + "grad_norm": 0.0218505859375, + "learning_rate": 0.022249761453410517, + "loss": 0.7655, + "num_input_tokens_seen": 31612520, + "step": 54450 + }, + { + "epoch": 8.110664283586535, + "grad_norm": 0.0220947265625, + "learning_rate": 0.022248054577967948, + "loss": 0.7802, + "num_input_tokens_seen": 31615368, + "step": 54455 + }, + { + "epoch": 8.111408996127494, + "grad_norm": 0.0213623046875, + "learning_rate": 0.022246347580076865, + "loss": 0.8261, + "num_input_tokens_seen": 31618184, + "step": 54460 + }, + { + "epoch": 8.112153708668455, + "grad_norm": 0.022216796875, + "learning_rate": 0.0222446404597661, + "loss": 0.8033, + "num_input_tokens_seen": 31620968, + "step": 54465 + }, + { + "epoch": 8.112898421209414, + "grad_norm": 0.03271484375, + "learning_rate": 0.022242933217064494, + "loss": 0.8017, + "num_input_tokens_seen": 31623976, + "step": 54470 + }, + { + "epoch": 8.113643133750372, + "grad_norm": 0.0264892578125, + "learning_rate": 0.022241225852000904, + "loss": 0.7892, + "num_input_tokens_seen": 31626920, + "step": 54475 + }, + { + "epoch": 8.114387846291331, + "grad_norm": 0.020263671875, + "learning_rate": 0.022239518364604152, + "loss": 0.8022, + "num_input_tokens_seen": 31629832, + "step": 54480 + }, + { + "epoch": 8.115132558832292, + "grad_norm": 0.0220947265625, + "learning_rate": 0.0222378107549031, + "loss": 0.7679, + "num_input_tokens_seen": 31632776, + "step": 54485 + }, + { + "epoch": 8.11587727137325, + "grad_norm": 0.0152587890625, + "learning_rate": 0.022236103022926588, + "loss": 0.7816, + "num_input_tokens_seen": 31635656, + "step": 54490 + }, + { + "epoch": 8.116621983914209, + "grad_norm": 0.035888671875, + "learning_rate": 0.022234395168703475, + "loss": 0.8069, + "num_input_tokens_seen": 31638504, + "step": 54495 + }, + { + "epoch": 8.117366696455168, + "grad_norm": 0.0234375, + "learning_rate": 0.022232687192262603, + "loss": 0.7815, + "num_input_tokens_seen": 31641544, + "step": 54500 + }, + { + "epoch": 8.118111408996128, + "grad_norm": 0.0296630859375, + "learning_rate": 0.022230979093632836, + "loss": 0.7814, + "num_input_tokens_seen": 31644360, + "step": 54505 + }, + { + "epoch": 8.118856121537087, + "grad_norm": 0.022705078125, + "learning_rate": 0.022229270872843022, + "loss": 0.798, + "num_input_tokens_seen": 31647272, + "step": 54510 + }, + { + "epoch": 8.119600834078046, + "grad_norm": 0.03857421875, + "learning_rate": 0.02222756252992203, + "loss": 0.8257, + "num_input_tokens_seen": 31650440, + "step": 54515 + }, + { + "epoch": 8.120345546619005, + "grad_norm": 0.0247802734375, + "learning_rate": 0.022225854064898708, + "loss": 0.7714, + "num_input_tokens_seen": 31653576, + "step": 54520 + }, + { + "epoch": 8.121090259159963, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02222414547780193, + "loss": 0.827, + "num_input_tokens_seen": 31656488, + "step": 54525 + }, + { + "epoch": 8.121834971700924, + "grad_norm": 0.0294189453125, + "learning_rate": 0.022222436768660557, + "loss": 0.8096, + "num_input_tokens_seen": 31659112, + "step": 54530 + }, + { + "epoch": 8.122579684241883, + "grad_norm": 0.0235595703125, + "learning_rate": 0.022220727937503456, + "loss": 0.7991, + "num_input_tokens_seen": 31662056, + "step": 54535 + }, + { + "epoch": 8.123324396782841, + "grad_norm": 0.02490234375, + "learning_rate": 0.02221901898435949, + "loss": 0.8252, + "num_input_tokens_seen": 31664936, + "step": 54540 + }, + { + "epoch": 8.1240691093238, + "grad_norm": 0.017578125, + "learning_rate": 0.022217309909257542, + "loss": 0.8215, + "num_input_tokens_seen": 31668232, + "step": 54545 + }, + { + "epoch": 8.12481382186476, + "grad_norm": 0.0291748046875, + "learning_rate": 0.02221560071222647, + "loss": 0.8229, + "num_input_tokens_seen": 31671144, + "step": 54550 + }, + { + "epoch": 8.12555853440572, + "grad_norm": 0.0206298828125, + "learning_rate": 0.022213891393295167, + "loss": 0.7762, + "num_input_tokens_seen": 31673992, + "step": 54555 + }, + { + "epoch": 8.126303246946678, + "grad_norm": 0.021240234375, + "learning_rate": 0.022212181952492496, + "loss": 0.8055, + "num_input_tokens_seen": 31676936, + "step": 54560 + }, + { + "epoch": 8.127047959487637, + "grad_norm": 0.0255126953125, + "learning_rate": 0.022210472389847344, + "loss": 0.7883, + "num_input_tokens_seen": 31680008, + "step": 54565 + }, + { + "epoch": 8.127792672028598, + "grad_norm": 0.027099609375, + "learning_rate": 0.022208762705388588, + "loss": 0.8019, + "num_input_tokens_seen": 31682760, + "step": 54570 + }, + { + "epoch": 8.128537384569556, + "grad_norm": 0.037841796875, + "learning_rate": 0.02220705289914511, + "loss": 0.802, + "num_input_tokens_seen": 31685672, + "step": 54575 + }, + { + "epoch": 8.129282097110515, + "grad_norm": 0.0341796875, + "learning_rate": 0.0222053429711458, + "loss": 0.7629, + "num_input_tokens_seen": 31688392, + "step": 54580 + }, + { + "epoch": 8.130026809651474, + "grad_norm": 0.020751953125, + "learning_rate": 0.022203632921419542, + "loss": 0.7764, + "num_input_tokens_seen": 31691528, + "step": 54585 + }, + { + "epoch": 8.130771522192434, + "grad_norm": 0.0142822265625, + "learning_rate": 0.02220192274999523, + "loss": 0.7744, + "num_input_tokens_seen": 31694568, + "step": 54590 + }, + { + "epoch": 8.131516234733393, + "grad_norm": 0.02490234375, + "learning_rate": 0.022200212456901756, + "loss": 0.7984, + "num_input_tokens_seen": 31698152, + "step": 54595 + }, + { + "epoch": 8.132260947274352, + "grad_norm": 0.0206298828125, + "learning_rate": 0.022198502042168004, + "loss": 0.8199, + "num_input_tokens_seen": 31700968, + "step": 54600 + }, + { + "epoch": 8.13300565981531, + "grad_norm": 0.038818359375, + "learning_rate": 0.02219679150582288, + "loss": 0.8046, + "num_input_tokens_seen": 31704136, + "step": 54605 + }, + { + "epoch": 8.133750372356271, + "grad_norm": 0.02294921875, + "learning_rate": 0.022195080847895274, + "loss": 0.7749, + "num_input_tokens_seen": 31706952, + "step": 54610 + }, + { + "epoch": 8.13449508489723, + "grad_norm": 0.017822265625, + "learning_rate": 0.02219337006841409, + "loss": 0.8061, + "num_input_tokens_seen": 31709864, + "step": 54615 + }, + { + "epoch": 8.135239797438189, + "grad_norm": 0.016845703125, + "learning_rate": 0.022191659167408232, + "loss": 0.8398, + "num_input_tokens_seen": 31713128, + "step": 54620 + }, + { + "epoch": 8.135984509979147, + "grad_norm": 0.028564453125, + "learning_rate": 0.022189948144906605, + "loss": 0.7973, + "num_input_tokens_seen": 31716008, + "step": 54625 + }, + { + "epoch": 8.136729222520108, + "grad_norm": 0.0223388671875, + "learning_rate": 0.022188237000938107, + "loss": 0.7849, + "num_input_tokens_seen": 31719048, + "step": 54630 + }, + { + "epoch": 8.137473935061067, + "grad_norm": 0.01287841796875, + "learning_rate": 0.022186525735531655, + "loss": 0.7814, + "num_input_tokens_seen": 31721608, + "step": 54635 + }, + { + "epoch": 8.138218647602026, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02218481434871615, + "loss": 0.7923, + "num_input_tokens_seen": 31724520, + "step": 54640 + }, + { + "epoch": 8.138963360142984, + "grad_norm": 0.0230712890625, + "learning_rate": 0.02218310284052051, + "loss": 0.8073, + "num_input_tokens_seen": 31727240, + "step": 54645 + }, + { + "epoch": 8.139708072683945, + "grad_norm": 0.0169677734375, + "learning_rate": 0.02218139121097365, + "loss": 0.7825, + "num_input_tokens_seen": 31730120, + "step": 54650 + }, + { + "epoch": 8.140452785224904, + "grad_norm": 0.0174560546875, + "learning_rate": 0.022179679460104483, + "loss": 0.814, + "num_input_tokens_seen": 31732872, + "step": 54655 + }, + { + "epoch": 8.141197497765862, + "grad_norm": 0.033447265625, + "learning_rate": 0.02217796758794193, + "loss": 0.815, + "num_input_tokens_seen": 31735976, + "step": 54660 + }, + { + "epoch": 8.141942210306821, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02217625559451491, + "loss": 0.7858, + "num_input_tokens_seen": 31738984, + "step": 54665 + }, + { + "epoch": 8.142686922847782, + "grad_norm": 0.01275634765625, + "learning_rate": 0.02217454347985234, + "loss": 0.7793, + "num_input_tokens_seen": 31741736, + "step": 54670 + }, + { + "epoch": 8.14343163538874, + "grad_norm": 0.03076171875, + "learning_rate": 0.022172831243983154, + "loss": 0.8039, + "num_input_tokens_seen": 31745064, + "step": 54675 + }, + { + "epoch": 8.1441763479297, + "grad_norm": 0.02001953125, + "learning_rate": 0.022171118886936275, + "loss": 0.8294, + "num_input_tokens_seen": 31748072, + "step": 54680 + }, + { + "epoch": 8.144921060470658, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02216940640874063, + "loss": 0.8156, + "num_input_tokens_seen": 31750888, + "step": 54685 + }, + { + "epoch": 8.145665773011617, + "grad_norm": 0.022216796875, + "learning_rate": 0.022167693809425147, + "loss": 0.7717, + "num_input_tokens_seen": 31753896, + "step": 54690 + }, + { + "epoch": 8.146410485552577, + "grad_norm": 0.022705078125, + "learning_rate": 0.02216598108901877, + "loss": 0.8, + "num_input_tokens_seen": 31756584, + "step": 54695 + }, + { + "epoch": 8.147155198093536, + "grad_norm": 0.0242919921875, + "learning_rate": 0.02216426824755042, + "loss": 0.7702, + "num_input_tokens_seen": 31759336, + "step": 54700 + }, + { + "epoch": 8.147899910634495, + "grad_norm": 0.03564453125, + "learning_rate": 0.022162555285049037, + "loss": 0.8465, + "num_input_tokens_seen": 31762152, + "step": 54705 + }, + { + "epoch": 8.148644623175453, + "grad_norm": 0.021240234375, + "learning_rate": 0.022160842201543567, + "loss": 0.8065, + "num_input_tokens_seen": 31765160, + "step": 54710 + }, + { + "epoch": 8.149389335716414, + "grad_norm": 0.0264892578125, + "learning_rate": 0.02215912899706295, + "loss": 0.8141, + "num_input_tokens_seen": 31767976, + "step": 54715 + }, + { + "epoch": 8.150134048257373, + "grad_norm": 0.032470703125, + "learning_rate": 0.022157415671636115, + "loss": 0.7971, + "num_input_tokens_seen": 31770856, + "step": 54720 + }, + { + "epoch": 8.150878760798332, + "grad_norm": 0.0233154296875, + "learning_rate": 0.02215570222529202, + "loss": 0.78, + "num_input_tokens_seen": 31773864, + "step": 54725 + }, + { + "epoch": 8.15162347333929, + "grad_norm": 0.0145263671875, + "learning_rate": 0.02215398865805961, + "loss": 0.7864, + "num_input_tokens_seen": 31776776, + "step": 54730 + }, + { + "epoch": 8.15236818588025, + "grad_norm": 0.01519775390625, + "learning_rate": 0.02215227496996783, + "loss": 0.8164, + "num_input_tokens_seen": 31779496, + "step": 54735 + }, + { + "epoch": 8.15311289842121, + "grad_norm": 0.016357421875, + "learning_rate": 0.022150561161045638, + "loss": 0.7839, + "num_input_tokens_seen": 31782536, + "step": 54740 + }, + { + "epoch": 8.153857610962168, + "grad_norm": 0.0181884765625, + "learning_rate": 0.02214884723132198, + "loss": 0.8211, + "num_input_tokens_seen": 31785448, + "step": 54745 + }, + { + "epoch": 8.154602323503127, + "grad_norm": 0.047607421875, + "learning_rate": 0.022147133180825812, + "loss": 0.8044, + "num_input_tokens_seen": 31788232, + "step": 54750 + }, + { + "epoch": 8.155347036044088, + "grad_norm": 0.0213623046875, + "learning_rate": 0.0221454190095861, + "loss": 0.8193, + "num_input_tokens_seen": 31791240, + "step": 54755 + }, + { + "epoch": 8.156091748585046, + "grad_norm": 0.02294921875, + "learning_rate": 0.02214370471763179, + "loss": 0.8227, + "num_input_tokens_seen": 31794088, + "step": 54760 + }, + { + "epoch": 8.156836461126005, + "grad_norm": 0.0211181640625, + "learning_rate": 0.02214199030499185, + "loss": 0.8052, + "num_input_tokens_seen": 31797128, + "step": 54765 + }, + { + "epoch": 8.157581173666964, + "grad_norm": 0.016845703125, + "learning_rate": 0.022140275771695243, + "loss": 0.8289, + "num_input_tokens_seen": 31799912, + "step": 54770 + }, + { + "epoch": 8.158325886207924, + "grad_norm": 0.0230712890625, + "learning_rate": 0.022138561117770936, + "loss": 0.7847, + "num_input_tokens_seen": 31803016, + "step": 54775 + }, + { + "epoch": 8.159070598748883, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02213684634324789, + "loss": 0.8013, + "num_input_tokens_seen": 31805832, + "step": 54780 + }, + { + "epoch": 8.159815311289842, + "grad_norm": 0.016845703125, + "learning_rate": 0.02213513144815508, + "loss": 0.8114, + "num_input_tokens_seen": 31808776, + "step": 54785 + }, + { + "epoch": 8.1605600238308, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02213341643252148, + "loss": 0.8045, + "num_input_tokens_seen": 31812200, + "step": 54790 + }, + { + "epoch": 8.161304736371761, + "grad_norm": 0.021728515625, + "learning_rate": 0.02213170129637606, + "loss": 0.7847, + "num_input_tokens_seen": 31814952, + "step": 54795 + }, + { + "epoch": 8.16204944891272, + "grad_norm": 0.0322265625, + "learning_rate": 0.02212998603974779, + "loss": 0.7987, + "num_input_tokens_seen": 31817864, + "step": 54800 + }, + { + "epoch": 8.162794161453679, + "grad_norm": 0.046875, + "learning_rate": 0.022128270662665653, + "loss": 0.8261, + "num_input_tokens_seen": 31820872, + "step": 54805 + }, + { + "epoch": 8.163538873994638, + "grad_norm": 0.02294921875, + "learning_rate": 0.02212655516515863, + "loss": 0.8085, + "num_input_tokens_seen": 31823656, + "step": 54810 + }, + { + "epoch": 8.164283586535598, + "grad_norm": 0.0216064453125, + "learning_rate": 0.022124839547255696, + "loss": 0.8053, + "num_input_tokens_seen": 31826760, + "step": 54815 + }, + { + "epoch": 8.165028299076557, + "grad_norm": 0.021484375, + "learning_rate": 0.022123123808985845, + "loss": 0.7833, + "num_input_tokens_seen": 31829768, + "step": 54820 + }, + { + "epoch": 8.165773011617516, + "grad_norm": 0.0301513671875, + "learning_rate": 0.022121407950378053, + "loss": 0.8148, + "num_input_tokens_seen": 31832776, + "step": 54825 + }, + { + "epoch": 8.166517724158474, + "grad_norm": 0.0306396484375, + "learning_rate": 0.02211969197146131, + "loss": 0.8037, + "num_input_tokens_seen": 31835720, + "step": 54830 + }, + { + "epoch": 8.167262436699435, + "grad_norm": 0.0274658203125, + "learning_rate": 0.022117975872264613, + "loss": 0.799, + "num_input_tokens_seen": 31838408, + "step": 54835 + }, + { + "epoch": 8.168007149240394, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02211625965281695, + "loss": 0.7958, + "num_input_tokens_seen": 31841416, + "step": 54840 + }, + { + "epoch": 8.168751861781352, + "grad_norm": 0.021728515625, + "learning_rate": 0.022114543313147307, + "loss": 0.7874, + "num_input_tokens_seen": 31844232, + "step": 54845 + }, + { + "epoch": 8.169496574322311, + "grad_norm": 0.02587890625, + "learning_rate": 0.022112826853284685, + "loss": 0.8192, + "num_input_tokens_seen": 31847208, + "step": 54850 + }, + { + "epoch": 8.17024128686327, + "grad_norm": 0.0185546875, + "learning_rate": 0.022111110273258083, + "loss": 0.7894, + "num_input_tokens_seen": 31849832, + "step": 54855 + }, + { + "epoch": 8.17098599940423, + "grad_norm": 0.024169921875, + "learning_rate": 0.022109393573096502, + "loss": 0.8086, + "num_input_tokens_seen": 31852776, + "step": 54860 + }, + { + "epoch": 8.17173071194519, + "grad_norm": 0.0140380859375, + "learning_rate": 0.02210767675282894, + "loss": 0.8064, + "num_input_tokens_seen": 31856008, + "step": 54865 + }, + { + "epoch": 8.172475424486148, + "grad_norm": 0.0191650390625, + "learning_rate": 0.022105959812484408, + "loss": 0.8096, + "num_input_tokens_seen": 31858888, + "step": 54870 + }, + { + "epoch": 8.173220137027107, + "grad_norm": 0.0223388671875, + "learning_rate": 0.022104242752091906, + "loss": 0.7896, + "num_input_tokens_seen": 31861672, + "step": 54875 + }, + { + "epoch": 8.173964849568067, + "grad_norm": 0.0185546875, + "learning_rate": 0.02210252557168044, + "loss": 0.7913, + "num_input_tokens_seen": 31864744, + "step": 54880 + }, + { + "epoch": 8.174709562109026, + "grad_norm": 0.031494140625, + "learning_rate": 0.022100808271279023, + "loss": 0.7934, + "num_input_tokens_seen": 31867912, + "step": 54885 + }, + { + "epoch": 8.175454274649985, + "grad_norm": 0.034912109375, + "learning_rate": 0.02209909085091667, + "loss": 0.7963, + "num_input_tokens_seen": 31870856, + "step": 54890 + }, + { + "epoch": 8.176198987190944, + "grad_norm": 0.0157470703125, + "learning_rate": 0.022097373310622392, + "loss": 0.8175, + "num_input_tokens_seen": 31873832, + "step": 54895 + }, + { + "epoch": 8.176943699731904, + "grad_norm": 0.0299072265625, + "learning_rate": 0.022095655650425202, + "loss": 0.7914, + "num_input_tokens_seen": 31876424, + "step": 54900 + }, + { + "epoch": 8.177688412272863, + "grad_norm": 0.0225830078125, + "learning_rate": 0.022093937870354123, + "loss": 0.8069, + "num_input_tokens_seen": 31879368, + "step": 54905 + }, + { + "epoch": 8.178433124813822, + "grad_norm": 0.025390625, + "learning_rate": 0.022092219970438174, + "loss": 0.8183, + "num_input_tokens_seen": 31882312, + "step": 54910 + }, + { + "epoch": 8.17917783735478, + "grad_norm": 0.02490234375, + "learning_rate": 0.022090501950706377, + "loss": 0.8044, + "num_input_tokens_seen": 31885224, + "step": 54915 + }, + { + "epoch": 8.17992254989574, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02208878381118776, + "loss": 0.7738, + "num_input_tokens_seen": 31888040, + "step": 54920 + }, + { + "epoch": 8.1806672624367, + "grad_norm": 0.0203857421875, + "learning_rate": 0.022087065551911342, + "loss": 0.8079, + "num_input_tokens_seen": 31891144, + "step": 54925 + }, + { + "epoch": 8.181411974977658, + "grad_norm": 0.0177001953125, + "learning_rate": 0.022085347172906156, + "loss": 0.8234, + "num_input_tokens_seen": 31894216, + "step": 54930 + }, + { + "epoch": 8.182156687518617, + "grad_norm": 0.0269775390625, + "learning_rate": 0.022083628674201227, + "loss": 0.8037, + "num_input_tokens_seen": 31897192, + "step": 54935 + }, + { + "epoch": 8.182901400059578, + "grad_norm": 0.021240234375, + "learning_rate": 0.02208191005582559, + "loss": 0.8161, + "num_input_tokens_seen": 31900072, + "step": 54940 + }, + { + "epoch": 8.183646112600536, + "grad_norm": 0.05908203125, + "learning_rate": 0.022080191317808282, + "loss": 0.7914, + "num_input_tokens_seen": 31903112, + "step": 54945 + }, + { + "epoch": 8.184390825141495, + "grad_norm": 0.02734375, + "learning_rate": 0.022078472460178338, + "loss": 0.7799, + "num_input_tokens_seen": 31905864, + "step": 54950 + }, + { + "epoch": 8.185135537682454, + "grad_norm": 0.023681640625, + "learning_rate": 0.0220767534829648, + "loss": 0.8059, + "num_input_tokens_seen": 31909096, + "step": 54955 + }, + { + "epoch": 8.185880250223414, + "grad_norm": 0.042724609375, + "learning_rate": 0.022075034386196703, + "loss": 0.8035, + "num_input_tokens_seen": 31911848, + "step": 54960 + }, + { + "epoch": 8.186624962764373, + "grad_norm": 0.037841796875, + "learning_rate": 0.022073315169903088, + "loss": 0.7823, + "num_input_tokens_seen": 31914536, + "step": 54965 + }, + { + "epoch": 8.187369675305332, + "grad_norm": 0.0322265625, + "learning_rate": 0.022071595834113, + "loss": 0.786, + "num_input_tokens_seen": 31917448, + "step": 54970 + }, + { + "epoch": 8.18811438784629, + "grad_norm": 0.0296630859375, + "learning_rate": 0.02206987637885549, + "loss": 0.8182, + "num_input_tokens_seen": 31920456, + "step": 54975 + }, + { + "epoch": 8.188859100387251, + "grad_norm": 0.0233154296875, + "learning_rate": 0.022068156804159605, + "loss": 0.8094, + "num_input_tokens_seen": 31923368, + "step": 54980 + }, + { + "epoch": 8.18960381292821, + "grad_norm": 0.0250244140625, + "learning_rate": 0.022066437110054397, + "loss": 0.8023, + "num_input_tokens_seen": 31926376, + "step": 54985 + }, + { + "epoch": 8.190348525469169, + "grad_norm": 0.032958984375, + "learning_rate": 0.022064717296568916, + "loss": 0.7883, + "num_input_tokens_seen": 31929160, + "step": 54990 + }, + { + "epoch": 8.191093238010128, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02206299736373221, + "loss": 0.7918, + "num_input_tokens_seen": 31932040, + "step": 54995 + }, + { + "epoch": 8.191837950551088, + "grad_norm": 0.0279541015625, + "learning_rate": 0.022061277311573348, + "loss": 0.7946, + "num_input_tokens_seen": 31935432, + "step": 55000 + }, + { + "epoch": 8.192582663092047, + "grad_norm": 0.01409912109375, + "learning_rate": 0.022059557140121384, + "loss": 0.8045, + "num_input_tokens_seen": 31938120, + "step": 55005 + }, + { + "epoch": 8.193327375633006, + "grad_norm": 0.03173828125, + "learning_rate": 0.02205783684940537, + "loss": 0.7982, + "num_input_tokens_seen": 31940968, + "step": 55010 + }, + { + "epoch": 8.194072088173964, + "grad_norm": 0.016357421875, + "learning_rate": 0.02205611643945438, + "loss": 0.7895, + "num_input_tokens_seen": 31943720, + "step": 55015 + }, + { + "epoch": 8.194816800714925, + "grad_norm": 0.021484375, + "learning_rate": 0.02205439591029747, + "loss": 0.7944, + "num_input_tokens_seen": 31946664, + "step": 55020 + }, + { + "epoch": 8.195561513255884, + "grad_norm": 0.031005859375, + "learning_rate": 0.022052675261963715, + "loss": 0.7958, + "num_input_tokens_seen": 31949544, + "step": 55025 + }, + { + "epoch": 8.196306225796842, + "grad_norm": 0.0166015625, + "learning_rate": 0.022050954494482182, + "loss": 0.8142, + "num_input_tokens_seen": 31952392, + "step": 55030 + }, + { + "epoch": 8.197050938337801, + "grad_norm": 0.0303955078125, + "learning_rate": 0.022049233607881934, + "loss": 0.7951, + "num_input_tokens_seen": 31955304, + "step": 55035 + }, + { + "epoch": 8.19779565087876, + "grad_norm": 0.0205078125, + "learning_rate": 0.022047512602192055, + "loss": 0.7986, + "num_input_tokens_seen": 31958056, + "step": 55040 + }, + { + "epoch": 8.19854036341972, + "grad_norm": 0.03271484375, + "learning_rate": 0.02204579147744161, + "loss": 0.7952, + "num_input_tokens_seen": 31961096, + "step": 55045 + }, + { + "epoch": 8.19928507596068, + "grad_norm": 0.0322265625, + "learning_rate": 0.022044070233659674, + "loss": 0.8096, + "num_input_tokens_seen": 31963976, + "step": 55050 + }, + { + "epoch": 8.200029788501638, + "grad_norm": 0.0257568359375, + "learning_rate": 0.022042348870875333, + "loss": 0.7956, + "num_input_tokens_seen": 31966504, + "step": 55055 + }, + { + "epoch": 8.200774501042597, + "grad_norm": 0.026123046875, + "learning_rate": 0.022040627389117672, + "loss": 0.8189, + "num_input_tokens_seen": 31969256, + "step": 55060 + }, + { + "epoch": 8.201519213583557, + "grad_norm": 0.02978515625, + "learning_rate": 0.02203890578841576, + "loss": 0.8234, + "num_input_tokens_seen": 31972232, + "step": 55065 + }, + { + "epoch": 8.202263926124516, + "grad_norm": 0.039306640625, + "learning_rate": 0.02203718406879869, + "loss": 0.7787, + "num_input_tokens_seen": 31975176, + "step": 55070 + }, + { + "epoch": 8.203008638665475, + "grad_norm": 0.028076171875, + "learning_rate": 0.02203546223029555, + "loss": 0.787, + "num_input_tokens_seen": 31977992, + "step": 55075 + }, + { + "epoch": 8.203753351206434, + "grad_norm": 0.05712890625, + "learning_rate": 0.022033740272935424, + "loss": 0.7734, + "num_input_tokens_seen": 31980680, + "step": 55080 + }, + { + "epoch": 8.204498063747394, + "grad_norm": 0.057861328125, + "learning_rate": 0.022032018196747407, + "loss": 0.7997, + "num_input_tokens_seen": 31983432, + "step": 55085 + }, + { + "epoch": 8.205242776288353, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02203029600176059, + "loss": 0.7758, + "num_input_tokens_seen": 31986376, + "step": 55090 + }, + { + "epoch": 8.205987488829312, + "grad_norm": 0.03564453125, + "learning_rate": 0.022028573688004063, + "loss": 0.8015, + "num_input_tokens_seen": 31989032, + "step": 55095 + }, + { + "epoch": 8.20673220137027, + "grad_norm": 0.04248046875, + "learning_rate": 0.022026851255506928, + "loss": 0.8171, + "num_input_tokens_seen": 31991784, + "step": 55100 + }, + { + "epoch": 8.207476913911231, + "grad_norm": 0.03125, + "learning_rate": 0.022025128704298284, + "loss": 0.8061, + "num_input_tokens_seen": 31994504, + "step": 55105 + }, + { + "epoch": 8.20822162645219, + "grad_norm": 0.024169921875, + "learning_rate": 0.022023406034407233, + "loss": 0.8103, + "num_input_tokens_seen": 31997576, + "step": 55110 + }, + { + "epoch": 8.208966338993148, + "grad_norm": 0.031494140625, + "learning_rate": 0.02202168324586287, + "loss": 0.7928, + "num_input_tokens_seen": 32000360, + "step": 55115 + }, + { + "epoch": 8.209711051534107, + "grad_norm": 0.029541015625, + "learning_rate": 0.02201996033869431, + "loss": 0.789, + "num_input_tokens_seen": 32003080, + "step": 55120 + }, + { + "epoch": 8.210455764075068, + "grad_norm": 0.037109375, + "learning_rate": 0.022018237312930657, + "loss": 0.8063, + "num_input_tokens_seen": 32006248, + "step": 55125 + }, + { + "epoch": 8.211200476616026, + "grad_norm": 0.0252685546875, + "learning_rate": 0.022016514168601013, + "loss": 0.804, + "num_input_tokens_seen": 32009192, + "step": 55130 + }, + { + "epoch": 8.211945189156985, + "grad_norm": 0.034423828125, + "learning_rate": 0.02201479090573449, + "loss": 0.7917, + "num_input_tokens_seen": 32011848, + "step": 55135 + }, + { + "epoch": 8.212689901697944, + "grad_norm": 0.0264892578125, + "learning_rate": 0.02201306752436021, + "loss": 0.8032, + "num_input_tokens_seen": 32015016, + "step": 55140 + }, + { + "epoch": 8.213434614238905, + "grad_norm": 0.0400390625, + "learning_rate": 0.022011344024507277, + "loss": 0.8176, + "num_input_tokens_seen": 32018344, + "step": 55145 + }, + { + "epoch": 8.214179326779863, + "grad_norm": 0.036865234375, + "learning_rate": 0.022009620406204817, + "loss": 0.7956, + "num_input_tokens_seen": 32021160, + "step": 55150 + }, + { + "epoch": 8.214924039320822, + "grad_norm": 0.0213623046875, + "learning_rate": 0.022007896669481945, + "loss": 0.808, + "num_input_tokens_seen": 32024008, + "step": 55155 + }, + { + "epoch": 8.21566875186178, + "grad_norm": 0.029296875, + "learning_rate": 0.022006172814367782, + "loss": 0.8349, + "num_input_tokens_seen": 32026760, + "step": 55160 + }, + { + "epoch": 8.216413464402741, + "grad_norm": 0.028564453125, + "learning_rate": 0.022004448840891446, + "loss": 0.7903, + "num_input_tokens_seen": 32029928, + "step": 55165 + }, + { + "epoch": 8.2171581769437, + "grad_norm": 0.028076171875, + "learning_rate": 0.02200272474908207, + "loss": 0.8146, + "num_input_tokens_seen": 32032680, + "step": 55170 + }, + { + "epoch": 8.217902889484659, + "grad_norm": 0.03564453125, + "learning_rate": 0.022001000538968772, + "loss": 0.8051, + "num_input_tokens_seen": 32035592, + "step": 55175 + }, + { + "epoch": 8.218647602025618, + "grad_norm": 0.0303955078125, + "learning_rate": 0.021999276210580686, + "loss": 0.8062, + "num_input_tokens_seen": 32038696, + "step": 55180 + }, + { + "epoch": 8.219392314566578, + "grad_norm": 0.0224609375, + "learning_rate": 0.021997551763946945, + "loss": 0.794, + "num_input_tokens_seen": 32041608, + "step": 55185 + }, + { + "epoch": 8.220137027107537, + "grad_norm": 0.039306640625, + "learning_rate": 0.02199582719909668, + "loss": 0.8007, + "num_input_tokens_seen": 32044616, + "step": 55190 + }, + { + "epoch": 8.220881739648496, + "grad_norm": 0.0157470703125, + "learning_rate": 0.021994102516059016, + "loss": 0.8017, + "num_input_tokens_seen": 32047240, + "step": 55195 + }, + { + "epoch": 8.221626452189454, + "grad_norm": 0.03759765625, + "learning_rate": 0.02199237771486311, + "loss": 0.7942, + "num_input_tokens_seen": 32050184, + "step": 55200 + }, + { + "epoch": 8.222371164730415, + "grad_norm": 0.02685546875, + "learning_rate": 0.021990652795538083, + "loss": 0.8055, + "num_input_tokens_seen": 32052776, + "step": 55205 + }, + { + "epoch": 8.223115877271374, + "grad_norm": 0.0286865234375, + "learning_rate": 0.021988927758113085, + "loss": 0.8042, + "num_input_tokens_seen": 32055496, + "step": 55210 + }, + { + "epoch": 8.223860589812332, + "grad_norm": 0.032958984375, + "learning_rate": 0.021987202602617252, + "loss": 0.8037, + "num_input_tokens_seen": 32058472, + "step": 55215 + }, + { + "epoch": 8.224605302353291, + "grad_norm": 0.0247802734375, + "learning_rate": 0.021985477329079732, + "loss": 0.8083, + "num_input_tokens_seen": 32061256, + "step": 55220 + }, + { + "epoch": 8.22535001489425, + "grad_norm": 0.035888671875, + "learning_rate": 0.021983751937529678, + "loss": 0.8114, + "num_input_tokens_seen": 32064040, + "step": 55225 + }, + { + "epoch": 8.22609472743521, + "grad_norm": 0.0228271484375, + "learning_rate": 0.021982026427996227, + "loss": 0.7915, + "num_input_tokens_seen": 32066792, + "step": 55230 + }, + { + "epoch": 8.22683943997617, + "grad_norm": 0.0299072265625, + "learning_rate": 0.02198030080050854, + "loss": 0.7935, + "num_input_tokens_seen": 32069832, + "step": 55235 + }, + { + "epoch": 8.227584152517128, + "grad_norm": 0.037353515625, + "learning_rate": 0.021978575055095765, + "loss": 0.7868, + "num_input_tokens_seen": 32072552, + "step": 55240 + }, + { + "epoch": 8.228328865058087, + "grad_norm": 0.0245361328125, + "learning_rate": 0.02197684919178705, + "loss": 0.8066, + "num_input_tokens_seen": 32075304, + "step": 55245 + }, + { + "epoch": 8.229073577599047, + "grad_norm": 0.023681640625, + "learning_rate": 0.021975123210611566, + "loss": 0.8175, + "num_input_tokens_seen": 32078216, + "step": 55250 + }, + { + "epoch": 8.229818290140006, + "grad_norm": 0.0296630859375, + "learning_rate": 0.02197339711159847, + "loss": 0.8117, + "num_input_tokens_seen": 32081256, + "step": 55255 + }, + { + "epoch": 8.230563002680965, + "grad_norm": 0.029541015625, + "learning_rate": 0.021971670894776908, + "loss": 0.8026, + "num_input_tokens_seen": 32084488, + "step": 55260 + }, + { + "epoch": 8.231307715221924, + "grad_norm": 0.016357421875, + "learning_rate": 0.021969944560176053, + "loss": 0.7904, + "num_input_tokens_seen": 32087368, + "step": 55265 + }, + { + "epoch": 8.232052427762884, + "grad_norm": 0.0240478515625, + "learning_rate": 0.02196821810782507, + "loss": 0.7964, + "num_input_tokens_seen": 32090440, + "step": 55270 + }, + { + "epoch": 8.232797140303843, + "grad_norm": 0.042236328125, + "learning_rate": 0.02196649153775312, + "loss": 0.8049, + "num_input_tokens_seen": 32093160, + "step": 55275 + }, + { + "epoch": 8.233541852844802, + "grad_norm": 0.01708984375, + "learning_rate": 0.021964764849989386, + "loss": 0.7996, + "num_input_tokens_seen": 32095752, + "step": 55280 + }, + { + "epoch": 8.23428656538576, + "grad_norm": 0.02783203125, + "learning_rate": 0.02196303804456302, + "loss": 0.7736, + "num_input_tokens_seen": 32098728, + "step": 55285 + }, + { + "epoch": 8.235031277926721, + "grad_norm": 0.0257568359375, + "learning_rate": 0.021961311121503208, + "loss": 0.797, + "num_input_tokens_seen": 32101352, + "step": 55290 + }, + { + "epoch": 8.23577599046768, + "grad_norm": 0.037841796875, + "learning_rate": 0.021959584080839117, + "loss": 0.8465, + "num_input_tokens_seen": 32104200, + "step": 55295 + }, + { + "epoch": 8.236520703008638, + "grad_norm": 0.0294189453125, + "learning_rate": 0.02195785692259993, + "loss": 0.7996, + "num_input_tokens_seen": 32107048, + "step": 55300 + }, + { + "epoch": 8.237265415549597, + "grad_norm": 0.027587890625, + "learning_rate": 0.021956129646814815, + "loss": 0.7912, + "num_input_tokens_seen": 32109928, + "step": 55305 + }, + { + "epoch": 8.238010128090558, + "grad_norm": 0.02490234375, + "learning_rate": 0.021954402253512962, + "loss": 0.7846, + "num_input_tokens_seen": 32112808, + "step": 55310 + }, + { + "epoch": 8.238754840631517, + "grad_norm": 0.01904296875, + "learning_rate": 0.021952674742723555, + "loss": 0.8039, + "num_input_tokens_seen": 32115432, + "step": 55315 + }, + { + "epoch": 8.239499553172475, + "grad_norm": 0.0238037109375, + "learning_rate": 0.021950947114475775, + "loss": 0.8107, + "num_input_tokens_seen": 32118376, + "step": 55320 + }, + { + "epoch": 8.240244265713434, + "grad_norm": 0.0277099609375, + "learning_rate": 0.021949219368798806, + "loss": 0.7975, + "num_input_tokens_seen": 32121032, + "step": 55325 + }, + { + "epoch": 8.240988978254395, + "grad_norm": 0.0235595703125, + "learning_rate": 0.021947491505721835, + "loss": 0.82, + "num_input_tokens_seen": 32123880, + "step": 55330 + }, + { + "epoch": 8.241733690795353, + "grad_norm": 0.0277099609375, + "learning_rate": 0.02194576352527406, + "loss": 0.784, + "num_input_tokens_seen": 32126536, + "step": 55335 + }, + { + "epoch": 8.242478403336312, + "grad_norm": 0.0263671875, + "learning_rate": 0.021944035427484666, + "loss": 0.7914, + "num_input_tokens_seen": 32129576, + "step": 55340 + }, + { + "epoch": 8.24322311587727, + "grad_norm": 0.0341796875, + "learning_rate": 0.021942307212382852, + "loss": 0.8265, + "num_input_tokens_seen": 32132296, + "step": 55345 + }, + { + "epoch": 8.243967828418231, + "grad_norm": 0.05712890625, + "learning_rate": 0.021940578879997813, + "loss": 0.8073, + "num_input_tokens_seen": 32135144, + "step": 55350 + }, + { + "epoch": 8.24471254095919, + "grad_norm": 0.0250244140625, + "learning_rate": 0.021938850430358755, + "loss": 0.8085, + "num_input_tokens_seen": 32138280, + "step": 55355 + }, + { + "epoch": 8.245457253500149, + "grad_norm": 0.045166015625, + "learning_rate": 0.021937121863494866, + "loss": 0.7998, + "num_input_tokens_seen": 32141160, + "step": 55360 + }, + { + "epoch": 8.246201966041108, + "grad_norm": 0.05029296875, + "learning_rate": 0.021935393179435353, + "loss": 0.7977, + "num_input_tokens_seen": 32144232, + "step": 55365 + }, + { + "epoch": 8.246946678582066, + "grad_norm": 0.018798828125, + "learning_rate": 0.02193366437820942, + "loss": 0.8012, + "num_input_tokens_seen": 32147272, + "step": 55370 + }, + { + "epoch": 8.247691391123027, + "grad_norm": 0.01470947265625, + "learning_rate": 0.02193193545984628, + "loss": 0.7939, + "num_input_tokens_seen": 32149992, + "step": 55375 + }, + { + "epoch": 8.248436103663986, + "grad_norm": 0.0220947265625, + "learning_rate": 0.021930206424375133, + "loss": 0.7906, + "num_input_tokens_seen": 32153032, + "step": 55380 + }, + { + "epoch": 8.249180816204944, + "grad_norm": 0.033447265625, + "learning_rate": 0.02192847727182519, + "loss": 0.8085, + "num_input_tokens_seen": 32155688, + "step": 55385 + }, + { + "epoch": 8.249925528745903, + "grad_norm": 0.0299072265625, + "learning_rate": 0.021926748002225667, + "loss": 0.7938, + "num_input_tokens_seen": 32158504, + "step": 55390 + }, + { + "epoch": 8.250670241286864, + "grad_norm": 0.0240478515625, + "learning_rate": 0.021925018615605776, + "loss": 0.7769, + "num_input_tokens_seen": 32161320, + "step": 55395 + }, + { + "epoch": 8.251414953827823, + "grad_norm": 0.036865234375, + "learning_rate": 0.021923289111994735, + "loss": 0.8223, + "num_input_tokens_seen": 32164200, + "step": 55400 + }, + { + "epoch": 8.252159666368781, + "grad_norm": 0.02685546875, + "learning_rate": 0.021921559491421758, + "loss": 0.7996, + "num_input_tokens_seen": 32167176, + "step": 55405 + }, + { + "epoch": 8.25290437890974, + "grad_norm": 0.0230712890625, + "learning_rate": 0.02191982975391607, + "loss": 0.767, + "num_input_tokens_seen": 32169896, + "step": 55410 + }, + { + "epoch": 8.2536490914507, + "grad_norm": 0.033203125, + "learning_rate": 0.02191809989950689, + "loss": 0.7971, + "num_input_tokens_seen": 32172680, + "step": 55415 + }, + { + "epoch": 8.25439380399166, + "grad_norm": 0.031494140625, + "learning_rate": 0.021916369928223448, + "loss": 0.811, + "num_input_tokens_seen": 32175816, + "step": 55420 + }, + { + "epoch": 8.255138516532618, + "grad_norm": 0.0341796875, + "learning_rate": 0.021914639840094962, + "loss": 0.8008, + "num_input_tokens_seen": 32178600, + "step": 55425 + }, + { + "epoch": 8.255883229073577, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02191290963515066, + "loss": 0.7921, + "num_input_tokens_seen": 32181448, + "step": 55430 + }, + { + "epoch": 8.256627941614537, + "grad_norm": 0.023681640625, + "learning_rate": 0.02191117931341978, + "loss": 0.7917, + "num_input_tokens_seen": 32184104, + "step": 55435 + }, + { + "epoch": 8.257372654155496, + "grad_norm": 0.025390625, + "learning_rate": 0.02190944887493155, + "loss": 0.8285, + "num_input_tokens_seen": 32187112, + "step": 55440 + }, + { + "epoch": 8.258117366696455, + "grad_norm": 0.023681640625, + "learning_rate": 0.021907718319715205, + "loss": 0.8237, + "num_input_tokens_seen": 32190184, + "step": 55445 + }, + { + "epoch": 8.258862079237414, + "grad_norm": 0.03515625, + "learning_rate": 0.021905987647799975, + "loss": 0.8108, + "num_input_tokens_seen": 32193128, + "step": 55450 + }, + { + "epoch": 8.259606791778374, + "grad_norm": 0.04052734375, + "learning_rate": 0.02190425685921511, + "loss": 0.8005, + "num_input_tokens_seen": 32196104, + "step": 55455 + }, + { + "epoch": 8.260351504319333, + "grad_norm": 0.0260009765625, + "learning_rate": 0.021902525953989834, + "loss": 0.8234, + "num_input_tokens_seen": 32198888, + "step": 55460 + }, + { + "epoch": 8.261096216860292, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0219007949321534, + "loss": 0.8099, + "num_input_tokens_seen": 32201544, + "step": 55465 + }, + { + "epoch": 8.26184092940125, + "grad_norm": 0.0294189453125, + "learning_rate": 0.021899063793735053, + "loss": 0.8058, + "num_input_tokens_seen": 32204552, + "step": 55470 + }, + { + "epoch": 8.262585641942211, + "grad_norm": 0.041015625, + "learning_rate": 0.021897332538764032, + "loss": 0.7906, + "num_input_tokens_seen": 32207336, + "step": 55475 + }, + { + "epoch": 8.26333035448317, + "grad_norm": 0.0380859375, + "learning_rate": 0.021895601167269588, + "loss": 0.8063, + "num_input_tokens_seen": 32210184, + "step": 55480 + }, + { + "epoch": 8.264075067024129, + "grad_norm": 0.0546875, + "learning_rate": 0.021893869679280968, + "loss": 0.812, + "num_input_tokens_seen": 32213064, + "step": 55485 + }, + { + "epoch": 8.264819779565087, + "grad_norm": 0.038818359375, + "learning_rate": 0.021892138074827434, + "loss": 0.805, + "num_input_tokens_seen": 32216008, + "step": 55490 + }, + { + "epoch": 8.265564492106048, + "grad_norm": 0.017333984375, + "learning_rate": 0.021890406353938227, + "loss": 0.8065, + "num_input_tokens_seen": 32219176, + "step": 55495 + }, + { + "epoch": 8.266309204647007, + "grad_norm": 0.02587890625, + "learning_rate": 0.021888674516642607, + "loss": 0.7987, + "num_input_tokens_seen": 32222664, + "step": 55500 + }, + { + "epoch": 8.267053917187965, + "grad_norm": 0.0172119140625, + "learning_rate": 0.021886942562969833, + "loss": 0.7905, + "num_input_tokens_seen": 32225416, + "step": 55505 + }, + { + "epoch": 8.267798629728924, + "grad_norm": 0.0238037109375, + "learning_rate": 0.021885210492949163, + "loss": 0.8066, + "num_input_tokens_seen": 32228584, + "step": 55510 + }, + { + "epoch": 8.268543342269885, + "grad_norm": 0.0245361328125, + "learning_rate": 0.021883478306609864, + "loss": 0.7969, + "num_input_tokens_seen": 32231368, + "step": 55515 + }, + { + "epoch": 8.269288054810843, + "grad_norm": 0.0274658203125, + "learning_rate": 0.021881746003981194, + "loss": 0.8152, + "num_input_tokens_seen": 32234344, + "step": 55520 + }, + { + "epoch": 8.270032767351802, + "grad_norm": 0.041015625, + "learning_rate": 0.02188001358509242, + "loss": 0.8037, + "num_input_tokens_seen": 32237416, + "step": 55525 + }, + { + "epoch": 8.270777479892761, + "grad_norm": 0.034912109375, + "learning_rate": 0.021878281049972807, + "loss": 0.7941, + "num_input_tokens_seen": 32240584, + "step": 55530 + }, + { + "epoch": 8.271522192433721, + "grad_norm": 0.0211181640625, + "learning_rate": 0.021876548398651625, + "loss": 0.7888, + "num_input_tokens_seen": 32243528, + "step": 55535 + }, + { + "epoch": 8.27226690497468, + "grad_norm": 0.0196533203125, + "learning_rate": 0.02187481563115815, + "loss": 0.8049, + "num_input_tokens_seen": 32246472, + "step": 55540 + }, + { + "epoch": 8.273011617515639, + "grad_norm": 0.0250244140625, + "learning_rate": 0.021873082747521647, + "loss": 0.801, + "num_input_tokens_seen": 32249576, + "step": 55545 + }, + { + "epoch": 8.273756330056598, + "grad_norm": 0.0205078125, + "learning_rate": 0.0218713497477714, + "loss": 0.8226, + "num_input_tokens_seen": 32252104, + "step": 55550 + }, + { + "epoch": 8.274501042597556, + "grad_norm": 0.0218505859375, + "learning_rate": 0.021869616631936685, + "loss": 0.798, + "num_input_tokens_seen": 32255144, + "step": 55555 + }, + { + "epoch": 8.275245755138517, + "grad_norm": 0.0244140625, + "learning_rate": 0.02186788340004677, + "loss": 0.8211, + "num_input_tokens_seen": 32257832, + "step": 55560 + }, + { + "epoch": 8.275990467679476, + "grad_norm": 0.0308837890625, + "learning_rate": 0.021866150052130957, + "loss": 0.7979, + "num_input_tokens_seen": 32260616, + "step": 55565 + }, + { + "epoch": 8.276735180220435, + "grad_norm": 0.0255126953125, + "learning_rate": 0.02186441658821851, + "loss": 0.7854, + "num_input_tokens_seen": 32263240, + "step": 55570 + }, + { + "epoch": 8.277479892761393, + "grad_norm": 0.0286865234375, + "learning_rate": 0.021862683008338723, + "loss": 0.8018, + "num_input_tokens_seen": 32266408, + "step": 55575 + }, + { + "epoch": 8.278224605302354, + "grad_norm": 0.02734375, + "learning_rate": 0.021860949312520882, + "loss": 0.8028, + "num_input_tokens_seen": 32269384, + "step": 55580 + }, + { + "epoch": 8.278969317843313, + "grad_norm": 0.0302734375, + "learning_rate": 0.02185921550079428, + "loss": 0.8294, + "num_input_tokens_seen": 32272392, + "step": 55585 + }, + { + "epoch": 8.279714030384271, + "grad_norm": 0.0252685546875, + "learning_rate": 0.021857481573188196, + "loss": 0.8346, + "num_input_tokens_seen": 32275176, + "step": 55590 + }, + { + "epoch": 8.28045874292523, + "grad_norm": 0.02783203125, + "learning_rate": 0.02185574752973193, + "loss": 0.8182, + "num_input_tokens_seen": 32278376, + "step": 55595 + }, + { + "epoch": 8.28120345546619, + "grad_norm": 0.0250244140625, + "learning_rate": 0.02185401337045478, + "loss": 0.8, + "num_input_tokens_seen": 32281096, + "step": 55600 + }, + { + "epoch": 8.28194816800715, + "grad_norm": 0.0130615234375, + "learning_rate": 0.021852279095386045, + "loss": 0.8089, + "num_input_tokens_seen": 32283848, + "step": 55605 + }, + { + "epoch": 8.282692880548108, + "grad_norm": 0.0216064453125, + "learning_rate": 0.021850544704555015, + "loss": 0.8036, + "num_input_tokens_seen": 32286856, + "step": 55610 + }, + { + "epoch": 8.283437593089067, + "grad_norm": 0.02734375, + "learning_rate": 0.021848810197990995, + "loss": 0.8038, + "num_input_tokens_seen": 32289832, + "step": 55615 + }, + { + "epoch": 8.284182305630027, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02184707557572329, + "loss": 0.8071, + "num_input_tokens_seen": 32292616, + "step": 55620 + }, + { + "epoch": 8.284927018170986, + "grad_norm": 0.0277099609375, + "learning_rate": 0.021845340837781196, + "loss": 0.7863, + "num_input_tokens_seen": 32295752, + "step": 55625 + }, + { + "epoch": 8.285671730711945, + "grad_norm": 0.0235595703125, + "learning_rate": 0.02184360598419403, + "loss": 0.805, + "num_input_tokens_seen": 32298664, + "step": 55630 + }, + { + "epoch": 8.286416443252904, + "grad_norm": 0.01953125, + "learning_rate": 0.021841871014991094, + "loss": 0.813, + "num_input_tokens_seen": 32301480, + "step": 55635 + }, + { + "epoch": 8.287161155793864, + "grad_norm": 0.028076171875, + "learning_rate": 0.0218401359302017, + "loss": 0.812, + "num_input_tokens_seen": 32304296, + "step": 55640 + }, + { + "epoch": 8.287905868334823, + "grad_norm": 0.0172119140625, + "learning_rate": 0.021838400729855167, + "loss": 0.7982, + "num_input_tokens_seen": 32307336, + "step": 55645 + }, + { + "epoch": 8.288650580875782, + "grad_norm": 0.02587890625, + "learning_rate": 0.021836665413980803, + "loss": 0.7943, + "num_input_tokens_seen": 32310408, + "step": 55650 + }, + { + "epoch": 8.28939529341674, + "grad_norm": 0.03125, + "learning_rate": 0.021834929982607925, + "loss": 0.7987, + "num_input_tokens_seen": 32313288, + "step": 55655 + }, + { + "epoch": 8.290140005957701, + "grad_norm": 0.0245361328125, + "learning_rate": 0.021833194435765847, + "loss": 0.7963, + "num_input_tokens_seen": 32316648, + "step": 55660 + }, + { + "epoch": 8.29088471849866, + "grad_norm": 0.033447265625, + "learning_rate": 0.0218314587734839, + "loss": 0.8097, + "num_input_tokens_seen": 32319752, + "step": 55665 + }, + { + "epoch": 8.291629431039619, + "grad_norm": 0.01458740234375, + "learning_rate": 0.021829722995791397, + "loss": 0.799, + "num_input_tokens_seen": 32322568, + "step": 55670 + }, + { + "epoch": 8.292374143580577, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02182798710271766, + "loss": 0.8134, + "num_input_tokens_seen": 32325544, + "step": 55675 + }, + { + "epoch": 8.293118856121538, + "grad_norm": 0.027587890625, + "learning_rate": 0.021826251094292027, + "loss": 0.8037, + "num_input_tokens_seen": 32328840, + "step": 55680 + }, + { + "epoch": 8.293863568662497, + "grad_norm": 0.017333984375, + "learning_rate": 0.021824514970543822, + "loss": 0.7973, + "num_input_tokens_seen": 32331688, + "step": 55685 + }, + { + "epoch": 8.294608281203455, + "grad_norm": 0.024658203125, + "learning_rate": 0.02182277873150237, + "loss": 0.816, + "num_input_tokens_seen": 32334504, + "step": 55690 + }, + { + "epoch": 8.295352993744414, + "grad_norm": 0.036865234375, + "learning_rate": 0.021821042377197005, + "loss": 0.7782, + "num_input_tokens_seen": 32337672, + "step": 55695 + }, + { + "epoch": 8.296097706285375, + "grad_norm": 0.03515625, + "learning_rate": 0.02181930590765706, + "loss": 0.7961, + "num_input_tokens_seen": 32340360, + "step": 55700 + }, + { + "epoch": 8.296842418826333, + "grad_norm": 0.048583984375, + "learning_rate": 0.021817569322911876, + "loss": 0.8183, + "num_input_tokens_seen": 32343400, + "step": 55705 + }, + { + "epoch": 8.297587131367292, + "grad_norm": 0.04736328125, + "learning_rate": 0.02181583262299078, + "loss": 0.7967, + "num_input_tokens_seen": 32346184, + "step": 55710 + }, + { + "epoch": 8.298331843908251, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02181409580792313, + "loss": 0.8039, + "num_input_tokens_seen": 32349032, + "step": 55715 + }, + { + "epoch": 8.299076556449211, + "grad_norm": 0.0361328125, + "learning_rate": 0.02181235887773825, + "loss": 0.7968, + "num_input_tokens_seen": 32352264, + "step": 55720 + }, + { + "epoch": 8.29982126899017, + "grad_norm": 0.02490234375, + "learning_rate": 0.021810621832465495, + "loss": 0.8071, + "num_input_tokens_seen": 32355272, + "step": 55725 + }, + { + "epoch": 8.300565981531129, + "grad_norm": 0.0322265625, + "learning_rate": 0.021808884672134203, + "loss": 0.7954, + "num_input_tokens_seen": 32357992, + "step": 55730 + }, + { + "epoch": 8.301310694072088, + "grad_norm": 0.0263671875, + "learning_rate": 0.021807147396773724, + "loss": 0.8079, + "num_input_tokens_seen": 32360744, + "step": 55735 + }, + { + "epoch": 8.302055406613047, + "grad_norm": 0.041015625, + "learning_rate": 0.02180541000641341, + "loss": 0.8044, + "num_input_tokens_seen": 32363528, + "step": 55740 + }, + { + "epoch": 8.302800119154007, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02180367250108261, + "loss": 0.7955, + "num_input_tokens_seen": 32366344, + "step": 55745 + }, + { + "epoch": 8.303544831694966, + "grad_norm": 0.0185546875, + "learning_rate": 0.02180193488081068, + "loss": 0.8005, + "num_input_tokens_seen": 32369160, + "step": 55750 + }, + { + "epoch": 8.304289544235925, + "grad_norm": 0.028564453125, + "learning_rate": 0.02180019714562697, + "loss": 0.788, + "num_input_tokens_seen": 32371976, + "step": 55755 + }, + { + "epoch": 8.305034256776883, + "grad_norm": 0.026123046875, + "learning_rate": 0.02179845929556084, + "loss": 0.8077, + "num_input_tokens_seen": 32374696, + "step": 55760 + }, + { + "epoch": 8.305778969317844, + "grad_norm": 0.02392578125, + "learning_rate": 0.021796721330641654, + "loss": 0.8091, + "num_input_tokens_seen": 32377448, + "step": 55765 + }, + { + "epoch": 8.306523681858803, + "grad_norm": 0.0286865234375, + "learning_rate": 0.021794983250898767, + "loss": 0.7989, + "num_input_tokens_seen": 32380264, + "step": 55770 + }, + { + "epoch": 8.307268394399761, + "grad_norm": 0.0284423828125, + "learning_rate": 0.021793245056361543, + "loss": 0.8018, + "num_input_tokens_seen": 32383208, + "step": 55775 + }, + { + "epoch": 8.30801310694072, + "grad_norm": 0.038818359375, + "learning_rate": 0.021791506747059347, + "loss": 0.8082, + "num_input_tokens_seen": 32385960, + "step": 55780 + }, + { + "epoch": 8.30875781948168, + "grad_norm": 0.026611328125, + "learning_rate": 0.02178976832302155, + "loss": 0.7935, + "num_input_tokens_seen": 32388680, + "step": 55785 + }, + { + "epoch": 8.30950253202264, + "grad_norm": 0.0289306640625, + "learning_rate": 0.02178802978427752, + "loss": 0.8009, + "num_input_tokens_seen": 32391336, + "step": 55790 + }, + { + "epoch": 8.310247244563598, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02178629113085662, + "loss": 0.7949, + "num_input_tokens_seen": 32394344, + "step": 55795 + }, + { + "epoch": 8.310991957104557, + "grad_norm": 0.033935546875, + "learning_rate": 0.021784552362788232, + "loss": 0.8003, + "num_input_tokens_seen": 32397288, + "step": 55800 + }, + { + "epoch": 8.311736669645517, + "grad_norm": 0.0238037109375, + "learning_rate": 0.021782813480101727, + "loss": 0.808, + "num_input_tokens_seen": 32400040, + "step": 55805 + }, + { + "epoch": 8.312481382186476, + "grad_norm": 0.0289306640625, + "learning_rate": 0.021781074482826485, + "loss": 0.8008, + "num_input_tokens_seen": 32403016, + "step": 55810 + }, + { + "epoch": 8.313226094727435, + "grad_norm": 0.01806640625, + "learning_rate": 0.021779335370991876, + "loss": 0.7972, + "num_input_tokens_seen": 32405992, + "step": 55815 + }, + { + "epoch": 8.313970807268394, + "grad_norm": 0.040771484375, + "learning_rate": 0.021777596144627293, + "loss": 0.7976, + "num_input_tokens_seen": 32408904, + "step": 55820 + }, + { + "epoch": 8.314715519809354, + "grad_norm": 0.035400390625, + "learning_rate": 0.021775856803762105, + "loss": 0.7788, + "num_input_tokens_seen": 32411720, + "step": 55825 + }, + { + "epoch": 8.315460232350313, + "grad_norm": 0.032470703125, + "learning_rate": 0.021774117348425704, + "loss": 0.7969, + "num_input_tokens_seen": 32415016, + "step": 55830 + }, + { + "epoch": 8.316204944891272, + "grad_norm": 0.031494140625, + "learning_rate": 0.02177237777864748, + "loss": 0.8076, + "num_input_tokens_seen": 32417896, + "step": 55835 + }, + { + "epoch": 8.31694965743223, + "grad_norm": 0.042724609375, + "learning_rate": 0.021770638094456805, + "loss": 0.8046, + "num_input_tokens_seen": 32420936, + "step": 55840 + }, + { + "epoch": 8.317694369973191, + "grad_norm": 0.0177001953125, + "learning_rate": 0.021768898295883094, + "loss": 0.7844, + "num_input_tokens_seen": 32423784, + "step": 55845 + }, + { + "epoch": 8.31843908251415, + "grad_norm": 0.0230712890625, + "learning_rate": 0.02176715838295572, + "loss": 0.8069, + "num_input_tokens_seen": 32426536, + "step": 55850 + }, + { + "epoch": 8.319183795055109, + "grad_norm": 0.0269775390625, + "learning_rate": 0.021765418355704086, + "loss": 0.7674, + "num_input_tokens_seen": 32429512, + "step": 55855 + }, + { + "epoch": 8.319928507596067, + "grad_norm": 0.0277099609375, + "learning_rate": 0.021763678214157582, + "loss": 0.8005, + "num_input_tokens_seen": 32432456, + "step": 55860 + }, + { + "epoch": 8.320673220137028, + "grad_norm": 0.0250244140625, + "learning_rate": 0.021761937958345606, + "loss": 0.8128, + "num_input_tokens_seen": 32435656, + "step": 55865 + }, + { + "epoch": 8.321417932677987, + "grad_norm": 0.024169921875, + "learning_rate": 0.021760197588297565, + "loss": 0.7994, + "num_input_tokens_seen": 32438472, + "step": 55870 + }, + { + "epoch": 8.322162645218945, + "grad_norm": 0.03173828125, + "learning_rate": 0.021758457104042856, + "loss": 0.8061, + "num_input_tokens_seen": 32441416, + "step": 55875 + }, + { + "epoch": 8.322907357759904, + "grad_norm": 0.0220947265625, + "learning_rate": 0.021756716505610882, + "loss": 0.7852, + "num_input_tokens_seen": 32443976, + "step": 55880 + }, + { + "epoch": 8.323652070300863, + "grad_norm": 0.038818359375, + "learning_rate": 0.021754975793031053, + "loss": 0.8643, + "num_input_tokens_seen": 32446760, + "step": 55885 + }, + { + "epoch": 8.324396782841823, + "grad_norm": 0.032470703125, + "learning_rate": 0.02175323496633277, + "loss": 0.8124, + "num_input_tokens_seen": 32449640, + "step": 55890 + }, + { + "epoch": 8.325141495382782, + "grad_norm": 0.02197265625, + "learning_rate": 0.021751494025545447, + "loss": 0.795, + "num_input_tokens_seen": 32452488, + "step": 55895 + }, + { + "epoch": 8.325886207923741, + "grad_norm": 0.0186767578125, + "learning_rate": 0.021749752970698494, + "loss": 0.8106, + "num_input_tokens_seen": 32455336, + "step": 55900 + }, + { + "epoch": 8.3266309204647, + "grad_norm": 0.0263671875, + "learning_rate": 0.021748011801821326, + "loss": 0.7842, + "num_input_tokens_seen": 32457864, + "step": 55905 + }, + { + "epoch": 8.32737563300566, + "grad_norm": 0.0267333984375, + "learning_rate": 0.021746270518943355, + "loss": 0.8189, + "num_input_tokens_seen": 32460808, + "step": 55910 + }, + { + "epoch": 8.328120345546619, + "grad_norm": 0.0400390625, + "learning_rate": 0.021744529122094007, + "loss": 0.7764, + "num_input_tokens_seen": 32463912, + "step": 55915 + }, + { + "epoch": 8.328865058087578, + "grad_norm": 0.03662109375, + "learning_rate": 0.021742787611302685, + "loss": 0.8103, + "num_input_tokens_seen": 32467080, + "step": 55920 + }, + { + "epoch": 8.329609770628537, + "grad_norm": 0.037353515625, + "learning_rate": 0.021741045986598824, + "loss": 0.776, + "num_input_tokens_seen": 32470152, + "step": 55925 + }, + { + "epoch": 8.330354483169497, + "grad_norm": 0.037353515625, + "learning_rate": 0.02173930424801184, + "loss": 0.8279, + "num_input_tokens_seen": 32473000, + "step": 55930 + }, + { + "epoch": 8.331099195710456, + "grad_norm": 0.0177001953125, + "learning_rate": 0.021737562395571165, + "loss": 0.8201, + "num_input_tokens_seen": 32475688, + "step": 55935 + }, + { + "epoch": 8.331843908251415, + "grad_norm": 0.04150390625, + "learning_rate": 0.021735820429306215, + "loss": 0.8083, + "num_input_tokens_seen": 32478600, + "step": 55940 + }, + { + "epoch": 8.332588620792373, + "grad_norm": 0.03173828125, + "learning_rate": 0.021734078349246427, + "loss": 0.7746, + "num_input_tokens_seen": 32481736, + "step": 55945 + }, + { + "epoch": 8.333333333333334, + "grad_norm": 0.01708984375, + "learning_rate": 0.021732336155421233, + "loss": 0.7822, + "num_input_tokens_seen": 32484744, + "step": 55950 + }, + { + "epoch": 8.334078045874293, + "grad_norm": 0.029541015625, + "learning_rate": 0.021730593847860056, + "loss": 0.8031, + "num_input_tokens_seen": 32487688, + "step": 55955 + }, + { + "epoch": 8.334822758415251, + "grad_norm": 0.0263671875, + "learning_rate": 0.02172885142659234, + "loss": 0.7951, + "num_input_tokens_seen": 32490280, + "step": 55960 + }, + { + "epoch": 8.33556747095621, + "grad_norm": 0.041015625, + "learning_rate": 0.021727108891647518, + "loss": 0.8161, + "num_input_tokens_seen": 32493192, + "step": 55965 + }, + { + "epoch": 8.33631218349717, + "grad_norm": 0.027587890625, + "learning_rate": 0.021725366243055027, + "loss": 0.7965, + "num_input_tokens_seen": 32496040, + "step": 55970 + }, + { + "epoch": 8.33705689603813, + "grad_norm": 0.03125, + "learning_rate": 0.02172362348084431, + "loss": 0.8028, + "num_input_tokens_seen": 32499080, + "step": 55975 + }, + { + "epoch": 8.337801608579088, + "grad_norm": 0.053466796875, + "learning_rate": 0.02172188060504481, + "loss": 0.7923, + "num_input_tokens_seen": 32501768, + "step": 55980 + }, + { + "epoch": 8.338546321120047, + "grad_norm": 0.022216796875, + "learning_rate": 0.021720137615685968, + "loss": 0.8052, + "num_input_tokens_seen": 32504456, + "step": 55985 + }, + { + "epoch": 8.339291033661008, + "grad_norm": 0.0301513671875, + "learning_rate": 0.02171839451279723, + "loss": 0.8016, + "num_input_tokens_seen": 32507496, + "step": 55990 + }, + { + "epoch": 8.340035746201966, + "grad_norm": 0.0267333984375, + "learning_rate": 0.021716651296408044, + "loss": 0.7907, + "num_input_tokens_seen": 32510408, + "step": 55995 + }, + { + "epoch": 8.340780458742925, + "grad_norm": 0.0283203125, + "learning_rate": 0.02171490796654786, + "loss": 0.8154, + "num_input_tokens_seen": 32513416, + "step": 56000 + }, + { + "epoch": 8.341525171283884, + "grad_norm": 0.0263671875, + "learning_rate": 0.021713164523246134, + "loss": 0.8363, + "num_input_tokens_seen": 32516232, + "step": 56005 + }, + { + "epoch": 8.342269883824844, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02171142096653231, + "loss": 0.8329, + "num_input_tokens_seen": 32519240, + "step": 56010 + }, + { + "epoch": 8.343014596365803, + "grad_norm": 0.020751953125, + "learning_rate": 0.021709677296435855, + "loss": 0.7919, + "num_input_tokens_seen": 32521896, + "step": 56015 + }, + { + "epoch": 8.343759308906762, + "grad_norm": 0.0230712890625, + "learning_rate": 0.021707933512986225, + "loss": 0.7979, + "num_input_tokens_seen": 32524968, + "step": 56020 + }, + { + "epoch": 8.34450402144772, + "grad_norm": 0.02392578125, + "learning_rate": 0.02170618961621287, + "loss": 0.8214, + "num_input_tokens_seen": 32527848, + "step": 56025 + }, + { + "epoch": 8.345248733988681, + "grad_norm": 0.029541015625, + "learning_rate": 0.02170444560614526, + "loss": 0.8246, + "num_input_tokens_seen": 32530760, + "step": 56030 + }, + { + "epoch": 8.34599344652964, + "grad_norm": 0.0213623046875, + "learning_rate": 0.021702701482812854, + "loss": 0.8013, + "num_input_tokens_seen": 32533512, + "step": 56035 + }, + { + "epoch": 8.346738159070599, + "grad_norm": 0.0234375, + "learning_rate": 0.021700957246245118, + "loss": 0.7998, + "num_input_tokens_seen": 32536168, + "step": 56040 + }, + { + "epoch": 8.347482871611557, + "grad_norm": 0.023681640625, + "learning_rate": 0.021699212896471524, + "loss": 0.8072, + "num_input_tokens_seen": 32539304, + "step": 56045 + }, + { + "epoch": 8.348227584152518, + "grad_norm": 0.032958984375, + "learning_rate": 0.021697468433521532, + "loss": 0.8157, + "num_input_tokens_seen": 32542408, + "step": 56050 + }, + { + "epoch": 8.348972296693477, + "grad_norm": 0.03076171875, + "learning_rate": 0.021695723857424624, + "loss": 0.8153, + "num_input_tokens_seen": 32545448, + "step": 56055 + }, + { + "epoch": 8.349717009234435, + "grad_norm": 0.0169677734375, + "learning_rate": 0.02169397916821026, + "loss": 0.8029, + "num_input_tokens_seen": 32548072, + "step": 56060 + }, + { + "epoch": 8.350461721775394, + "grad_norm": 0.0220947265625, + "learning_rate": 0.021692234365907927, + "loss": 0.8091, + "num_input_tokens_seen": 32550728, + "step": 56065 + }, + { + "epoch": 8.351206434316353, + "grad_norm": 0.029541015625, + "learning_rate": 0.021690489450547093, + "loss": 0.8069, + "num_input_tokens_seen": 32553896, + "step": 56070 + }, + { + "epoch": 8.351951146857314, + "grad_norm": 0.0157470703125, + "learning_rate": 0.02168874442215724, + "loss": 0.802, + "num_input_tokens_seen": 32556520, + "step": 56075 + }, + { + "epoch": 8.352695859398272, + "grad_norm": 0.035400390625, + "learning_rate": 0.02168699928076785, + "loss": 0.7965, + "num_input_tokens_seen": 32559368, + "step": 56080 + }, + { + "epoch": 8.353440571939231, + "grad_norm": 0.0203857421875, + "learning_rate": 0.021685254026408397, + "loss": 0.7897, + "num_input_tokens_seen": 32561992, + "step": 56085 + }, + { + "epoch": 8.35418528448019, + "grad_norm": 0.0299072265625, + "learning_rate": 0.021683508659108383, + "loss": 0.812, + "num_input_tokens_seen": 32564680, + "step": 56090 + }, + { + "epoch": 8.35492999702115, + "grad_norm": 0.024658203125, + "learning_rate": 0.021681763178897275, + "loss": 0.7863, + "num_input_tokens_seen": 32567528, + "step": 56095 + }, + { + "epoch": 8.35567470956211, + "grad_norm": 0.017578125, + "learning_rate": 0.021680017585804567, + "loss": 0.7992, + "num_input_tokens_seen": 32570184, + "step": 56100 + }, + { + "epoch": 8.356419422103068, + "grad_norm": 0.0228271484375, + "learning_rate": 0.021678271879859757, + "loss": 0.7944, + "num_input_tokens_seen": 32573000, + "step": 56105 + }, + { + "epoch": 8.357164134644027, + "grad_norm": 0.0225830078125, + "learning_rate": 0.021676526061092324, + "loss": 0.8, + "num_input_tokens_seen": 32575976, + "step": 56110 + }, + { + "epoch": 8.357908847184987, + "grad_norm": 0.0281982421875, + "learning_rate": 0.02167478012953178, + "loss": 0.8168, + "num_input_tokens_seen": 32578856, + "step": 56115 + }, + { + "epoch": 8.358653559725946, + "grad_norm": 0.030029296875, + "learning_rate": 0.0216730340852076, + "loss": 0.8135, + "num_input_tokens_seen": 32581608, + "step": 56120 + }, + { + "epoch": 8.359398272266905, + "grad_norm": 0.0284423828125, + "learning_rate": 0.021671287928149294, + "loss": 0.7997, + "num_input_tokens_seen": 32584456, + "step": 56125 + }, + { + "epoch": 8.360142984807863, + "grad_norm": 0.021484375, + "learning_rate": 0.02166954165838636, + "loss": 0.7783, + "num_input_tokens_seen": 32587336, + "step": 56130 + }, + { + "epoch": 8.360887697348824, + "grad_norm": 0.0322265625, + "learning_rate": 0.0216677952759483, + "loss": 0.8078, + "num_input_tokens_seen": 32590664, + "step": 56135 + }, + { + "epoch": 8.361632409889783, + "grad_norm": 0.0299072265625, + "learning_rate": 0.021666048780864613, + "loss": 0.7983, + "num_input_tokens_seen": 32593416, + "step": 56140 + }, + { + "epoch": 8.362377122430741, + "grad_norm": 0.0299072265625, + "learning_rate": 0.021664302173164805, + "loss": 0.7923, + "num_input_tokens_seen": 32596296, + "step": 56145 + }, + { + "epoch": 8.3631218349717, + "grad_norm": 0.03515625, + "learning_rate": 0.02166255545287839, + "loss": 0.8112, + "num_input_tokens_seen": 32599592, + "step": 56150 + }, + { + "epoch": 8.36386654751266, + "grad_norm": 0.021240234375, + "learning_rate": 0.02166080862003487, + "loss": 0.8071, + "num_input_tokens_seen": 32602408, + "step": 56155 + }, + { + "epoch": 8.36461126005362, + "grad_norm": 0.0211181640625, + "learning_rate": 0.021659061674663756, + "loss": 0.8118, + "num_input_tokens_seen": 32605128, + "step": 56160 + }, + { + "epoch": 8.365355972594578, + "grad_norm": 0.0281982421875, + "learning_rate": 0.021657314616794564, + "loss": 0.8053, + "num_input_tokens_seen": 32608040, + "step": 56165 + }, + { + "epoch": 8.366100685135537, + "grad_norm": 0.0262451171875, + "learning_rate": 0.021655567446456813, + "loss": 0.8074, + "num_input_tokens_seen": 32610984, + "step": 56170 + }, + { + "epoch": 8.366845397676498, + "grad_norm": 0.0322265625, + "learning_rate": 0.021653820163680008, + "loss": 0.8023, + "num_input_tokens_seen": 32613992, + "step": 56175 + }, + { + "epoch": 8.367590110217456, + "grad_norm": 0.016845703125, + "learning_rate": 0.021652072768493673, + "loss": 0.8174, + "num_input_tokens_seen": 32616872, + "step": 56180 + }, + { + "epoch": 8.368334822758415, + "grad_norm": 0.0152587890625, + "learning_rate": 0.021650325260927333, + "loss": 0.8227, + "num_input_tokens_seen": 32619464, + "step": 56185 + }, + { + "epoch": 8.369079535299374, + "grad_norm": 0.0250244140625, + "learning_rate": 0.021648577641010507, + "loss": 0.8005, + "num_input_tokens_seen": 32622600, + "step": 56190 + }, + { + "epoch": 8.369824247840334, + "grad_norm": 0.0263671875, + "learning_rate": 0.021646829908772716, + "loss": 0.8108, + "num_input_tokens_seen": 32625320, + "step": 56195 + }, + { + "epoch": 8.370568960381293, + "grad_norm": 0.02099609375, + "learning_rate": 0.021645082064243495, + "loss": 0.8017, + "num_input_tokens_seen": 32628168, + "step": 56200 + }, + { + "epoch": 8.371313672922252, + "grad_norm": 0.037109375, + "learning_rate": 0.02164333410745236, + "loss": 0.7982, + "num_input_tokens_seen": 32630984, + "step": 56205 + }, + { + "epoch": 8.37205838546321, + "grad_norm": 0.02392578125, + "learning_rate": 0.02164158603842885, + "loss": 0.8024, + "num_input_tokens_seen": 32633800, + "step": 56210 + }, + { + "epoch": 8.372803098004171, + "grad_norm": 0.0223388671875, + "learning_rate": 0.021639837857202495, + "loss": 0.8014, + "num_input_tokens_seen": 32637000, + "step": 56215 + }, + { + "epoch": 8.37354781054513, + "grad_norm": 0.049560546875, + "learning_rate": 0.02163808956380283, + "loss": 0.7952, + "num_input_tokens_seen": 32639624, + "step": 56220 + }, + { + "epoch": 8.374292523086089, + "grad_norm": 0.051025390625, + "learning_rate": 0.021636341158259385, + "loss": 0.8003, + "num_input_tokens_seen": 32642632, + "step": 56225 + }, + { + "epoch": 8.375037235627047, + "grad_norm": 0.03173828125, + "learning_rate": 0.021634592640601698, + "loss": 0.8155, + "num_input_tokens_seen": 32645640, + "step": 56230 + }, + { + "epoch": 8.375781948168008, + "grad_norm": 0.0303955078125, + "learning_rate": 0.021632844010859315, + "loss": 0.7904, + "num_input_tokens_seen": 32648520, + "step": 56235 + }, + { + "epoch": 8.376526660708967, + "grad_norm": 0.0233154296875, + "learning_rate": 0.021631095269061772, + "loss": 0.7832, + "num_input_tokens_seen": 32651688, + "step": 56240 + }, + { + "epoch": 8.377271373249926, + "grad_norm": 0.0145263671875, + "learning_rate": 0.021629346415238615, + "loss": 0.7822, + "num_input_tokens_seen": 32654504, + "step": 56245 + }, + { + "epoch": 8.378016085790884, + "grad_norm": 0.048095703125, + "learning_rate": 0.021627597449419392, + "loss": 0.7854, + "num_input_tokens_seen": 32657416, + "step": 56250 + }, + { + "epoch": 8.378760798331843, + "grad_norm": 0.025390625, + "learning_rate": 0.021625848371633644, + "loss": 0.7978, + "num_input_tokens_seen": 32660072, + "step": 56255 + }, + { + "epoch": 8.379505510872804, + "grad_norm": 0.030517578125, + "learning_rate": 0.02162409918191092, + "loss": 0.8226, + "num_input_tokens_seen": 32663240, + "step": 56260 + }, + { + "epoch": 8.380250223413762, + "grad_norm": 0.01544189453125, + "learning_rate": 0.02162234988028077, + "loss": 0.8236, + "num_input_tokens_seen": 32666024, + "step": 56265 + }, + { + "epoch": 8.380994935954721, + "grad_norm": 0.03173828125, + "learning_rate": 0.021620600466772753, + "loss": 0.7793, + "num_input_tokens_seen": 32668808, + "step": 56270 + }, + { + "epoch": 8.38173964849568, + "grad_norm": 0.03173828125, + "learning_rate": 0.02161885094141642, + "loss": 0.7977, + "num_input_tokens_seen": 32671816, + "step": 56275 + }, + { + "epoch": 8.38248436103664, + "grad_norm": 0.0242919921875, + "learning_rate": 0.02161710130424133, + "loss": 0.8023, + "num_input_tokens_seen": 32674376, + "step": 56280 + }, + { + "epoch": 8.3832290735776, + "grad_norm": 0.0230712890625, + "learning_rate": 0.021615351555277034, + "loss": 0.8212, + "num_input_tokens_seen": 32677192, + "step": 56285 + }, + { + "epoch": 8.383973786118558, + "grad_norm": 0.026611328125, + "learning_rate": 0.021613601694553102, + "loss": 0.7889, + "num_input_tokens_seen": 32680104, + "step": 56290 + }, + { + "epoch": 8.384718498659517, + "grad_norm": 0.03173828125, + "learning_rate": 0.021611851722099087, + "loss": 0.8324, + "num_input_tokens_seen": 32683016, + "step": 56295 + }, + { + "epoch": 8.385463211200477, + "grad_norm": 0.021484375, + "learning_rate": 0.021610101637944558, + "loss": 0.7868, + "num_input_tokens_seen": 32685800, + "step": 56300 + }, + { + "epoch": 8.386207923741436, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02160835144211908, + "loss": 0.8118, + "num_input_tokens_seen": 32688808, + "step": 56305 + }, + { + "epoch": 8.386952636282395, + "grad_norm": 0.03271484375, + "learning_rate": 0.02160660113465222, + "loss": 0.8081, + "num_input_tokens_seen": 32691752, + "step": 56310 + }, + { + "epoch": 8.387697348823353, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02160485071557355, + "loss": 0.8126, + "num_input_tokens_seen": 32695272, + "step": 56315 + }, + { + "epoch": 8.388442061364314, + "grad_norm": 0.019775390625, + "learning_rate": 0.02160310018491264, + "loss": 0.7955, + "num_input_tokens_seen": 32698312, + "step": 56320 + }, + { + "epoch": 8.389186773905273, + "grad_norm": 0.036865234375, + "learning_rate": 0.021601349542699065, + "loss": 0.7967, + "num_input_tokens_seen": 32701224, + "step": 56325 + }, + { + "epoch": 8.389931486446232, + "grad_norm": 0.039306640625, + "learning_rate": 0.021599598788962397, + "loss": 0.8133, + "num_input_tokens_seen": 32704072, + "step": 56330 + }, + { + "epoch": 8.39067619898719, + "grad_norm": 0.0301513671875, + "learning_rate": 0.021597847923732212, + "loss": 0.8115, + "num_input_tokens_seen": 32706856, + "step": 56335 + }, + { + "epoch": 8.39142091152815, + "grad_norm": 0.017333984375, + "learning_rate": 0.021596096947038096, + "loss": 0.7972, + "num_input_tokens_seen": 32709640, + "step": 56340 + }, + { + "epoch": 8.39216562406911, + "grad_norm": 0.029052734375, + "learning_rate": 0.021594345858909625, + "loss": 0.8165, + "num_input_tokens_seen": 32712392, + "step": 56345 + }, + { + "epoch": 8.392910336610068, + "grad_norm": 0.0274658203125, + "learning_rate": 0.021592594659376383, + "loss": 0.7958, + "num_input_tokens_seen": 32715176, + "step": 56350 + }, + { + "epoch": 8.393655049151027, + "grad_norm": 0.0184326171875, + "learning_rate": 0.021590843348467954, + "loss": 0.8139, + "num_input_tokens_seen": 32717896, + "step": 56355 + }, + { + "epoch": 8.394399761691988, + "grad_norm": 0.033203125, + "learning_rate": 0.021589091926213926, + "loss": 0.8097, + "num_input_tokens_seen": 32720424, + "step": 56360 + }, + { + "epoch": 8.395144474232946, + "grad_norm": 0.031494140625, + "learning_rate": 0.021587340392643886, + "loss": 0.8159, + "num_input_tokens_seen": 32723400, + "step": 56365 + }, + { + "epoch": 8.395889186773905, + "grad_norm": 0.0279541015625, + "learning_rate": 0.02158558874778743, + "loss": 0.8076, + "num_input_tokens_seen": 32726536, + "step": 56370 + }, + { + "epoch": 8.396633899314864, + "grad_norm": 0.05419921875, + "learning_rate": 0.02158383699167414, + "loss": 0.8136, + "num_input_tokens_seen": 32729352, + "step": 56375 + }, + { + "epoch": 8.397378611855824, + "grad_norm": 0.02294921875, + "learning_rate": 0.021582085124333617, + "loss": 0.8096, + "num_input_tokens_seen": 32732264, + "step": 56380 + }, + { + "epoch": 8.398123324396783, + "grad_norm": 0.0341796875, + "learning_rate": 0.021580333145795456, + "loss": 0.8184, + "num_input_tokens_seen": 32735272, + "step": 56385 + }, + { + "epoch": 8.398868036937742, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02157858105608925, + "loss": 0.8006, + "num_input_tokens_seen": 32737992, + "step": 56390 + }, + { + "epoch": 8.3996127494787, + "grad_norm": 0.0240478515625, + "learning_rate": 0.021576828855244607, + "loss": 0.8129, + "num_input_tokens_seen": 32740808, + "step": 56395 + }, + { + "epoch": 8.400357462019661, + "grad_norm": 0.02880859375, + "learning_rate": 0.021575076543291126, + "loss": 0.792, + "num_input_tokens_seen": 32743848, + "step": 56400 + }, + { + "epoch": 8.40110217456062, + "grad_norm": 0.04833984375, + "learning_rate": 0.02157332412025841, + "loss": 0.8035, + "num_input_tokens_seen": 32746952, + "step": 56405 + }, + { + "epoch": 8.401846887101579, + "grad_norm": 0.025390625, + "learning_rate": 0.021571571586176057, + "loss": 0.803, + "num_input_tokens_seen": 32750216, + "step": 56410 + }, + { + "epoch": 8.402591599642538, + "grad_norm": 0.038330078125, + "learning_rate": 0.02156981894107369, + "loss": 0.7847, + "num_input_tokens_seen": 32753480, + "step": 56415 + }, + { + "epoch": 8.403336312183498, + "grad_norm": 0.029052734375, + "learning_rate": 0.021568066184980906, + "loss": 0.8395, + "num_input_tokens_seen": 32756456, + "step": 56420 + }, + { + "epoch": 8.404081024724457, + "grad_norm": 0.023193359375, + "learning_rate": 0.02156631331792732, + "loss": 0.7999, + "num_input_tokens_seen": 32759080, + "step": 56425 + }, + { + "epoch": 8.404825737265416, + "grad_norm": 0.041015625, + "learning_rate": 0.021564560339942543, + "loss": 0.8135, + "num_input_tokens_seen": 32762184, + "step": 56430 + }, + { + "epoch": 8.405570449806374, + "grad_norm": 0.0242919921875, + "learning_rate": 0.021562807251056186, + "loss": 0.7815, + "num_input_tokens_seen": 32765096, + "step": 56435 + }, + { + "epoch": 8.406315162347333, + "grad_norm": 0.029541015625, + "learning_rate": 0.021561054051297875, + "loss": 0.819, + "num_input_tokens_seen": 32767912, + "step": 56440 + }, + { + "epoch": 8.407059874888294, + "grad_norm": 0.055419921875, + "learning_rate": 0.02155930074069723, + "loss": 0.8094, + "num_input_tokens_seen": 32770696, + "step": 56445 + }, + { + "epoch": 8.407804587429252, + "grad_norm": 0.034912109375, + "learning_rate": 0.02155754731928386, + "loss": 0.8043, + "num_input_tokens_seen": 32773768, + "step": 56450 + }, + { + "epoch": 8.408549299970211, + "grad_norm": 0.02880859375, + "learning_rate": 0.021555793787087392, + "loss": 0.7925, + "num_input_tokens_seen": 32776840, + "step": 56455 + }, + { + "epoch": 8.40929401251117, + "grad_norm": 0.0303955078125, + "learning_rate": 0.021554040144137452, + "loss": 0.8188, + "num_input_tokens_seen": 32779752, + "step": 56460 + }, + { + "epoch": 8.41003872505213, + "grad_norm": 0.0238037109375, + "learning_rate": 0.021552286390463665, + "loss": 0.8066, + "num_input_tokens_seen": 32782568, + "step": 56465 + }, + { + "epoch": 8.41078343759309, + "grad_norm": 0.0299072265625, + "learning_rate": 0.021550532526095657, + "loss": 0.7847, + "num_input_tokens_seen": 32785736, + "step": 56470 + }, + { + "epoch": 8.411528150134048, + "grad_norm": 0.015625, + "learning_rate": 0.021548778551063064, + "loss": 0.8092, + "num_input_tokens_seen": 32788744, + "step": 56475 + }, + { + "epoch": 8.412272862675007, + "grad_norm": 0.0322265625, + "learning_rate": 0.021547024465395512, + "loss": 0.7963, + "num_input_tokens_seen": 32791464, + "step": 56480 + }, + { + "epoch": 8.413017575215967, + "grad_norm": 0.033203125, + "learning_rate": 0.021545270269122632, + "loss": 0.8049, + "num_input_tokens_seen": 32794440, + "step": 56485 + }, + { + "epoch": 8.413762287756926, + "grad_norm": 0.02490234375, + "learning_rate": 0.02154351596227407, + "loss": 0.7966, + "num_input_tokens_seen": 32797288, + "step": 56490 + }, + { + "epoch": 8.414507000297885, + "grad_norm": 0.031494140625, + "learning_rate": 0.02154176154487945, + "loss": 0.7977, + "num_input_tokens_seen": 32800296, + "step": 56495 + }, + { + "epoch": 8.415251712838844, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02154000701696842, + "loss": 0.8011, + "num_input_tokens_seen": 32802984, + "step": 56500 + }, + { + "epoch": 8.415996425379804, + "grad_norm": 0.0174560546875, + "learning_rate": 0.021538252378570613, + "loss": 0.8095, + "num_input_tokens_seen": 32805960, + "step": 56505 + }, + { + "epoch": 8.416741137920763, + "grad_norm": 0.0252685546875, + "learning_rate": 0.021536497629715684, + "loss": 0.7932, + "num_input_tokens_seen": 32808808, + "step": 56510 + }, + { + "epoch": 8.417485850461722, + "grad_norm": 0.04296875, + "learning_rate": 0.021534742770433272, + "loss": 0.8226, + "num_input_tokens_seen": 32811560, + "step": 56515 + }, + { + "epoch": 8.41823056300268, + "grad_norm": 0.0283203125, + "learning_rate": 0.021532987800753016, + "loss": 0.8061, + "num_input_tokens_seen": 32814632, + "step": 56520 + }, + { + "epoch": 8.418975275543641, + "grad_norm": 0.0301513671875, + "learning_rate": 0.021531232720704573, + "loss": 0.8025, + "num_input_tokens_seen": 32817608, + "step": 56525 + }, + { + "epoch": 8.4197199880846, + "grad_norm": 0.0341796875, + "learning_rate": 0.021529477530317594, + "loss": 0.7907, + "num_input_tokens_seen": 32820488, + "step": 56530 + }, + { + "epoch": 8.420464700625558, + "grad_norm": 0.025146484375, + "learning_rate": 0.021527722229621727, + "loss": 0.8165, + "num_input_tokens_seen": 32823432, + "step": 56535 + }, + { + "epoch": 8.421209413166517, + "grad_norm": 0.02783203125, + "learning_rate": 0.021525966818646627, + "loss": 0.8156, + "num_input_tokens_seen": 32826344, + "step": 56540 + }, + { + "epoch": 8.421954125707478, + "grad_norm": 0.034423828125, + "learning_rate": 0.021524211297421953, + "loss": 0.826, + "num_input_tokens_seen": 32829192, + "step": 56545 + }, + { + "epoch": 8.422698838248436, + "grad_norm": 0.02294921875, + "learning_rate": 0.021522455665977356, + "loss": 0.798, + "num_input_tokens_seen": 32832040, + "step": 56550 + }, + { + "epoch": 8.423443550789395, + "grad_norm": 0.0262451171875, + "learning_rate": 0.021520699924342506, + "loss": 0.8052, + "num_input_tokens_seen": 32835368, + "step": 56555 + }, + { + "epoch": 8.424188263330354, + "grad_norm": 0.016357421875, + "learning_rate": 0.021518944072547053, + "loss": 0.7914, + "num_input_tokens_seen": 32838312, + "step": 56560 + }, + { + "epoch": 8.424932975871315, + "grad_norm": 0.0283203125, + "learning_rate": 0.02151718811062066, + "loss": 0.8081, + "num_input_tokens_seen": 32841096, + "step": 56565 + }, + { + "epoch": 8.425677688412273, + "grad_norm": 0.020263671875, + "learning_rate": 0.021515432038593005, + "loss": 0.7908, + "num_input_tokens_seen": 32844040, + "step": 56570 + }, + { + "epoch": 8.426422400953232, + "grad_norm": 0.023193359375, + "learning_rate": 0.021513675856493747, + "loss": 0.8038, + "num_input_tokens_seen": 32846696, + "step": 56575 + }, + { + "epoch": 8.42716711349419, + "grad_norm": 0.03125, + "learning_rate": 0.021511919564352555, + "loss": 0.7974, + "num_input_tokens_seen": 32849544, + "step": 56580 + }, + { + "epoch": 8.42791182603515, + "grad_norm": 0.0242919921875, + "learning_rate": 0.021510163162199104, + "loss": 0.8015, + "num_input_tokens_seen": 32852648, + "step": 56585 + }, + { + "epoch": 8.42865653857611, + "grad_norm": 0.0205078125, + "learning_rate": 0.02150840665006306, + "loss": 0.7959, + "num_input_tokens_seen": 32856072, + "step": 56590 + }, + { + "epoch": 8.429401251117069, + "grad_norm": 0.037109375, + "learning_rate": 0.021506650027974097, + "loss": 0.8264, + "num_input_tokens_seen": 32858920, + "step": 56595 + }, + { + "epoch": 8.430145963658028, + "grad_norm": 0.0152587890625, + "learning_rate": 0.021504893295961898, + "loss": 0.8222, + "num_input_tokens_seen": 32861576, + "step": 56600 + }, + { + "epoch": 8.430890676198986, + "grad_norm": 0.022216796875, + "learning_rate": 0.021503136454056136, + "loss": 0.7894, + "num_input_tokens_seen": 32864584, + "step": 56605 + }, + { + "epoch": 8.431635388739947, + "grad_norm": 0.029052734375, + "learning_rate": 0.021501379502286498, + "loss": 0.7916, + "num_input_tokens_seen": 32867400, + "step": 56610 + }, + { + "epoch": 8.432380101280906, + "grad_norm": 0.029296875, + "learning_rate": 0.02149962244068266, + "loss": 0.7952, + "num_input_tokens_seen": 32870344, + "step": 56615 + }, + { + "epoch": 8.433124813821864, + "grad_norm": 0.0380859375, + "learning_rate": 0.021497865269274302, + "loss": 0.8221, + "num_input_tokens_seen": 32873320, + "step": 56620 + }, + { + "epoch": 8.433869526362823, + "grad_norm": 0.0419921875, + "learning_rate": 0.02149610798809112, + "loss": 0.8157, + "num_input_tokens_seen": 32875976, + "step": 56625 + }, + { + "epoch": 8.434614238903784, + "grad_norm": 0.034423828125, + "learning_rate": 0.02149435059716279, + "loss": 0.8006, + "num_input_tokens_seen": 32878824, + "step": 56630 + }, + { + "epoch": 8.435358951444742, + "grad_norm": 0.03759765625, + "learning_rate": 0.021492593096519012, + "loss": 0.8093, + "num_input_tokens_seen": 32881608, + "step": 56635 + }, + { + "epoch": 8.436103663985701, + "grad_norm": 0.037109375, + "learning_rate": 0.02149083548618947, + "loss": 0.7975, + "num_input_tokens_seen": 32884456, + "step": 56640 + }, + { + "epoch": 8.43684837652666, + "grad_norm": 0.0242919921875, + "learning_rate": 0.02148907776620386, + "loss": 0.8214, + "num_input_tokens_seen": 32887336, + "step": 56645 + }, + { + "epoch": 8.43759308906762, + "grad_norm": 0.0255126953125, + "learning_rate": 0.021487319936591878, + "loss": 0.7898, + "num_input_tokens_seen": 32890184, + "step": 56650 + }, + { + "epoch": 8.43833780160858, + "grad_norm": 0.041015625, + "learning_rate": 0.021485561997383218, + "loss": 0.7867, + "num_input_tokens_seen": 32893128, + "step": 56655 + }, + { + "epoch": 8.439082514149538, + "grad_norm": 0.027587890625, + "learning_rate": 0.021483803948607577, + "loss": 0.8056, + "num_input_tokens_seen": 32896136, + "step": 56660 + }, + { + "epoch": 8.439827226690497, + "grad_norm": 0.024169921875, + "learning_rate": 0.02148204579029466, + "loss": 0.7964, + "num_input_tokens_seen": 32898856, + "step": 56665 + }, + { + "epoch": 8.440571939231457, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02148028752247417, + "loss": 0.7925, + "num_input_tokens_seen": 32901672, + "step": 56670 + }, + { + "epoch": 8.441316651772416, + "grad_norm": 0.031494140625, + "learning_rate": 0.021478529145175805, + "loss": 0.8171, + "num_input_tokens_seen": 32904680, + "step": 56675 + }, + { + "epoch": 8.442061364313375, + "grad_norm": 0.0245361328125, + "learning_rate": 0.02147677065842928, + "loss": 0.8143, + "num_input_tokens_seen": 32907624, + "step": 56680 + }, + { + "epoch": 8.442806076854334, + "grad_norm": 0.02392578125, + "learning_rate": 0.02147501206226429, + "loss": 0.8182, + "num_input_tokens_seen": 32910632, + "step": 56685 + }, + { + "epoch": 8.443550789395294, + "grad_norm": 0.02783203125, + "learning_rate": 0.021473253356710555, + "loss": 0.7932, + "num_input_tokens_seen": 32913608, + "step": 56690 + }, + { + "epoch": 8.444295501936253, + "grad_norm": 0.016357421875, + "learning_rate": 0.021471494541797784, + "loss": 0.8136, + "num_input_tokens_seen": 32916392, + "step": 56695 + }, + { + "epoch": 8.445040214477212, + "grad_norm": 0.0302734375, + "learning_rate": 0.021469735617555687, + "loss": 0.8079, + "num_input_tokens_seen": 32919528, + "step": 56700 + }, + { + "epoch": 8.44578492701817, + "grad_norm": 0.029541015625, + "learning_rate": 0.02146797658401399, + "loss": 0.7835, + "num_input_tokens_seen": 32922120, + "step": 56705 + }, + { + "epoch": 8.446529639559131, + "grad_norm": 0.0240478515625, + "learning_rate": 0.021466217441202397, + "loss": 0.7785, + "num_input_tokens_seen": 32925000, + "step": 56710 + }, + { + "epoch": 8.44727435210009, + "grad_norm": 0.0262451171875, + "learning_rate": 0.021464458189150635, + "loss": 0.8101, + "num_input_tokens_seen": 32928104, + "step": 56715 + }, + { + "epoch": 8.448019064641048, + "grad_norm": 0.0233154296875, + "learning_rate": 0.021462698827888416, + "loss": 0.8159, + "num_input_tokens_seen": 32931016, + "step": 56720 + }, + { + "epoch": 8.448763777182007, + "grad_norm": 0.039794921875, + "learning_rate": 0.021460939357445474, + "loss": 0.8171, + "num_input_tokens_seen": 32934344, + "step": 56725 + }, + { + "epoch": 8.449508489722968, + "grad_norm": 0.026611328125, + "learning_rate": 0.021459179777851525, + "loss": 0.7975, + "num_input_tokens_seen": 32936904, + "step": 56730 + }, + { + "epoch": 8.450253202263927, + "grad_norm": 0.02197265625, + "learning_rate": 0.021457420089136304, + "loss": 0.792, + "num_input_tokens_seen": 32939720, + "step": 56735 + }, + { + "epoch": 8.450997914804885, + "grad_norm": 0.027099609375, + "learning_rate": 0.02145566029132953, + "loss": 0.7934, + "num_input_tokens_seen": 32942760, + "step": 56740 + }, + { + "epoch": 8.451742627345844, + "grad_norm": 0.015869140625, + "learning_rate": 0.021453900384460937, + "loss": 0.7912, + "num_input_tokens_seen": 32945608, + "step": 56745 + }, + { + "epoch": 8.452487339886805, + "grad_norm": 0.03271484375, + "learning_rate": 0.021452140368560257, + "loss": 0.8017, + "num_input_tokens_seen": 32948776, + "step": 56750 + }, + { + "epoch": 8.453232052427763, + "grad_norm": 0.0245361328125, + "learning_rate": 0.02145038024365722, + "loss": 0.817, + "num_input_tokens_seen": 32951720, + "step": 56755 + }, + { + "epoch": 8.453976764968722, + "grad_norm": 0.0155029296875, + "learning_rate": 0.021448620009781566, + "loss": 0.7925, + "num_input_tokens_seen": 32954664, + "step": 56760 + }, + { + "epoch": 8.45472147750968, + "grad_norm": 0.0274658203125, + "learning_rate": 0.021446859666963036, + "loss": 0.7937, + "num_input_tokens_seen": 32957576, + "step": 56765 + }, + { + "epoch": 8.45546619005064, + "grad_norm": 0.0223388671875, + "learning_rate": 0.021445099215231356, + "loss": 0.8054, + "num_input_tokens_seen": 32960680, + "step": 56770 + }, + { + "epoch": 8.4562109025916, + "grad_norm": 0.0247802734375, + "learning_rate": 0.02144333865461628, + "loss": 0.8058, + "num_input_tokens_seen": 32963528, + "step": 56775 + }, + { + "epoch": 8.456955615132559, + "grad_norm": 0.0166015625, + "learning_rate": 0.021441577985147554, + "loss": 0.7978, + "num_input_tokens_seen": 32966344, + "step": 56780 + }, + { + "epoch": 8.457700327673518, + "grad_norm": 0.031494140625, + "learning_rate": 0.0214398172068549, + "loss": 0.789, + "num_input_tokens_seen": 32969512, + "step": 56785 + }, + { + "epoch": 8.458445040214476, + "grad_norm": 0.0211181640625, + "learning_rate": 0.021438056319768087, + "loss": 0.7857, + "num_input_tokens_seen": 32972168, + "step": 56790 + }, + { + "epoch": 8.459189752755437, + "grad_norm": 0.023681640625, + "learning_rate": 0.021436295323916853, + "loss": 0.8431, + "num_input_tokens_seen": 32975048, + "step": 56795 + }, + { + "epoch": 8.459934465296396, + "grad_norm": 0.0302734375, + "learning_rate": 0.021434534219330954, + "loss": 0.8027, + "num_input_tokens_seen": 32977864, + "step": 56800 + }, + { + "epoch": 8.460679177837354, + "grad_norm": 0.031982421875, + "learning_rate": 0.02143277300604014, + "loss": 0.7968, + "num_input_tokens_seen": 32980712, + "step": 56805 + }, + { + "epoch": 8.461423890378313, + "grad_norm": 0.0169677734375, + "learning_rate": 0.02143101168407416, + "loss": 0.8355, + "num_input_tokens_seen": 32983432, + "step": 56810 + }, + { + "epoch": 8.462168602919274, + "grad_norm": 0.021240234375, + "learning_rate": 0.02142925025346278, + "loss": 0.8343, + "num_input_tokens_seen": 32986472, + "step": 56815 + }, + { + "epoch": 8.462913315460233, + "grad_norm": 0.01300048828125, + "learning_rate": 0.021427488714235745, + "loss": 0.8105, + "num_input_tokens_seen": 32989064, + "step": 56820 + }, + { + "epoch": 8.463658028001191, + "grad_norm": 0.0289306640625, + "learning_rate": 0.021425727066422824, + "loss": 0.8088, + "num_input_tokens_seen": 32992168, + "step": 56825 + }, + { + "epoch": 8.46440274054215, + "grad_norm": 0.03076171875, + "learning_rate": 0.02142396531005377, + "loss": 0.7976, + "num_input_tokens_seen": 32994856, + "step": 56830 + }, + { + "epoch": 8.46514745308311, + "grad_norm": 0.0159912109375, + "learning_rate": 0.02142220344515836, + "loss": 0.8016, + "num_input_tokens_seen": 32997992, + "step": 56835 + }, + { + "epoch": 8.46589216562407, + "grad_norm": 0.03173828125, + "learning_rate": 0.021420441471766343, + "loss": 0.8096, + "num_input_tokens_seen": 33000584, + "step": 56840 + }, + { + "epoch": 8.466636878165028, + "grad_norm": 0.023193359375, + "learning_rate": 0.021418679389907498, + "loss": 0.796, + "num_input_tokens_seen": 33003304, + "step": 56845 + }, + { + "epoch": 8.467381590705987, + "grad_norm": 0.046875, + "learning_rate": 0.02141691719961158, + "loss": 0.8091, + "num_input_tokens_seen": 33006504, + "step": 56850 + }, + { + "epoch": 8.468126303246947, + "grad_norm": 0.0218505859375, + "learning_rate": 0.021415154900908378, + "loss": 0.8007, + "num_input_tokens_seen": 33009128, + "step": 56855 + }, + { + "epoch": 8.468871015787906, + "grad_norm": 0.0277099609375, + "learning_rate": 0.02141339249382765, + "loss": 0.8071, + "num_input_tokens_seen": 33011848, + "step": 56860 + }, + { + "epoch": 8.469615728328865, + "grad_norm": 0.023193359375, + "learning_rate": 0.02141162997839917, + "loss": 0.8038, + "num_input_tokens_seen": 33014536, + "step": 56865 + }, + { + "epoch": 8.470360440869824, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02140986735465272, + "loss": 0.8056, + "num_input_tokens_seen": 33017672, + "step": 56870 + }, + { + "epoch": 8.471105153410784, + "grad_norm": 0.0216064453125, + "learning_rate": 0.021408104622618076, + "loss": 0.8098, + "num_input_tokens_seen": 33020232, + "step": 56875 + }, + { + "epoch": 8.471849865951743, + "grad_norm": 0.02880859375, + "learning_rate": 0.021406341782325023, + "loss": 0.812, + "num_input_tokens_seen": 33022888, + "step": 56880 + }, + { + "epoch": 8.472594578492702, + "grad_norm": 0.02197265625, + "learning_rate": 0.02140457883380333, + "loss": 0.7951, + "num_input_tokens_seen": 33025768, + "step": 56885 + }, + { + "epoch": 8.47333929103366, + "grad_norm": 0.02294921875, + "learning_rate": 0.021402815777082786, + "loss": 0.8056, + "num_input_tokens_seen": 33028680, + "step": 56890 + }, + { + "epoch": 8.474084003574621, + "grad_norm": 0.0361328125, + "learning_rate": 0.021401052612193182, + "loss": 0.8104, + "num_input_tokens_seen": 33031880, + "step": 56895 + }, + { + "epoch": 8.47482871611558, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0213992893391643, + "loss": 0.8066, + "num_input_tokens_seen": 33034760, + "step": 56900 + }, + { + "epoch": 8.475573428656539, + "grad_norm": 0.0218505859375, + "learning_rate": 0.021397525958025923, + "loss": 0.8079, + "num_input_tokens_seen": 33037576, + "step": 56905 + }, + { + "epoch": 8.476318141197497, + "grad_norm": 0.033447265625, + "learning_rate": 0.021395762468807852, + "loss": 0.8089, + "num_input_tokens_seen": 33040520, + "step": 56910 + }, + { + "epoch": 8.477062853738458, + "grad_norm": 0.0159912109375, + "learning_rate": 0.021393998871539872, + "loss": 0.8052, + "num_input_tokens_seen": 33043496, + "step": 56915 + }, + { + "epoch": 8.477807566279417, + "grad_norm": 0.0166015625, + "learning_rate": 0.02139223516625178, + "loss": 0.8091, + "num_input_tokens_seen": 33046376, + "step": 56920 + }, + { + "epoch": 8.478552278820375, + "grad_norm": 0.03564453125, + "learning_rate": 0.021390471352973374, + "loss": 0.7937, + "num_input_tokens_seen": 33049224, + "step": 56925 + }, + { + "epoch": 8.479296991361334, + "grad_norm": 0.0267333984375, + "learning_rate": 0.021388707431734443, + "loss": 0.8113, + "num_input_tokens_seen": 33052136, + "step": 56930 + }, + { + "epoch": 8.480041703902295, + "grad_norm": 0.0224609375, + "learning_rate": 0.021386943402564803, + "loss": 0.8288, + "num_input_tokens_seen": 33054792, + "step": 56935 + }, + { + "epoch": 8.480786416443253, + "grad_norm": 0.02001953125, + "learning_rate": 0.02138517926549424, + "loss": 0.7993, + "num_input_tokens_seen": 33057704, + "step": 56940 + }, + { + "epoch": 8.481531128984212, + "grad_norm": 0.03369140625, + "learning_rate": 0.021383415020552562, + "loss": 0.8158, + "num_input_tokens_seen": 33060680, + "step": 56945 + }, + { + "epoch": 8.482275841525171, + "grad_norm": 0.0250244140625, + "learning_rate": 0.02138165066776958, + "loss": 0.8087, + "num_input_tokens_seen": 33063592, + "step": 56950 + }, + { + "epoch": 8.48302055406613, + "grad_norm": 0.027587890625, + "learning_rate": 0.02137988620717509, + "loss": 0.8063, + "num_input_tokens_seen": 33066472, + "step": 56955 + }, + { + "epoch": 8.48376526660709, + "grad_norm": 0.040283203125, + "learning_rate": 0.021378121638798906, + "loss": 0.8016, + "num_input_tokens_seen": 33069352, + "step": 56960 + }, + { + "epoch": 8.484509979148049, + "grad_norm": 0.022705078125, + "learning_rate": 0.02137635696267084, + "loss": 0.7951, + "num_input_tokens_seen": 33072200, + "step": 56965 + }, + { + "epoch": 8.485254691689008, + "grad_norm": 0.024169921875, + "learning_rate": 0.02137459217882071, + "loss": 0.8005, + "num_input_tokens_seen": 33075304, + "step": 56970 + }, + { + "epoch": 8.485999404229966, + "grad_norm": 0.0247802734375, + "learning_rate": 0.02137282728727832, + "loss": 0.805, + "num_input_tokens_seen": 33078472, + "step": 56975 + }, + { + "epoch": 8.486744116770927, + "grad_norm": 0.032470703125, + "learning_rate": 0.021371062288073492, + "loss": 0.797, + "num_input_tokens_seen": 33081512, + "step": 56980 + }, + { + "epoch": 8.487488829311886, + "grad_norm": 0.024169921875, + "learning_rate": 0.02136929718123604, + "loss": 0.8045, + "num_input_tokens_seen": 33084232, + "step": 56985 + }, + { + "epoch": 8.488233541852845, + "grad_norm": 0.036865234375, + "learning_rate": 0.021367531966795784, + "loss": 0.8133, + "num_input_tokens_seen": 33087176, + "step": 56990 + }, + { + "epoch": 8.488978254393803, + "grad_norm": 0.024169921875, + "learning_rate": 0.021365766644782548, + "loss": 0.8116, + "num_input_tokens_seen": 33090024, + "step": 56995 + }, + { + "epoch": 8.489722966934764, + "grad_norm": 0.04296875, + "learning_rate": 0.021364001215226157, + "loss": 0.8039, + "num_input_tokens_seen": 33093064, + "step": 57000 + }, + { + "epoch": 8.490467679475723, + "grad_norm": 0.0244140625, + "learning_rate": 0.021362235678156427, + "loss": 0.7965, + "num_input_tokens_seen": 33095752, + "step": 57005 + }, + { + "epoch": 8.491212392016681, + "grad_norm": 0.02099609375, + "learning_rate": 0.0213604700336032, + "loss": 0.7987, + "num_input_tokens_seen": 33098472, + "step": 57010 + }, + { + "epoch": 8.49195710455764, + "grad_norm": 0.015869140625, + "learning_rate": 0.021358704281596294, + "loss": 0.7904, + "num_input_tokens_seen": 33101096, + "step": 57015 + }, + { + "epoch": 8.4927018170986, + "grad_norm": 0.020263671875, + "learning_rate": 0.02135693842216554, + "loss": 0.8042, + "num_input_tokens_seen": 33104040, + "step": 57020 + }, + { + "epoch": 8.49344652963956, + "grad_norm": 0.0230712890625, + "learning_rate": 0.021355172455340774, + "loss": 0.7969, + "num_input_tokens_seen": 33106920, + "step": 57025 + }, + { + "epoch": 8.494191242180518, + "grad_norm": 0.0223388671875, + "learning_rate": 0.021353406381151824, + "loss": 0.793, + "num_input_tokens_seen": 33109992, + "step": 57030 + }, + { + "epoch": 8.494935954721477, + "grad_norm": 0.0174560546875, + "learning_rate": 0.021351640199628534, + "loss": 0.8012, + "num_input_tokens_seen": 33114216, + "step": 57035 + }, + { + "epoch": 8.495680667262437, + "grad_norm": 0.03466796875, + "learning_rate": 0.02134987391080074, + "loss": 0.8138, + "num_input_tokens_seen": 33117128, + "step": 57040 + }, + { + "epoch": 8.496425379803396, + "grad_norm": 0.01416015625, + "learning_rate": 0.02134810751469828, + "loss": 0.7913, + "num_input_tokens_seen": 33119656, + "step": 57045 + }, + { + "epoch": 8.497170092344355, + "grad_norm": 0.022216796875, + "learning_rate": 0.02134634101135099, + "loss": 0.7889, + "num_input_tokens_seen": 33122408, + "step": 57050 + }, + { + "epoch": 8.497914804885314, + "grad_norm": 0.024169921875, + "learning_rate": 0.021344574400788723, + "loss": 0.7908, + "num_input_tokens_seen": 33125128, + "step": 57055 + }, + { + "epoch": 8.498659517426274, + "grad_norm": 0.0240478515625, + "learning_rate": 0.02134280768304132, + "loss": 0.8013, + "num_input_tokens_seen": 33127912, + "step": 57060 + }, + { + "epoch": 8.499404229967233, + "grad_norm": 0.0228271484375, + "learning_rate": 0.021341040858138625, + "loss": 0.8018, + "num_input_tokens_seen": 33130952, + "step": 57065 + }, + { + "epoch": 8.500148942508192, + "grad_norm": 0.03564453125, + "learning_rate": 0.021339273926110493, + "loss": 0.8132, + "num_input_tokens_seen": 33133896, + "step": 57070 + }, + { + "epoch": 8.50089365504915, + "grad_norm": 0.04296875, + "learning_rate": 0.021337506886986764, + "loss": 0.8118, + "num_input_tokens_seen": 33136968, + "step": 57075 + }, + { + "epoch": 8.501638367590111, + "grad_norm": 0.02392578125, + "learning_rate": 0.021335739740797305, + "loss": 0.796, + "num_input_tokens_seen": 33139976, + "step": 57080 + }, + { + "epoch": 8.50238308013107, + "grad_norm": 0.04052734375, + "learning_rate": 0.02133397248757196, + "loss": 0.8075, + "num_input_tokens_seen": 33142856, + "step": 57085 + }, + { + "epoch": 8.503127792672029, + "grad_norm": 0.02099609375, + "learning_rate": 0.02133220512734059, + "loss": 0.7945, + "num_input_tokens_seen": 33145704, + "step": 57090 + }, + { + "epoch": 8.503872505212987, + "grad_norm": 0.0247802734375, + "learning_rate": 0.021330437660133046, + "loss": 0.8077, + "num_input_tokens_seen": 33148552, + "step": 57095 + }, + { + "epoch": 8.504617217753946, + "grad_norm": 0.0216064453125, + "learning_rate": 0.021328670085979193, + "loss": 0.7899, + "num_input_tokens_seen": 33151112, + "step": 57100 + }, + { + "epoch": 8.505361930294907, + "grad_norm": 0.0302734375, + "learning_rate": 0.02132690240490889, + "loss": 0.843, + "num_input_tokens_seen": 33153832, + "step": 57105 + }, + { + "epoch": 8.506106642835865, + "grad_norm": 0.023193359375, + "learning_rate": 0.021325134616952006, + "loss": 0.8059, + "num_input_tokens_seen": 33156744, + "step": 57110 + }, + { + "epoch": 8.506851355376824, + "grad_norm": 0.0341796875, + "learning_rate": 0.021323366722138394, + "loss": 0.8115, + "num_input_tokens_seen": 33159528, + "step": 57115 + }, + { + "epoch": 8.507596067917785, + "grad_norm": 0.033203125, + "learning_rate": 0.021321598720497933, + "loss": 0.8199, + "num_input_tokens_seen": 33162408, + "step": 57120 + }, + { + "epoch": 8.508340780458743, + "grad_norm": 0.03759765625, + "learning_rate": 0.021319830612060484, + "loss": 0.7942, + "num_input_tokens_seen": 33165352, + "step": 57125 + }, + { + "epoch": 8.509085492999702, + "grad_norm": 0.02197265625, + "learning_rate": 0.021318062396855922, + "loss": 0.7905, + "num_input_tokens_seen": 33168488, + "step": 57130 + }, + { + "epoch": 8.509830205540661, + "grad_norm": 0.02734375, + "learning_rate": 0.021316294074914116, + "loss": 0.8341, + "num_input_tokens_seen": 33171336, + "step": 57135 + }, + { + "epoch": 8.51057491808162, + "grad_norm": 0.0296630859375, + "learning_rate": 0.02131452564626494, + "loss": 0.7926, + "num_input_tokens_seen": 33174216, + "step": 57140 + }, + { + "epoch": 8.51131963062258, + "grad_norm": 0.019775390625, + "learning_rate": 0.021312757110938278, + "loss": 0.8009, + "num_input_tokens_seen": 33177224, + "step": 57145 + }, + { + "epoch": 8.512064343163539, + "grad_norm": 0.021484375, + "learning_rate": 0.021310988468963996, + "loss": 0.7973, + "num_input_tokens_seen": 33179976, + "step": 57150 + }, + { + "epoch": 8.512809055704498, + "grad_norm": 0.025146484375, + "learning_rate": 0.021309219720371975, + "loss": 0.8119, + "num_input_tokens_seen": 33182824, + "step": 57155 + }, + { + "epoch": 8.513553768245457, + "grad_norm": 0.027099609375, + "learning_rate": 0.021307450865192097, + "loss": 0.8038, + "num_input_tokens_seen": 33185704, + "step": 57160 + }, + { + "epoch": 8.514298480786417, + "grad_norm": 0.01275634765625, + "learning_rate": 0.021305681903454252, + "loss": 0.8103, + "num_input_tokens_seen": 33188328, + "step": 57165 + }, + { + "epoch": 8.515043193327376, + "grad_norm": 0.029296875, + "learning_rate": 0.021303912835188323, + "loss": 0.8203, + "num_input_tokens_seen": 33191176, + "step": 57170 + }, + { + "epoch": 8.515787905868335, + "grad_norm": 0.01446533203125, + "learning_rate": 0.02130214366042419, + "loss": 0.8035, + "num_input_tokens_seen": 33194056, + "step": 57175 + }, + { + "epoch": 8.516532618409293, + "grad_norm": 0.01385498046875, + "learning_rate": 0.021300374379191748, + "loss": 0.8077, + "num_input_tokens_seen": 33197064, + "step": 57180 + }, + { + "epoch": 8.517277330950254, + "grad_norm": 0.0216064453125, + "learning_rate": 0.02129860499152088, + "loss": 0.8013, + "num_input_tokens_seen": 33199816, + "step": 57185 + }, + { + "epoch": 8.518022043491213, + "grad_norm": 0.034423828125, + "learning_rate": 0.021296835497441484, + "loss": 0.8048, + "num_input_tokens_seen": 33202728, + "step": 57190 + }, + { + "epoch": 8.518766756032171, + "grad_norm": 0.01348876953125, + "learning_rate": 0.02129506589698345, + "loss": 0.7999, + "num_input_tokens_seen": 33205416, + "step": 57195 + }, + { + "epoch": 8.51951146857313, + "grad_norm": 0.0164794921875, + "learning_rate": 0.021293296190176676, + "loss": 0.7933, + "num_input_tokens_seen": 33208584, + "step": 57200 + }, + { + "epoch": 8.52025618111409, + "grad_norm": 0.03173828125, + "learning_rate": 0.021291526377051062, + "loss": 0.8052, + "num_input_tokens_seen": 33211720, + "step": 57205 + }, + { + "epoch": 8.52100089365505, + "grad_norm": 0.0250244140625, + "learning_rate": 0.021289756457636505, + "loss": 0.8166, + "num_input_tokens_seen": 33214888, + "step": 57210 + }, + { + "epoch": 8.521745606196008, + "grad_norm": 0.0150146484375, + "learning_rate": 0.021287986431962905, + "loss": 0.8057, + "num_input_tokens_seen": 33218024, + "step": 57215 + }, + { + "epoch": 8.522490318736967, + "grad_norm": 0.0244140625, + "learning_rate": 0.021286216300060164, + "loss": 0.8118, + "num_input_tokens_seen": 33221096, + "step": 57220 + }, + { + "epoch": 8.523235031277927, + "grad_norm": 0.0223388671875, + "learning_rate": 0.021284446061958187, + "loss": 0.8215, + "num_input_tokens_seen": 33224424, + "step": 57225 + }, + { + "epoch": 8.523979743818886, + "grad_norm": 0.013916015625, + "learning_rate": 0.02128267571768688, + "loss": 0.8177, + "num_input_tokens_seen": 33227048, + "step": 57230 + }, + { + "epoch": 8.524724456359845, + "grad_norm": 0.032958984375, + "learning_rate": 0.021280905267276155, + "loss": 0.8101, + "num_input_tokens_seen": 33230184, + "step": 57235 + }, + { + "epoch": 8.525469168900804, + "grad_norm": 0.0216064453125, + "learning_rate": 0.021279134710755914, + "loss": 0.798, + "num_input_tokens_seen": 33233256, + "step": 57240 + }, + { + "epoch": 8.526213881441764, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02127736404815608, + "loss": 0.7887, + "num_input_tokens_seen": 33236264, + "step": 57245 + }, + { + "epoch": 8.526958593982723, + "grad_norm": 0.022705078125, + "learning_rate": 0.021275593279506556, + "loss": 0.8081, + "num_input_tokens_seen": 33239144, + "step": 57250 + }, + { + "epoch": 8.527703306523682, + "grad_norm": 0.0230712890625, + "learning_rate": 0.021273822404837266, + "loss": 0.7899, + "num_input_tokens_seen": 33241736, + "step": 57255 + }, + { + "epoch": 8.52844801906464, + "grad_norm": 0.03125, + "learning_rate": 0.02127205142417812, + "loss": 0.8067, + "num_input_tokens_seen": 33244520, + "step": 57260 + }, + { + "epoch": 8.529192731605601, + "grad_norm": 0.040283203125, + "learning_rate": 0.021270280337559042, + "loss": 0.7972, + "num_input_tokens_seen": 33247528, + "step": 57265 + }, + { + "epoch": 8.52993744414656, + "grad_norm": 0.0264892578125, + "learning_rate": 0.021268509145009946, + "loss": 0.804, + "num_input_tokens_seen": 33250248, + "step": 57270 + }, + { + "epoch": 8.530682156687519, + "grad_norm": 0.0220947265625, + "learning_rate": 0.021266737846560767, + "loss": 0.8006, + "num_input_tokens_seen": 33252808, + "step": 57275 + }, + { + "epoch": 8.531426869228477, + "grad_norm": 0.02197265625, + "learning_rate": 0.021264966442241413, + "loss": 0.788, + "num_input_tokens_seen": 33255496, + "step": 57280 + }, + { + "epoch": 8.532171581769436, + "grad_norm": 0.0260009765625, + "learning_rate": 0.02126319493208182, + "loss": 0.8053, + "num_input_tokens_seen": 33258376, + "step": 57285 + }, + { + "epoch": 8.532916294310397, + "grad_norm": 0.02001953125, + "learning_rate": 0.021261423316111916, + "loss": 0.7915, + "num_input_tokens_seen": 33260872, + "step": 57290 + }, + { + "epoch": 8.533661006851355, + "grad_norm": 0.02392578125, + "learning_rate": 0.02125965159436163, + "loss": 0.7956, + "num_input_tokens_seen": 33263752, + "step": 57295 + }, + { + "epoch": 8.534405719392314, + "grad_norm": 0.01409912109375, + "learning_rate": 0.02125787976686089, + "loss": 0.8012, + "num_input_tokens_seen": 33266248, + "step": 57300 + }, + { + "epoch": 8.535150431933273, + "grad_norm": 0.0255126953125, + "learning_rate": 0.021256107833639633, + "loss": 0.8025, + "num_input_tokens_seen": 33269000, + "step": 57305 + }, + { + "epoch": 8.535895144474233, + "grad_norm": 0.0213623046875, + "learning_rate": 0.021254335794727793, + "loss": 0.8072, + "num_input_tokens_seen": 33271688, + "step": 57310 + }, + { + "epoch": 8.536639857015192, + "grad_norm": 0.0225830078125, + "learning_rate": 0.021252563650155307, + "loss": 0.8016, + "num_input_tokens_seen": 33274472, + "step": 57315 + }, + { + "epoch": 8.537384569556151, + "grad_norm": 0.034912109375, + "learning_rate": 0.021250791399952108, + "loss": 0.8002, + "num_input_tokens_seen": 33277704, + "step": 57320 + }, + { + "epoch": 8.53812928209711, + "grad_norm": 0.027099609375, + "learning_rate": 0.021249019044148144, + "loss": 0.7936, + "num_input_tokens_seen": 33280552, + "step": 57325 + }, + { + "epoch": 8.53887399463807, + "grad_norm": 0.020751953125, + "learning_rate": 0.021247246582773358, + "loss": 0.7927, + "num_input_tokens_seen": 33283496, + "step": 57330 + }, + { + "epoch": 8.539618707179029, + "grad_norm": 0.0224609375, + "learning_rate": 0.021245474015857687, + "loss": 0.8269, + "num_input_tokens_seen": 33286696, + "step": 57335 + }, + { + "epoch": 8.540363419719988, + "grad_norm": 0.016845703125, + "learning_rate": 0.021243701343431082, + "loss": 0.8192, + "num_input_tokens_seen": 33289448, + "step": 57340 + }, + { + "epoch": 8.541108132260947, + "grad_norm": 0.0140380859375, + "learning_rate": 0.02124192856552349, + "loss": 0.8015, + "num_input_tokens_seen": 33292232, + "step": 57345 + }, + { + "epoch": 8.541852844801907, + "grad_norm": 0.0291748046875, + "learning_rate": 0.021240155682164856, + "loss": 0.7903, + "num_input_tokens_seen": 33294760, + "step": 57350 + }, + { + "epoch": 8.542597557342866, + "grad_norm": 0.0255126953125, + "learning_rate": 0.021238382693385134, + "loss": 0.8145, + "num_input_tokens_seen": 33297544, + "step": 57355 + }, + { + "epoch": 8.543342269883825, + "grad_norm": 0.0242919921875, + "learning_rate": 0.021236609599214275, + "loss": 0.8029, + "num_input_tokens_seen": 33300296, + "step": 57360 + }, + { + "epoch": 8.544086982424783, + "grad_norm": 0.025390625, + "learning_rate": 0.021234836399682235, + "loss": 0.7922, + "num_input_tokens_seen": 33302984, + "step": 57365 + }, + { + "epoch": 8.544831694965744, + "grad_norm": 0.0206298828125, + "learning_rate": 0.021233063094818976, + "loss": 0.7947, + "num_input_tokens_seen": 33305928, + "step": 57370 + }, + { + "epoch": 8.545576407506703, + "grad_norm": 0.0220947265625, + "learning_rate": 0.02123128968465445, + "loss": 0.795, + "num_input_tokens_seen": 33308936, + "step": 57375 + }, + { + "epoch": 8.546321120047661, + "grad_norm": 0.018798828125, + "learning_rate": 0.021229516169218615, + "loss": 0.8007, + "num_input_tokens_seen": 33311880, + "step": 57380 + }, + { + "epoch": 8.54706583258862, + "grad_norm": 0.034423828125, + "learning_rate": 0.02122774254854144, + "loss": 0.7931, + "num_input_tokens_seen": 33315048, + "step": 57385 + }, + { + "epoch": 8.54781054512958, + "grad_norm": 0.0185546875, + "learning_rate": 0.021225968822652878, + "loss": 0.7903, + "num_input_tokens_seen": 33317960, + "step": 57390 + }, + { + "epoch": 8.54855525767054, + "grad_norm": 0.0279541015625, + "learning_rate": 0.021224194991582904, + "loss": 0.7939, + "num_input_tokens_seen": 33320808, + "step": 57395 + }, + { + "epoch": 8.549299970211498, + "grad_norm": 0.028076171875, + "learning_rate": 0.02122242105536148, + "loss": 0.8192, + "num_input_tokens_seen": 33324872, + "step": 57400 + }, + { + "epoch": 8.550044682752457, + "grad_norm": 0.02880859375, + "learning_rate": 0.02122064701401858, + "loss": 0.7832, + "num_input_tokens_seen": 33327272, + "step": 57405 + }, + { + "epoch": 8.550789395293418, + "grad_norm": 0.0216064453125, + "learning_rate": 0.021218872867584167, + "loss": 0.8078, + "num_input_tokens_seen": 33329864, + "step": 57410 + }, + { + "epoch": 8.551534107834376, + "grad_norm": 0.0235595703125, + "learning_rate": 0.021217098616088222, + "loss": 0.7903, + "num_input_tokens_seen": 33332488, + "step": 57415 + }, + { + "epoch": 8.552278820375335, + "grad_norm": 0.0289306640625, + "learning_rate": 0.021215324259560714, + "loss": 0.7927, + "num_input_tokens_seen": 33335720, + "step": 57420 + }, + { + "epoch": 8.553023532916294, + "grad_norm": 0.01312255859375, + "learning_rate": 0.021213549798031615, + "loss": 0.7947, + "num_input_tokens_seen": 33338632, + "step": 57425 + }, + { + "epoch": 8.553768245457253, + "grad_norm": 0.03271484375, + "learning_rate": 0.02121177523153091, + "loss": 0.8247, + "num_input_tokens_seen": 33341576, + "step": 57430 + }, + { + "epoch": 8.554512957998213, + "grad_norm": 0.0206298828125, + "learning_rate": 0.021210000560088576, + "loss": 0.7963, + "num_input_tokens_seen": 33344456, + "step": 57435 + }, + { + "epoch": 8.555257670539172, + "grad_norm": 0.0233154296875, + "learning_rate": 0.021208225783734595, + "loss": 0.8046, + "num_input_tokens_seen": 33347336, + "step": 57440 + }, + { + "epoch": 8.55600238308013, + "grad_norm": 0.0260009765625, + "learning_rate": 0.021206450902498945, + "loss": 0.8056, + "num_input_tokens_seen": 33350472, + "step": 57445 + }, + { + "epoch": 8.556747095621091, + "grad_norm": 0.0299072265625, + "learning_rate": 0.02120467591641162, + "loss": 0.8144, + "num_input_tokens_seen": 33353256, + "step": 57450 + }, + { + "epoch": 8.55749180816205, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0212029008255026, + "loss": 0.8045, + "num_input_tokens_seen": 33355848, + "step": 57455 + }, + { + "epoch": 8.558236520703009, + "grad_norm": 0.03173828125, + "learning_rate": 0.02120112562980188, + "loss": 0.7991, + "num_input_tokens_seen": 33358600, + "step": 57460 + }, + { + "epoch": 8.558981233243967, + "grad_norm": 0.0205078125, + "learning_rate": 0.021199350329339433, + "loss": 0.7935, + "num_input_tokens_seen": 33361480, + "step": 57465 + }, + { + "epoch": 8.559725945784926, + "grad_norm": 0.01409912109375, + "learning_rate": 0.02119757492414527, + "loss": 0.8142, + "num_input_tokens_seen": 33364232, + "step": 57470 + }, + { + "epoch": 8.560470658325887, + "grad_norm": 0.0205078125, + "learning_rate": 0.02119579941424938, + "loss": 0.8281, + "num_input_tokens_seen": 33367176, + "step": 57475 + }, + { + "epoch": 8.561215370866845, + "grad_norm": 0.0177001953125, + "learning_rate": 0.02119402379968175, + "loss": 0.8015, + "num_input_tokens_seen": 33370216, + "step": 57480 + }, + { + "epoch": 8.561960083407804, + "grad_norm": 0.0361328125, + "learning_rate": 0.02119224808047239, + "loss": 0.786, + "num_input_tokens_seen": 33373000, + "step": 57485 + }, + { + "epoch": 8.562704795948763, + "grad_norm": 0.033203125, + "learning_rate": 0.021190472256651285, + "loss": 0.802, + "num_input_tokens_seen": 33376136, + "step": 57490 + }, + { + "epoch": 8.563449508489724, + "grad_norm": 0.0235595703125, + "learning_rate": 0.021188696328248446, + "loss": 0.7944, + "num_input_tokens_seen": 33378920, + "step": 57495 + }, + { + "epoch": 8.564194221030682, + "grad_norm": 0.0208740234375, + "learning_rate": 0.021186920295293874, + "loss": 0.7929, + "num_input_tokens_seen": 33381928, + "step": 57500 + }, + { + "epoch": 8.564938933571641, + "grad_norm": 0.0172119140625, + "learning_rate": 0.02118514415781757, + "loss": 0.8025, + "num_input_tokens_seen": 33384840, + "step": 57505 + }, + { + "epoch": 8.5656836461126, + "grad_norm": 0.023193359375, + "learning_rate": 0.021183367915849544, + "loss": 0.7897, + "num_input_tokens_seen": 33387656, + "step": 57510 + }, + { + "epoch": 8.56642835865356, + "grad_norm": 0.0224609375, + "learning_rate": 0.021181591569419803, + "loss": 0.7858, + "num_input_tokens_seen": 33390856, + "step": 57515 + }, + { + "epoch": 8.567173071194519, + "grad_norm": 0.025390625, + "learning_rate": 0.02117981511855835, + "loss": 0.779, + "num_input_tokens_seen": 33393448, + "step": 57520 + }, + { + "epoch": 8.567917783735478, + "grad_norm": 0.022216796875, + "learning_rate": 0.021178038563295202, + "loss": 0.8024, + "num_input_tokens_seen": 33396392, + "step": 57525 + }, + { + "epoch": 8.568662496276437, + "grad_norm": 0.0242919921875, + "learning_rate": 0.02117626190366037, + "loss": 0.8224, + "num_input_tokens_seen": 33399304, + "step": 57530 + }, + { + "epoch": 8.569407208817397, + "grad_norm": 0.027587890625, + "learning_rate": 0.021174485139683875, + "loss": 0.7961, + "num_input_tokens_seen": 33402088, + "step": 57535 + }, + { + "epoch": 8.570151921358356, + "grad_norm": 0.02880859375, + "learning_rate": 0.021172708271395727, + "loss": 0.788, + "num_input_tokens_seen": 33404808, + "step": 57540 + }, + { + "epoch": 8.570896633899315, + "grad_norm": 0.0289306640625, + "learning_rate": 0.021170931298825954, + "loss": 0.786, + "num_input_tokens_seen": 33407720, + "step": 57545 + }, + { + "epoch": 8.571641346440273, + "grad_norm": 0.0186767578125, + "learning_rate": 0.02116915422200456, + "loss": 0.8022, + "num_input_tokens_seen": 33410792, + "step": 57550 + }, + { + "epoch": 8.572386058981234, + "grad_norm": 0.03857421875, + "learning_rate": 0.021167377040961576, + "loss": 0.8034, + "num_input_tokens_seen": 33413544, + "step": 57555 + }, + { + "epoch": 8.573130771522193, + "grad_norm": 0.0279541015625, + "learning_rate": 0.021165599755727026, + "loss": 0.819, + "num_input_tokens_seen": 33416520, + "step": 57560 + }, + { + "epoch": 8.573875484063151, + "grad_norm": 0.02587890625, + "learning_rate": 0.021163822366330933, + "loss": 0.8152, + "num_input_tokens_seen": 33419336, + "step": 57565 + }, + { + "epoch": 8.57462019660411, + "grad_norm": 0.025146484375, + "learning_rate": 0.021162044872803328, + "loss": 0.7914, + "num_input_tokens_seen": 33421896, + "step": 57570 + }, + { + "epoch": 8.57536490914507, + "grad_norm": 0.033447265625, + "learning_rate": 0.021160267275174238, + "loss": 0.8134, + "num_input_tokens_seen": 33424840, + "step": 57575 + }, + { + "epoch": 8.57610962168603, + "grad_norm": 0.023193359375, + "learning_rate": 0.021158489573473695, + "loss": 0.8014, + "num_input_tokens_seen": 33427784, + "step": 57580 + }, + { + "epoch": 8.576854334226988, + "grad_norm": 0.0205078125, + "learning_rate": 0.021156711767731724, + "loss": 0.7922, + "num_input_tokens_seen": 33430536, + "step": 57585 + }, + { + "epoch": 8.577599046767947, + "grad_norm": 0.021484375, + "learning_rate": 0.021154933857978366, + "loss": 0.8173, + "num_input_tokens_seen": 33433704, + "step": 57590 + }, + { + "epoch": 8.578343759308908, + "grad_norm": 0.0211181640625, + "learning_rate": 0.02115315584424366, + "loss": 0.7984, + "num_input_tokens_seen": 33436584, + "step": 57595 + }, + { + "epoch": 8.579088471849866, + "grad_norm": 0.025146484375, + "learning_rate": 0.02115137772655764, + "loss": 0.8009, + "num_input_tokens_seen": 33439432, + "step": 57600 + }, + { + "epoch": 8.579833184390825, + "grad_norm": 0.0240478515625, + "learning_rate": 0.021149599504950344, + "loss": 0.799, + "num_input_tokens_seen": 33442504, + "step": 57605 + }, + { + "epoch": 8.580577896931784, + "grad_norm": 0.01806640625, + "learning_rate": 0.02114782117945181, + "loss": 0.7975, + "num_input_tokens_seen": 33445480, + "step": 57610 + }, + { + "epoch": 8.581322609472743, + "grad_norm": 0.026611328125, + "learning_rate": 0.021146042750092093, + "loss": 0.7944, + "num_input_tokens_seen": 33448392, + "step": 57615 + }, + { + "epoch": 8.582067322013703, + "grad_norm": 0.033203125, + "learning_rate": 0.021144264216901226, + "loss": 0.8061, + "num_input_tokens_seen": 33451208, + "step": 57620 + }, + { + "epoch": 8.582812034554662, + "grad_norm": 0.0242919921875, + "learning_rate": 0.021142485579909265, + "loss": 0.8216, + "num_input_tokens_seen": 33454088, + "step": 57625 + }, + { + "epoch": 8.58355674709562, + "grad_norm": 0.021240234375, + "learning_rate": 0.021140706839146248, + "loss": 0.7792, + "num_input_tokens_seen": 33456968, + "step": 57630 + }, + { + "epoch": 8.584301459636581, + "grad_norm": 0.0262451171875, + "learning_rate": 0.021138927994642225, + "loss": 0.8083, + "num_input_tokens_seen": 33459912, + "step": 57635 + }, + { + "epoch": 8.58504617217754, + "grad_norm": 0.0220947265625, + "learning_rate": 0.021137149046427262, + "loss": 0.7829, + "num_input_tokens_seen": 33463048, + "step": 57640 + }, + { + "epoch": 8.585790884718499, + "grad_norm": 0.032958984375, + "learning_rate": 0.021135369994531403, + "loss": 0.7891, + "num_input_tokens_seen": 33465736, + "step": 57645 + }, + { + "epoch": 8.586535597259457, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0211335908389847, + "loss": 0.8086, + "num_input_tokens_seen": 33468712, + "step": 57650 + }, + { + "epoch": 8.587280309800416, + "grad_norm": 0.0238037109375, + "learning_rate": 0.021131811579817215, + "loss": 0.805, + "num_input_tokens_seen": 33471592, + "step": 57655 + }, + { + "epoch": 8.588025022341377, + "grad_norm": 0.0289306640625, + "learning_rate": 0.021130032217059003, + "loss": 0.8072, + "num_input_tokens_seen": 33474760, + "step": 57660 + }, + { + "epoch": 8.588769734882336, + "grad_norm": 0.01214599609375, + "learning_rate": 0.021128252750740126, + "loss": 0.7786, + "num_input_tokens_seen": 33477864, + "step": 57665 + }, + { + "epoch": 8.589514447423294, + "grad_norm": 0.0242919921875, + "learning_rate": 0.021126473180890653, + "loss": 0.7867, + "num_input_tokens_seen": 33481096, + "step": 57670 + }, + { + "epoch": 8.590259159964253, + "grad_norm": 0.022705078125, + "learning_rate": 0.02112469350754064, + "loss": 0.7999, + "num_input_tokens_seen": 33483624, + "step": 57675 + }, + { + "epoch": 8.591003872505214, + "grad_norm": 0.02099609375, + "learning_rate": 0.021122913730720155, + "loss": 0.797, + "num_input_tokens_seen": 33486536, + "step": 57680 + }, + { + "epoch": 8.591748585046172, + "grad_norm": 0.01397705078125, + "learning_rate": 0.021121133850459266, + "loss": 0.7922, + "num_input_tokens_seen": 33489256, + "step": 57685 + }, + { + "epoch": 8.592493297587131, + "grad_norm": 0.025146484375, + "learning_rate": 0.021119353866788038, + "loss": 0.7594, + "num_input_tokens_seen": 33492040, + "step": 57690 + }, + { + "epoch": 8.59323801012809, + "grad_norm": 0.022705078125, + "learning_rate": 0.021117573779736547, + "loss": 0.7822, + "num_input_tokens_seen": 33495080, + "step": 57695 + }, + { + "epoch": 8.59398272266905, + "grad_norm": 0.033203125, + "learning_rate": 0.021115793589334873, + "loss": 0.7781, + "num_input_tokens_seen": 33498056, + "step": 57700 + }, + { + "epoch": 8.59472743521001, + "grad_norm": 0.03125, + "learning_rate": 0.021114013295613073, + "loss": 0.7861, + "num_input_tokens_seen": 33500872, + "step": 57705 + }, + { + "epoch": 8.595472147750968, + "grad_norm": 0.0306396484375, + "learning_rate": 0.021112232898601237, + "loss": 0.8209, + "num_input_tokens_seen": 33504008, + "step": 57710 + }, + { + "epoch": 8.596216860291927, + "grad_norm": 0.035400390625, + "learning_rate": 0.021110452398329434, + "loss": 0.8268, + "num_input_tokens_seen": 33506760, + "step": 57715 + }, + { + "epoch": 8.596961572832887, + "grad_norm": 0.031494140625, + "learning_rate": 0.021108671794827748, + "loss": 0.8299, + "num_input_tokens_seen": 33509608, + "step": 57720 + }, + { + "epoch": 8.597706285373846, + "grad_norm": 0.031982421875, + "learning_rate": 0.021106891088126266, + "loss": 0.8032, + "num_input_tokens_seen": 33512648, + "step": 57725 + }, + { + "epoch": 8.598450997914805, + "grad_norm": 0.0240478515625, + "learning_rate": 0.02110511027825506, + "loss": 0.7822, + "num_input_tokens_seen": 33515528, + "step": 57730 + }, + { + "epoch": 8.599195710455763, + "grad_norm": 0.022216796875, + "learning_rate": 0.021103329365244224, + "loss": 0.7631, + "num_input_tokens_seen": 33518632, + "step": 57735 + }, + { + "epoch": 8.599940422996724, + "grad_norm": 0.026611328125, + "learning_rate": 0.021101548349123842, + "loss": 0.8123, + "num_input_tokens_seen": 33521352, + "step": 57740 + }, + { + "epoch": 8.600685135537683, + "grad_norm": 0.017333984375, + "learning_rate": 0.021099767229924005, + "loss": 0.7996, + "num_input_tokens_seen": 33524008, + "step": 57745 + }, + { + "epoch": 8.601429848078642, + "grad_norm": 0.0211181640625, + "learning_rate": 0.021097986007674794, + "loss": 0.8188, + "num_input_tokens_seen": 33527048, + "step": 57750 + }, + { + "epoch": 8.6021745606196, + "grad_norm": 0.0247802734375, + "learning_rate": 0.021096204682406312, + "loss": 0.7745, + "num_input_tokens_seen": 33530024, + "step": 57755 + }, + { + "epoch": 8.60291927316056, + "grad_norm": 0.0228271484375, + "learning_rate": 0.021094423254148644, + "loss": 0.7958, + "num_input_tokens_seen": 33532872, + "step": 57760 + }, + { + "epoch": 8.60366398570152, + "grad_norm": 0.02392578125, + "learning_rate": 0.021092641722931887, + "loss": 0.7964, + "num_input_tokens_seen": 33535752, + "step": 57765 + }, + { + "epoch": 8.604408698242478, + "grad_norm": 0.0220947265625, + "learning_rate": 0.021090860088786148, + "loss": 0.821, + "num_input_tokens_seen": 33538696, + "step": 57770 + }, + { + "epoch": 8.605153410783437, + "grad_norm": 0.021728515625, + "learning_rate": 0.02108907835174151, + "loss": 0.802, + "num_input_tokens_seen": 33541448, + "step": 57775 + }, + { + "epoch": 8.605898123324398, + "grad_norm": 0.02685546875, + "learning_rate": 0.021087296511828085, + "loss": 0.8162, + "num_input_tokens_seen": 33544648, + "step": 57780 + }, + { + "epoch": 8.606642835865356, + "grad_norm": 0.022216796875, + "learning_rate": 0.021085514569075977, + "loss": 0.7823, + "num_input_tokens_seen": 33547560, + "step": 57785 + }, + { + "epoch": 8.607387548406315, + "grad_norm": 0.031005859375, + "learning_rate": 0.02108373252351528, + "loss": 0.7912, + "num_input_tokens_seen": 33550312, + "step": 57790 + }, + { + "epoch": 8.608132260947274, + "grad_norm": 0.023681640625, + "learning_rate": 0.021081950375176108, + "loss": 0.8018, + "num_input_tokens_seen": 33553032, + "step": 57795 + }, + { + "epoch": 8.608876973488233, + "grad_norm": 0.0255126953125, + "learning_rate": 0.021080168124088562, + "loss": 0.7939, + "num_input_tokens_seen": 33555880, + "step": 57800 + }, + { + "epoch": 8.609621686029193, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02107838577028276, + "loss": 0.7875, + "num_input_tokens_seen": 33558792, + "step": 57805 + }, + { + "epoch": 8.610366398570152, + "grad_norm": 0.0184326171875, + "learning_rate": 0.021076603313788807, + "loss": 0.8031, + "num_input_tokens_seen": 33561864, + "step": 57810 + }, + { + "epoch": 8.61111111111111, + "grad_norm": 0.022216796875, + "learning_rate": 0.021074820754636816, + "loss": 0.8222, + "num_input_tokens_seen": 33564680, + "step": 57815 + }, + { + "epoch": 8.61185582365207, + "grad_norm": 0.0152587890625, + "learning_rate": 0.0210730380928569, + "loss": 0.7935, + "num_input_tokens_seen": 33567848, + "step": 57820 + }, + { + "epoch": 8.61260053619303, + "grad_norm": 0.02294921875, + "learning_rate": 0.021071255328479182, + "loss": 0.7981, + "num_input_tokens_seen": 33570696, + "step": 57825 + }, + { + "epoch": 8.613345248733989, + "grad_norm": 0.0155029296875, + "learning_rate": 0.021069472461533775, + "loss": 0.8417, + "num_input_tokens_seen": 33573512, + "step": 57830 + }, + { + "epoch": 8.614089961274948, + "grad_norm": 0.037353515625, + "learning_rate": 0.021067689492050796, + "loss": 0.8108, + "num_input_tokens_seen": 33576296, + "step": 57835 + }, + { + "epoch": 8.614834673815906, + "grad_norm": 0.0211181640625, + "learning_rate": 0.021065906420060378, + "loss": 0.7789, + "num_input_tokens_seen": 33579304, + "step": 57840 + }, + { + "epoch": 8.615579386356867, + "grad_norm": 0.0166015625, + "learning_rate": 0.02106412324559263, + "loss": 0.8222, + "num_input_tokens_seen": 33582152, + "step": 57845 + }, + { + "epoch": 8.616324098897826, + "grad_norm": 0.029541015625, + "learning_rate": 0.02106233996867768, + "loss": 0.8116, + "num_input_tokens_seen": 33584968, + "step": 57850 + }, + { + "epoch": 8.617068811438784, + "grad_norm": 0.031982421875, + "learning_rate": 0.021060556589345664, + "loss": 0.8097, + "num_input_tokens_seen": 33587784, + "step": 57855 + }, + { + "epoch": 8.617813523979743, + "grad_norm": 0.0224609375, + "learning_rate": 0.021058773107626702, + "loss": 0.7821, + "num_input_tokens_seen": 33590760, + "step": 57860 + }, + { + "epoch": 8.618558236520704, + "grad_norm": 0.0296630859375, + "learning_rate": 0.021056989523550926, + "loss": 0.7762, + "num_input_tokens_seen": 33593640, + "step": 57865 + }, + { + "epoch": 8.619302949061662, + "grad_norm": 0.0185546875, + "learning_rate": 0.02105520583714847, + "loss": 0.8011, + "num_input_tokens_seen": 33596648, + "step": 57870 + }, + { + "epoch": 8.620047661602621, + "grad_norm": 0.0230712890625, + "learning_rate": 0.021053422048449465, + "loss": 0.7825, + "num_input_tokens_seen": 33599592, + "step": 57875 + }, + { + "epoch": 8.62079237414358, + "grad_norm": 0.02099609375, + "learning_rate": 0.02105163815748404, + "loss": 0.7941, + "num_input_tokens_seen": 33602472, + "step": 57880 + }, + { + "epoch": 8.62153708668454, + "grad_norm": 0.0228271484375, + "learning_rate": 0.021049854164282348, + "loss": 0.7999, + "num_input_tokens_seen": 33605480, + "step": 57885 + }, + { + "epoch": 8.6222817992255, + "grad_norm": 0.022216796875, + "learning_rate": 0.021048070068874512, + "loss": 0.7877, + "num_input_tokens_seen": 33608392, + "step": 57890 + }, + { + "epoch": 8.623026511766458, + "grad_norm": 0.02001953125, + "learning_rate": 0.021046285871290683, + "loss": 0.8255, + "num_input_tokens_seen": 33611336, + "step": 57895 + }, + { + "epoch": 8.623771224307417, + "grad_norm": 0.038330078125, + "learning_rate": 0.021044501571560997, + "loss": 0.8092, + "num_input_tokens_seen": 33614376, + "step": 57900 + }, + { + "epoch": 8.624515936848377, + "grad_norm": 0.0252685546875, + "learning_rate": 0.021042717169715605, + "loss": 0.8056, + "num_input_tokens_seen": 33617288, + "step": 57905 + }, + { + "epoch": 8.625260649389336, + "grad_norm": 0.026611328125, + "learning_rate": 0.021040932665784648, + "loss": 0.8144, + "num_input_tokens_seen": 33620232, + "step": 57910 + }, + { + "epoch": 8.626005361930295, + "grad_norm": 0.033203125, + "learning_rate": 0.021039148059798268, + "loss": 0.7842, + "num_input_tokens_seen": 33623304, + "step": 57915 + }, + { + "epoch": 8.626750074471254, + "grad_norm": 0.029541015625, + "learning_rate": 0.02103736335178662, + "loss": 0.7768, + "num_input_tokens_seen": 33626120, + "step": 57920 + }, + { + "epoch": 8.627494787012214, + "grad_norm": 0.030029296875, + "learning_rate": 0.021035578541779856, + "loss": 0.7742, + "num_input_tokens_seen": 33628776, + "step": 57925 + }, + { + "epoch": 8.628239499553173, + "grad_norm": 0.0244140625, + "learning_rate": 0.021033793629808124, + "loss": 0.8368, + "num_input_tokens_seen": 33631720, + "step": 57930 + }, + { + "epoch": 8.628984212094132, + "grad_norm": 0.0250244140625, + "learning_rate": 0.021032008615901588, + "loss": 0.8012, + "num_input_tokens_seen": 33634696, + "step": 57935 + }, + { + "epoch": 8.62972892463509, + "grad_norm": 0.038818359375, + "learning_rate": 0.021030223500090393, + "loss": 0.7745, + "num_input_tokens_seen": 33637480, + "step": 57940 + }, + { + "epoch": 8.63047363717605, + "grad_norm": 0.0284423828125, + "learning_rate": 0.021028438282404704, + "loss": 0.8115, + "num_input_tokens_seen": 33640968, + "step": 57945 + }, + { + "epoch": 8.63121834971701, + "grad_norm": 0.0262451171875, + "learning_rate": 0.021026652962874672, + "loss": 0.8149, + "num_input_tokens_seen": 33644104, + "step": 57950 + }, + { + "epoch": 8.631963062257968, + "grad_norm": 0.023193359375, + "learning_rate": 0.021024867541530465, + "loss": 0.7915, + "num_input_tokens_seen": 33646856, + "step": 57955 + }, + { + "epoch": 8.632707774798927, + "grad_norm": 0.024658203125, + "learning_rate": 0.021023082018402243, + "loss": 0.8033, + "num_input_tokens_seen": 33649384, + "step": 57960 + }, + { + "epoch": 8.633452487339888, + "grad_norm": 0.0281982421875, + "learning_rate": 0.021021296393520173, + "loss": 0.799, + "num_input_tokens_seen": 33652328, + "step": 57965 + }, + { + "epoch": 8.634197199880846, + "grad_norm": 0.02197265625, + "learning_rate": 0.021019510666914425, + "loss": 0.7794, + "num_input_tokens_seen": 33655208, + "step": 57970 + }, + { + "epoch": 8.634941912421805, + "grad_norm": 0.03759765625, + "learning_rate": 0.021017724838615155, + "loss": 0.7951, + "num_input_tokens_seen": 33657928, + "step": 57975 + }, + { + "epoch": 8.635686624962764, + "grad_norm": 0.0201416015625, + "learning_rate": 0.021015938908652543, + "loss": 0.7957, + "num_input_tokens_seen": 33660904, + "step": 57980 + }, + { + "epoch": 8.636431337503723, + "grad_norm": 0.02734375, + "learning_rate": 0.021014152877056762, + "loss": 0.7773, + "num_input_tokens_seen": 33663720, + "step": 57985 + }, + { + "epoch": 8.637176050044683, + "grad_norm": 0.0172119140625, + "learning_rate": 0.021012366743857973, + "loss": 0.7747, + "num_input_tokens_seen": 33666440, + "step": 57990 + }, + { + "epoch": 8.637920762585642, + "grad_norm": 0.048583984375, + "learning_rate": 0.02101058050908636, + "loss": 0.824, + "num_input_tokens_seen": 33669128, + "step": 57995 + }, + { + "epoch": 8.6386654751266, + "grad_norm": 0.025146484375, + "learning_rate": 0.021008794172772103, + "loss": 0.7944, + "num_input_tokens_seen": 33671816, + "step": 58000 + }, + { + "epoch": 8.63941018766756, + "grad_norm": 0.03857421875, + "learning_rate": 0.02100700773494537, + "loss": 0.8064, + "num_input_tokens_seen": 33674696, + "step": 58005 + }, + { + "epoch": 8.64015490020852, + "grad_norm": 0.035400390625, + "learning_rate": 0.021005221195636352, + "loss": 0.7665, + "num_input_tokens_seen": 33677640, + "step": 58010 + }, + { + "epoch": 8.640899612749479, + "grad_norm": 0.0179443359375, + "learning_rate": 0.02100343455487522, + "loss": 0.7895, + "num_input_tokens_seen": 33680648, + "step": 58015 + }, + { + "epoch": 8.641644325290438, + "grad_norm": 0.0311279296875, + "learning_rate": 0.021001647812692165, + "loss": 0.7833, + "num_input_tokens_seen": 33683944, + "step": 58020 + }, + { + "epoch": 8.642389037831396, + "grad_norm": 0.032470703125, + "learning_rate": 0.020999860969117372, + "loss": 0.8246, + "num_input_tokens_seen": 33686760, + "step": 58025 + }, + { + "epoch": 8.643133750372357, + "grad_norm": 0.021484375, + "learning_rate": 0.020998074024181024, + "loss": 0.7673, + "num_input_tokens_seen": 33689608, + "step": 58030 + }, + { + "epoch": 8.643878462913316, + "grad_norm": 0.0228271484375, + "learning_rate": 0.020996286977913312, + "loss": 0.7931, + "num_input_tokens_seen": 33692904, + "step": 58035 + }, + { + "epoch": 8.644623175454274, + "grad_norm": 0.0281982421875, + "learning_rate": 0.020994499830344425, + "loss": 0.8262, + "num_input_tokens_seen": 33695784, + "step": 58040 + }, + { + "epoch": 8.645367887995233, + "grad_norm": 0.0189208984375, + "learning_rate": 0.020992712581504556, + "loss": 0.8088, + "num_input_tokens_seen": 33698760, + "step": 58045 + }, + { + "epoch": 8.646112600536194, + "grad_norm": 0.015380859375, + "learning_rate": 0.020990925231423898, + "loss": 0.81, + "num_input_tokens_seen": 33701768, + "step": 58050 + }, + { + "epoch": 8.646857313077152, + "grad_norm": 0.04052734375, + "learning_rate": 0.020989137780132647, + "loss": 0.8251, + "num_input_tokens_seen": 33704424, + "step": 58055 + }, + { + "epoch": 8.647602025618111, + "grad_norm": 0.035400390625, + "learning_rate": 0.020987350227661003, + "loss": 0.8049, + "num_input_tokens_seen": 33707240, + "step": 58060 + }, + { + "epoch": 8.64834673815907, + "grad_norm": 0.02685546875, + "learning_rate": 0.02098556257403916, + "loss": 0.8125, + "num_input_tokens_seen": 33710088, + "step": 58065 + }, + { + "epoch": 8.64909145070003, + "grad_norm": 0.0279541015625, + "learning_rate": 0.02098377481929732, + "loss": 0.8231, + "num_input_tokens_seen": 33713256, + "step": 58070 + }, + { + "epoch": 8.64983616324099, + "grad_norm": 0.01318359375, + "learning_rate": 0.02098198696346569, + "loss": 0.7825, + "num_input_tokens_seen": 33716008, + "step": 58075 + }, + { + "epoch": 8.650580875781948, + "grad_norm": 0.023681640625, + "learning_rate": 0.020980199006574465, + "loss": 0.8006, + "num_input_tokens_seen": 33719176, + "step": 58080 + }, + { + "epoch": 8.651325588322907, + "grad_norm": 0.0286865234375, + "learning_rate": 0.02097841094865386, + "loss": 0.8048, + "num_input_tokens_seen": 33721992, + "step": 58085 + }, + { + "epoch": 8.652070300863867, + "grad_norm": 0.0234375, + "learning_rate": 0.020976622789734077, + "loss": 0.8196, + "num_input_tokens_seen": 33724648, + "step": 58090 + }, + { + "epoch": 8.652815013404826, + "grad_norm": 0.0244140625, + "learning_rate": 0.02097483452984532, + "loss": 0.7878, + "num_input_tokens_seen": 33727624, + "step": 58095 + }, + { + "epoch": 8.653559725945785, + "grad_norm": 0.0167236328125, + "learning_rate": 0.02097304616901782, + "loss": 0.7755, + "num_input_tokens_seen": 33730248, + "step": 58100 + }, + { + "epoch": 8.654304438486744, + "grad_norm": 0.019287109375, + "learning_rate": 0.020971257707281764, + "loss": 0.7879, + "num_input_tokens_seen": 33733000, + "step": 58105 + }, + { + "epoch": 8.655049151027704, + "grad_norm": 0.029296875, + "learning_rate": 0.020969469144667387, + "loss": 0.7875, + "num_input_tokens_seen": 33735880, + "step": 58110 + }, + { + "epoch": 8.655793863568663, + "grad_norm": 0.01708984375, + "learning_rate": 0.02096768048120489, + "loss": 0.794, + "num_input_tokens_seen": 33738536, + "step": 58115 + }, + { + "epoch": 8.656538576109622, + "grad_norm": 0.033203125, + "learning_rate": 0.020965891716924496, + "loss": 0.8019, + "num_input_tokens_seen": 33741256, + "step": 58120 + }, + { + "epoch": 8.65728328865058, + "grad_norm": 0.01312255859375, + "learning_rate": 0.020964102851856424, + "loss": 0.7874, + "num_input_tokens_seen": 33744168, + "step": 58125 + }, + { + "epoch": 8.65802800119154, + "grad_norm": 0.0264892578125, + "learning_rate": 0.020962313886030894, + "loss": 0.841, + "num_input_tokens_seen": 33746888, + "step": 58130 + }, + { + "epoch": 8.6587727137325, + "grad_norm": 0.0247802734375, + "learning_rate": 0.02096052481947814, + "loss": 0.7753, + "num_input_tokens_seen": 33749832, + "step": 58135 + }, + { + "epoch": 8.659517426273458, + "grad_norm": 0.02734375, + "learning_rate": 0.02095873565222837, + "loss": 0.7926, + "num_input_tokens_seen": 33752872, + "step": 58140 + }, + { + "epoch": 8.660262138814417, + "grad_norm": 0.023193359375, + "learning_rate": 0.020956946384311823, + "loss": 0.8116, + "num_input_tokens_seen": 33755848, + "step": 58145 + }, + { + "epoch": 8.661006851355378, + "grad_norm": 0.0322265625, + "learning_rate": 0.02095515701575872, + "loss": 0.8149, + "num_input_tokens_seen": 33758824, + "step": 58150 + }, + { + "epoch": 8.661751563896336, + "grad_norm": 0.02197265625, + "learning_rate": 0.020953367546599287, + "loss": 0.7862, + "num_input_tokens_seen": 33761480, + "step": 58155 + }, + { + "epoch": 8.662496276437295, + "grad_norm": 0.03857421875, + "learning_rate": 0.02095157797686376, + "loss": 0.8274, + "num_input_tokens_seen": 33764360, + "step": 58160 + }, + { + "epoch": 8.663240988978254, + "grad_norm": 0.034912109375, + "learning_rate": 0.020949788306582375, + "loss": 0.8175, + "num_input_tokens_seen": 33767336, + "step": 58165 + }, + { + "epoch": 8.663985701519213, + "grad_norm": 0.02099609375, + "learning_rate": 0.02094799853578536, + "loss": 0.8117, + "num_input_tokens_seen": 33770376, + "step": 58170 + }, + { + "epoch": 8.664730414060173, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02094620866450296, + "loss": 0.8001, + "num_input_tokens_seen": 33773384, + "step": 58175 + }, + { + "epoch": 8.665475126601132, + "grad_norm": 0.02197265625, + "learning_rate": 0.020944418692765407, + "loss": 0.7898, + "num_input_tokens_seen": 33776296, + "step": 58180 + }, + { + "epoch": 8.66621983914209, + "grad_norm": 0.024658203125, + "learning_rate": 0.020942628620602942, + "loss": 0.8076, + "num_input_tokens_seen": 33779208, + "step": 58185 + }, + { + "epoch": 8.66696455168305, + "grad_norm": 0.019287109375, + "learning_rate": 0.020940838448045805, + "loss": 0.8169, + "num_input_tokens_seen": 33782280, + "step": 58190 + }, + { + "epoch": 8.66770926422401, + "grad_norm": 0.0235595703125, + "learning_rate": 0.020939048175124236, + "loss": 0.806, + "num_input_tokens_seen": 33785192, + "step": 58195 + }, + { + "epoch": 8.668453976764969, + "grad_norm": 0.0211181640625, + "learning_rate": 0.020937257801868493, + "loss": 0.805, + "num_input_tokens_seen": 33788040, + "step": 58200 + }, + { + "epoch": 8.669198689305928, + "grad_norm": 0.0286865234375, + "learning_rate": 0.02093546732830881, + "loss": 0.7913, + "num_input_tokens_seen": 33791080, + "step": 58205 + }, + { + "epoch": 8.669943401846886, + "grad_norm": 0.021240234375, + "learning_rate": 0.02093367675447544, + "loss": 0.7977, + "num_input_tokens_seen": 33793896, + "step": 58210 + }, + { + "epoch": 8.670688114387847, + "grad_norm": 0.0294189453125, + "learning_rate": 0.020931886080398625, + "loss": 0.7731, + "num_input_tokens_seen": 33796808, + "step": 58215 + }, + { + "epoch": 8.671432826928806, + "grad_norm": 0.021728515625, + "learning_rate": 0.02093009530610863, + "loss": 0.7829, + "num_input_tokens_seen": 33799912, + "step": 58220 + }, + { + "epoch": 8.672177539469764, + "grad_norm": 0.031494140625, + "learning_rate": 0.0209283044316357, + "loss": 0.8099, + "num_input_tokens_seen": 33803048, + "step": 58225 + }, + { + "epoch": 8.672922252010723, + "grad_norm": 0.02001953125, + "learning_rate": 0.020926513457010097, + "loss": 0.8039, + "num_input_tokens_seen": 33806056, + "step": 58230 + }, + { + "epoch": 8.673666964551684, + "grad_norm": 0.0244140625, + "learning_rate": 0.020924722382262066, + "loss": 0.7951, + "num_input_tokens_seen": 33809192, + "step": 58235 + }, + { + "epoch": 8.674411677092642, + "grad_norm": 0.03125, + "learning_rate": 0.020922931207421877, + "loss": 0.7996, + "num_input_tokens_seen": 33811848, + "step": 58240 + }, + { + "epoch": 8.675156389633601, + "grad_norm": 0.0250244140625, + "learning_rate": 0.020921139932519782, + "loss": 0.8164, + "num_input_tokens_seen": 33814920, + "step": 58245 + }, + { + "epoch": 8.67590110217456, + "grad_norm": 0.0257568359375, + "learning_rate": 0.020919348557586044, + "loss": 0.8127, + "num_input_tokens_seen": 33817704, + "step": 58250 + }, + { + "epoch": 8.67664581471552, + "grad_norm": 0.02294921875, + "learning_rate": 0.02091755708265093, + "loss": 0.8053, + "num_input_tokens_seen": 33820488, + "step": 58255 + }, + { + "epoch": 8.67739052725648, + "grad_norm": 0.0233154296875, + "learning_rate": 0.020915765507744703, + "loss": 0.7977, + "num_input_tokens_seen": 33823400, + "step": 58260 + }, + { + "epoch": 8.678135239797438, + "grad_norm": 0.0308837890625, + "learning_rate": 0.020913973832897635, + "loss": 0.8047, + "num_input_tokens_seen": 33826216, + "step": 58265 + }, + { + "epoch": 8.678879952338397, + "grad_norm": 0.0164794921875, + "learning_rate": 0.020912182058139984, + "loss": 0.7885, + "num_input_tokens_seen": 33829064, + "step": 58270 + }, + { + "epoch": 8.679624664879357, + "grad_norm": 0.03857421875, + "learning_rate": 0.02091039018350203, + "loss": 0.7992, + "num_input_tokens_seen": 33832104, + "step": 58275 + }, + { + "epoch": 8.680369377420316, + "grad_norm": 0.019775390625, + "learning_rate": 0.02090859820901404, + "loss": 0.8066, + "num_input_tokens_seen": 33835016, + "step": 58280 + }, + { + "epoch": 8.681114089961275, + "grad_norm": 0.0322265625, + "learning_rate": 0.020906806134706288, + "loss": 0.7855, + "num_input_tokens_seen": 33838248, + "step": 58285 + }, + { + "epoch": 8.681858802502234, + "grad_norm": 0.01531982421875, + "learning_rate": 0.020905013960609047, + "loss": 0.7893, + "num_input_tokens_seen": 33841160, + "step": 58290 + }, + { + "epoch": 8.682603515043194, + "grad_norm": 0.0269775390625, + "learning_rate": 0.020903221686752595, + "loss": 0.8029, + "num_input_tokens_seen": 33844008, + "step": 58295 + }, + { + "epoch": 8.683348227584153, + "grad_norm": 0.0191650390625, + "learning_rate": 0.02090142931316722, + "loss": 0.8041, + "num_input_tokens_seen": 33846696, + "step": 58300 + }, + { + "epoch": 8.684092940125112, + "grad_norm": 0.03662109375, + "learning_rate": 0.020899636839883193, + "loss": 0.789, + "num_input_tokens_seen": 33849736, + "step": 58305 + }, + { + "epoch": 8.68483765266607, + "grad_norm": 0.030517578125, + "learning_rate": 0.020897844266930796, + "loss": 0.7708, + "num_input_tokens_seen": 33852552, + "step": 58310 + }, + { + "epoch": 8.68558236520703, + "grad_norm": 0.04541015625, + "learning_rate": 0.020896051594340312, + "loss": 0.8339, + "num_input_tokens_seen": 33855400, + "step": 58315 + }, + { + "epoch": 8.68632707774799, + "grad_norm": 0.03369140625, + "learning_rate": 0.02089425882214203, + "loss": 0.8242, + "num_input_tokens_seen": 33858248, + "step": 58320 + }, + { + "epoch": 8.687071790288948, + "grad_norm": 0.031982421875, + "learning_rate": 0.020892465950366236, + "loss": 0.7861, + "num_input_tokens_seen": 33860904, + "step": 58325 + }, + { + "epoch": 8.687816502829907, + "grad_norm": 0.023681640625, + "learning_rate": 0.02089067297904322, + "loss": 0.7811, + "num_input_tokens_seen": 33863816, + "step": 58330 + }, + { + "epoch": 8.688561215370868, + "grad_norm": 0.03173828125, + "learning_rate": 0.020888879908203272, + "loss": 0.8119, + "num_input_tokens_seen": 33867048, + "step": 58335 + }, + { + "epoch": 8.689305927911827, + "grad_norm": 0.036376953125, + "learning_rate": 0.020887086737876683, + "loss": 0.8051, + "num_input_tokens_seen": 33870088, + "step": 58340 + }, + { + "epoch": 8.690050640452785, + "grad_norm": 0.0245361328125, + "learning_rate": 0.020885293468093744, + "loss": 0.8093, + "num_input_tokens_seen": 33873576, + "step": 58345 + }, + { + "epoch": 8.690795352993744, + "grad_norm": 0.022216796875, + "learning_rate": 0.020883500098884754, + "loss": 0.8146, + "num_input_tokens_seen": 33876520, + "step": 58350 + }, + { + "epoch": 8.691540065534703, + "grad_norm": 0.0252685546875, + "learning_rate": 0.020881706630280012, + "loss": 0.7726, + "num_input_tokens_seen": 33879368, + "step": 58355 + }, + { + "epoch": 8.692284778075663, + "grad_norm": 0.033935546875, + "learning_rate": 0.02087991306230981, + "loss": 0.794, + "num_input_tokens_seen": 33882536, + "step": 58360 + }, + { + "epoch": 8.693029490616622, + "grad_norm": 0.027587890625, + "learning_rate": 0.020878119395004457, + "loss": 0.8032, + "num_input_tokens_seen": 33885576, + "step": 58365 + }, + { + "epoch": 8.69377420315758, + "grad_norm": 0.0244140625, + "learning_rate": 0.020876325628394254, + "loss": 0.7812, + "num_input_tokens_seen": 33888456, + "step": 58370 + }, + { + "epoch": 8.69451891569854, + "grad_norm": 0.032470703125, + "learning_rate": 0.020874531762509495, + "loss": 0.8227, + "num_input_tokens_seen": 33891304, + "step": 58375 + }, + { + "epoch": 8.6952636282395, + "grad_norm": 0.0341796875, + "learning_rate": 0.020872737797380494, + "loss": 0.8266, + "num_input_tokens_seen": 33894024, + "step": 58380 + }, + { + "epoch": 8.696008340780459, + "grad_norm": 0.0263671875, + "learning_rate": 0.020870943733037557, + "loss": 0.7907, + "num_input_tokens_seen": 33896744, + "step": 58385 + }, + { + "epoch": 8.696753053321418, + "grad_norm": 0.02734375, + "learning_rate": 0.020869149569510994, + "loss": 0.8217, + "num_input_tokens_seen": 33899880, + "step": 58390 + }, + { + "epoch": 8.697497765862376, + "grad_norm": 0.03271484375, + "learning_rate": 0.020867355306831117, + "loss": 0.7787, + "num_input_tokens_seen": 33903080, + "step": 58395 + }, + { + "epoch": 8.698242478403337, + "grad_norm": 0.037353515625, + "learning_rate": 0.02086556094502823, + "loss": 0.7912, + "num_input_tokens_seen": 33906024, + "step": 58400 + }, + { + "epoch": 8.698987190944296, + "grad_norm": 0.0264892578125, + "learning_rate": 0.020863766484132656, + "loss": 0.7999, + "num_input_tokens_seen": 33908904, + "step": 58405 + }, + { + "epoch": 8.699731903485254, + "grad_norm": 0.0213623046875, + "learning_rate": 0.020861971924174706, + "loss": 0.8008, + "num_input_tokens_seen": 33911944, + "step": 58410 + }, + { + "epoch": 8.700476616026213, + "grad_norm": 0.0211181640625, + "learning_rate": 0.020860177265184694, + "loss": 0.8051, + "num_input_tokens_seen": 33914824, + "step": 58415 + }, + { + "epoch": 8.701221328567174, + "grad_norm": 0.02587890625, + "learning_rate": 0.020858382507192946, + "loss": 0.8189, + "num_input_tokens_seen": 33917544, + "step": 58420 + }, + { + "epoch": 8.701966041108133, + "grad_norm": 0.02734375, + "learning_rate": 0.020856587650229783, + "loss": 0.7627, + "num_input_tokens_seen": 33920168, + "step": 58425 + }, + { + "epoch": 8.702710753649091, + "grad_norm": 0.023193359375, + "learning_rate": 0.02085479269432552, + "loss": 0.8231, + "num_input_tokens_seen": 33923176, + "step": 58430 + }, + { + "epoch": 8.70345546619005, + "grad_norm": 0.0228271484375, + "learning_rate": 0.020852997639510488, + "loss": 0.8035, + "num_input_tokens_seen": 33926120, + "step": 58435 + }, + { + "epoch": 8.70420017873101, + "grad_norm": 0.029541015625, + "learning_rate": 0.02085120248581501, + "loss": 0.8245, + "num_input_tokens_seen": 33928744, + "step": 58440 + }, + { + "epoch": 8.70494489127197, + "grad_norm": 0.0220947265625, + "learning_rate": 0.020849407233269408, + "loss": 0.8052, + "num_input_tokens_seen": 33931464, + "step": 58445 + }, + { + "epoch": 8.705689603812928, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02084761188190402, + "loss": 0.7979, + "num_input_tokens_seen": 33934024, + "step": 58450 + }, + { + "epoch": 8.706434316353887, + "grad_norm": 0.02880859375, + "learning_rate": 0.020845816431749167, + "loss": 0.8138, + "num_input_tokens_seen": 33936840, + "step": 58455 + }, + { + "epoch": 8.707179028894847, + "grad_norm": 0.021240234375, + "learning_rate": 0.02084402088283519, + "loss": 0.8015, + "num_input_tokens_seen": 33939688, + "step": 58460 + }, + { + "epoch": 8.707923741435806, + "grad_norm": 0.034423828125, + "learning_rate": 0.020842225235192427, + "loss": 0.7882, + "num_input_tokens_seen": 33942408, + "step": 58465 + }, + { + "epoch": 8.708668453976765, + "grad_norm": 0.02685546875, + "learning_rate": 0.0208404294888512, + "loss": 0.7888, + "num_input_tokens_seen": 33945608, + "step": 58470 + }, + { + "epoch": 8.709413166517724, + "grad_norm": 0.0279541015625, + "learning_rate": 0.020838633643841854, + "loss": 0.808, + "num_input_tokens_seen": 33948584, + "step": 58475 + }, + { + "epoch": 8.710157879058684, + "grad_norm": 0.02001953125, + "learning_rate": 0.020836837700194726, + "loss": 0.8211, + "num_input_tokens_seen": 33951368, + "step": 58480 + }, + { + "epoch": 8.710902591599643, + "grad_norm": 0.015869140625, + "learning_rate": 0.020835041657940152, + "loss": 0.793, + "num_input_tokens_seen": 33954344, + "step": 58485 + }, + { + "epoch": 8.711647304140602, + "grad_norm": 0.015869140625, + "learning_rate": 0.020833245517108488, + "loss": 0.8076, + "num_input_tokens_seen": 33957192, + "step": 58490 + }, + { + "epoch": 8.71239201668156, + "grad_norm": 0.01373291015625, + "learning_rate": 0.020831449277730063, + "loss": 0.7811, + "num_input_tokens_seen": 33959912, + "step": 58495 + }, + { + "epoch": 8.71313672922252, + "grad_norm": 0.0289306640625, + "learning_rate": 0.020829652939835235, + "loss": 0.8002, + "num_input_tokens_seen": 33963016, + "step": 58500 + }, + { + "epoch": 8.71388144176348, + "grad_norm": 0.019775390625, + "learning_rate": 0.020827856503454343, + "loss": 0.7949, + "num_input_tokens_seen": 33966184, + "step": 58505 + }, + { + "epoch": 8.714626154304439, + "grad_norm": 0.0240478515625, + "learning_rate": 0.020826059968617743, + "loss": 0.7926, + "num_input_tokens_seen": 33969768, + "step": 58510 + }, + { + "epoch": 8.715370866845397, + "grad_norm": 0.0245361328125, + "learning_rate": 0.020824263335355774, + "loss": 0.7999, + "num_input_tokens_seen": 33972520, + "step": 58515 + }, + { + "epoch": 8.716115579386356, + "grad_norm": 0.02880859375, + "learning_rate": 0.020822466603698797, + "loss": 0.821, + "num_input_tokens_seen": 33975592, + "step": 58520 + }, + { + "epoch": 8.716860291927317, + "grad_norm": 0.031494140625, + "learning_rate": 0.020820669773677164, + "loss": 0.7933, + "num_input_tokens_seen": 33978600, + "step": 58525 + }, + { + "epoch": 8.717605004468275, + "grad_norm": 0.03173828125, + "learning_rate": 0.020818872845321232, + "loss": 0.7928, + "num_input_tokens_seen": 33981640, + "step": 58530 + }, + { + "epoch": 8.718349717009234, + "grad_norm": 0.0218505859375, + "learning_rate": 0.020817075818661362, + "loss": 0.7836, + "num_input_tokens_seen": 33984648, + "step": 58535 + }, + { + "epoch": 8.719094429550193, + "grad_norm": 0.0267333984375, + "learning_rate": 0.020815278693727905, + "loss": 0.7992, + "num_input_tokens_seen": 33987336, + "step": 58540 + }, + { + "epoch": 8.719839142091153, + "grad_norm": 0.0255126953125, + "learning_rate": 0.02081348147055122, + "loss": 0.8058, + "num_input_tokens_seen": 33990248, + "step": 58545 + }, + { + "epoch": 8.720583854632112, + "grad_norm": 0.018798828125, + "learning_rate": 0.02081168414916168, + "loss": 0.7826, + "num_input_tokens_seen": 33993320, + "step": 58550 + }, + { + "epoch": 8.721328567173071, + "grad_norm": 0.021240234375, + "learning_rate": 0.020809886729589642, + "loss": 0.803, + "num_input_tokens_seen": 33996232, + "step": 58555 + }, + { + "epoch": 8.72207327971403, + "grad_norm": 0.0169677734375, + "learning_rate": 0.02080808921186547, + "loss": 0.7852, + "num_input_tokens_seen": 33999016, + "step": 58560 + }, + { + "epoch": 8.72281799225499, + "grad_norm": 0.037841796875, + "learning_rate": 0.020806291596019537, + "loss": 0.8046, + "num_input_tokens_seen": 34002056, + "step": 58565 + }, + { + "epoch": 8.723562704795949, + "grad_norm": 0.0299072265625, + "learning_rate": 0.02080449388208221, + "loss": 0.808, + "num_input_tokens_seen": 34005256, + "step": 58570 + }, + { + "epoch": 8.724307417336908, + "grad_norm": 0.02392578125, + "learning_rate": 0.020802696070083854, + "loss": 0.8214, + "num_input_tokens_seen": 34007656, + "step": 58575 + }, + { + "epoch": 8.725052129877866, + "grad_norm": 0.0247802734375, + "learning_rate": 0.020800898160054846, + "loss": 0.7893, + "num_input_tokens_seen": 34010600, + "step": 58580 + }, + { + "epoch": 8.725796842418827, + "grad_norm": 0.0247802734375, + "learning_rate": 0.02079910015202556, + "loss": 0.799, + "num_input_tokens_seen": 34013640, + "step": 58585 + }, + { + "epoch": 8.726541554959786, + "grad_norm": 0.02197265625, + "learning_rate": 0.020797302046026374, + "loss": 0.8023, + "num_input_tokens_seen": 34016552, + "step": 58590 + }, + { + "epoch": 8.727286267500745, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02079550384208766, + "loss": 0.8041, + "num_input_tokens_seen": 34019720, + "step": 58595 + }, + { + "epoch": 8.728030980041703, + "grad_norm": 0.02001953125, + "learning_rate": 0.0207937055402398, + "loss": 0.7963, + "num_input_tokens_seen": 34022376, + "step": 58600 + }, + { + "epoch": 8.728775692582664, + "grad_norm": 0.031494140625, + "learning_rate": 0.020791907140513172, + "loss": 0.8199, + "num_input_tokens_seen": 34025320, + "step": 58605 + }, + { + "epoch": 8.729520405123623, + "grad_norm": 0.01495361328125, + "learning_rate": 0.020790108642938156, + "loss": 0.7896, + "num_input_tokens_seen": 34028008, + "step": 58610 + }, + { + "epoch": 8.730265117664581, + "grad_norm": 0.02734375, + "learning_rate": 0.020788310047545142, + "loss": 0.8255, + "num_input_tokens_seen": 34030792, + "step": 58615 + }, + { + "epoch": 8.73100983020554, + "grad_norm": 0.0263671875, + "learning_rate": 0.020786511354364514, + "loss": 0.7776, + "num_input_tokens_seen": 34033800, + "step": 58620 + }, + { + "epoch": 8.7317545427465, + "grad_norm": 0.0281982421875, + "learning_rate": 0.020784712563426656, + "loss": 0.7962, + "num_input_tokens_seen": 34036520, + "step": 58625 + }, + { + "epoch": 8.73249925528746, + "grad_norm": 0.025390625, + "learning_rate": 0.020782913674761965, + "loss": 0.8484, + "num_input_tokens_seen": 34039240, + "step": 58630 + }, + { + "epoch": 8.733243967828418, + "grad_norm": 0.016845703125, + "learning_rate": 0.02078111468840082, + "loss": 0.816, + "num_input_tokens_seen": 34041864, + "step": 58635 + }, + { + "epoch": 8.733988680369377, + "grad_norm": 0.019775390625, + "learning_rate": 0.020779315604373617, + "loss": 0.7941, + "num_input_tokens_seen": 34044648, + "step": 58640 + }, + { + "epoch": 8.734733392910336, + "grad_norm": 0.02197265625, + "learning_rate": 0.02077751642271075, + "loss": 0.8065, + "num_input_tokens_seen": 34047560, + "step": 58645 + }, + { + "epoch": 8.735478105451296, + "grad_norm": 0.0218505859375, + "learning_rate": 0.020775717143442615, + "loss": 0.7956, + "num_input_tokens_seen": 34050152, + "step": 58650 + }, + { + "epoch": 8.736222817992255, + "grad_norm": 0.029296875, + "learning_rate": 0.02077391776659961, + "loss": 0.8057, + "num_input_tokens_seen": 34053096, + "step": 58655 + }, + { + "epoch": 8.736967530533214, + "grad_norm": 0.0157470703125, + "learning_rate": 0.020772118292212132, + "loss": 0.8047, + "num_input_tokens_seen": 34056040, + "step": 58660 + }, + { + "epoch": 8.737712243074174, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02077031872031059, + "loss": 0.8248, + "num_input_tokens_seen": 34058984, + "step": 58665 + }, + { + "epoch": 8.738456955615133, + "grad_norm": 0.03076171875, + "learning_rate": 0.02076851905092537, + "loss": 0.7998, + "num_input_tokens_seen": 34061960, + "step": 58670 + }, + { + "epoch": 8.739201668156092, + "grad_norm": 0.0223388671875, + "learning_rate": 0.020766719284086882, + "loss": 0.797, + "num_input_tokens_seen": 34064872, + "step": 58675 + }, + { + "epoch": 8.73994638069705, + "grad_norm": 0.034423828125, + "learning_rate": 0.020764919419825535, + "loss": 0.8138, + "num_input_tokens_seen": 34067496, + "step": 58680 + }, + { + "epoch": 8.74069109323801, + "grad_norm": 0.0159912109375, + "learning_rate": 0.020763119458171733, + "loss": 0.7971, + "num_input_tokens_seen": 34070504, + "step": 58685 + }, + { + "epoch": 8.74143580577897, + "grad_norm": 0.03271484375, + "learning_rate": 0.020761319399155886, + "loss": 0.8185, + "num_input_tokens_seen": 34073576, + "step": 58690 + }, + { + "epoch": 8.742180518319929, + "grad_norm": 0.0478515625, + "learning_rate": 0.020759519242808402, + "loss": 0.8159, + "num_input_tokens_seen": 34076328, + "step": 58695 + }, + { + "epoch": 8.742925230860887, + "grad_norm": 0.0233154296875, + "learning_rate": 0.020757718989159697, + "loss": 0.7841, + "num_input_tokens_seen": 34078952, + "step": 58700 + }, + { + "epoch": 8.743669943401846, + "grad_norm": 0.0162353515625, + "learning_rate": 0.020755918638240178, + "loss": 0.8084, + "num_input_tokens_seen": 34081800, + "step": 58705 + }, + { + "epoch": 8.744414655942807, + "grad_norm": 0.01513671875, + "learning_rate": 0.020754118190080266, + "loss": 0.7939, + "num_input_tokens_seen": 34085128, + "step": 58710 + }, + { + "epoch": 8.745159368483765, + "grad_norm": 0.023193359375, + "learning_rate": 0.020752317644710378, + "loss": 0.8054, + "num_input_tokens_seen": 34087816, + "step": 58715 + }, + { + "epoch": 8.745904081024724, + "grad_norm": 0.02880859375, + "learning_rate": 0.020750517002160924, + "loss": 0.8133, + "num_input_tokens_seen": 34090920, + "step": 58720 + }, + { + "epoch": 8.746648793565683, + "grad_norm": 0.030029296875, + "learning_rate": 0.020748716262462333, + "loss": 0.8085, + "num_input_tokens_seen": 34093608, + "step": 58725 + }, + { + "epoch": 8.747393506106643, + "grad_norm": 0.0208740234375, + "learning_rate": 0.020746915425645024, + "loss": 0.8142, + "num_input_tokens_seen": 34096552, + "step": 58730 + }, + { + "epoch": 8.748138218647602, + "grad_norm": 0.021728515625, + "learning_rate": 0.02074511449173942, + "loss": 0.7898, + "num_input_tokens_seen": 34099336, + "step": 58735 + }, + { + "epoch": 8.748882931188561, + "grad_norm": 0.0208740234375, + "learning_rate": 0.02074331346077594, + "loss": 0.7959, + "num_input_tokens_seen": 34102280, + "step": 58740 + }, + { + "epoch": 8.74962764372952, + "grad_norm": 0.0234375, + "learning_rate": 0.020741512332785022, + "loss": 0.7841, + "num_input_tokens_seen": 34105192, + "step": 58745 + }, + { + "epoch": 8.75037235627048, + "grad_norm": 0.0206298828125, + "learning_rate": 0.02073971110779709, + "loss": 0.78, + "num_input_tokens_seen": 34108040, + "step": 58750 + }, + { + "epoch": 8.751117068811439, + "grad_norm": 0.018310546875, + "learning_rate": 0.02073790978584257, + "loss": 0.812, + "num_input_tokens_seen": 34111112, + "step": 58755 + }, + { + "epoch": 8.751861781352398, + "grad_norm": 0.0267333984375, + "learning_rate": 0.020736108366951896, + "loss": 0.8078, + "num_input_tokens_seen": 34114408, + "step": 58760 + }, + { + "epoch": 8.752606493893357, + "grad_norm": 0.029296875, + "learning_rate": 0.020734306851155498, + "loss": 0.8066, + "num_input_tokens_seen": 34117160, + "step": 58765 + }, + { + "epoch": 8.753351206434317, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02073250523848382, + "loss": 0.8253, + "num_input_tokens_seen": 34120072, + "step": 58770 + }, + { + "epoch": 8.754095918975276, + "grad_norm": 0.031005859375, + "learning_rate": 0.02073070352896729, + "loss": 0.816, + "num_input_tokens_seen": 34123112, + "step": 58775 + }, + { + "epoch": 8.754840631516235, + "grad_norm": 0.02294921875, + "learning_rate": 0.020728901722636344, + "loss": 0.8134, + "num_input_tokens_seen": 34125864, + "step": 58780 + }, + { + "epoch": 8.755585344057193, + "grad_norm": 0.0240478515625, + "learning_rate": 0.02072709981952143, + "loss": 0.7937, + "num_input_tokens_seen": 34129032, + "step": 58785 + }, + { + "epoch": 8.756330056598154, + "grad_norm": 0.0218505859375, + "learning_rate": 0.020725297819652988, + "loss": 0.8038, + "num_input_tokens_seen": 34131848, + "step": 58790 + }, + { + "epoch": 8.757074769139113, + "grad_norm": 0.0233154296875, + "learning_rate": 0.020723495723061452, + "loss": 0.805, + "num_input_tokens_seen": 34134920, + "step": 58795 + }, + { + "epoch": 8.757819481680071, + "grad_norm": 0.026123046875, + "learning_rate": 0.02072169352977728, + "loss": 0.7953, + "num_input_tokens_seen": 34137672, + "step": 58800 + }, + { + "epoch": 8.75856419422103, + "grad_norm": 0.022705078125, + "learning_rate": 0.020719891239830906, + "loss": 0.8071, + "num_input_tokens_seen": 34141160, + "step": 58805 + }, + { + "epoch": 8.75930890676199, + "grad_norm": 0.041259765625, + "learning_rate": 0.02071808885325278, + "loss": 0.8147, + "num_input_tokens_seen": 34144232, + "step": 58810 + }, + { + "epoch": 8.76005361930295, + "grad_norm": 0.03173828125, + "learning_rate": 0.020716286370073358, + "loss": 0.796, + "num_input_tokens_seen": 34147304, + "step": 58815 + }, + { + "epoch": 8.760798331843908, + "grad_norm": 0.0245361328125, + "learning_rate": 0.020714483790323087, + "loss": 0.7978, + "num_input_tokens_seen": 34149736, + "step": 58820 + }, + { + "epoch": 8.761543044384867, + "grad_norm": 0.048583984375, + "learning_rate": 0.020712681114032423, + "loss": 0.8099, + "num_input_tokens_seen": 34152552, + "step": 58825 + }, + { + "epoch": 8.762287756925826, + "grad_norm": 0.0218505859375, + "learning_rate": 0.020710878341231816, + "loss": 0.7844, + "num_input_tokens_seen": 34155400, + "step": 58830 + }, + { + "epoch": 8.763032469466786, + "grad_norm": 0.033935546875, + "learning_rate": 0.020709075471951725, + "loss": 0.7815, + "num_input_tokens_seen": 34158568, + "step": 58835 + }, + { + "epoch": 8.763777182007745, + "grad_norm": 0.0264892578125, + "learning_rate": 0.020707272506222604, + "loss": 0.8031, + "num_input_tokens_seen": 34161352, + "step": 58840 + }, + { + "epoch": 8.764521894548704, + "grad_norm": 0.016357421875, + "learning_rate": 0.020705469444074916, + "loss": 0.7981, + "num_input_tokens_seen": 34164328, + "step": 58845 + }, + { + "epoch": 8.765266607089664, + "grad_norm": 0.0299072265625, + "learning_rate": 0.02070366628553912, + "loss": 0.8001, + "num_input_tokens_seen": 34167176, + "step": 58850 + }, + { + "epoch": 8.766011319630623, + "grad_norm": 0.016845703125, + "learning_rate": 0.020701863030645674, + "loss": 0.8121, + "num_input_tokens_seen": 34170088, + "step": 58855 + }, + { + "epoch": 8.766756032171582, + "grad_norm": 0.02197265625, + "learning_rate": 0.020700059679425054, + "loss": 0.8169, + "num_input_tokens_seen": 34172808, + "step": 58860 + }, + { + "epoch": 8.76750074471254, + "grad_norm": 0.0201416015625, + "learning_rate": 0.02069825623190772, + "loss": 0.7649, + "num_input_tokens_seen": 34175624, + "step": 58865 + }, + { + "epoch": 8.7682454572535, + "grad_norm": 0.016845703125, + "learning_rate": 0.020696452688124138, + "loss": 0.8068, + "num_input_tokens_seen": 34178504, + "step": 58870 + }, + { + "epoch": 8.76899016979446, + "grad_norm": 0.040771484375, + "learning_rate": 0.020694649048104773, + "loss": 0.8049, + "num_input_tokens_seen": 34181224, + "step": 58875 + }, + { + "epoch": 8.769734882335419, + "grad_norm": 0.01373291015625, + "learning_rate": 0.020692845311880103, + "loss": 0.8171, + "num_input_tokens_seen": 34184072, + "step": 58880 + }, + { + "epoch": 8.770479594876377, + "grad_norm": 0.0206298828125, + "learning_rate": 0.020691041479480594, + "loss": 0.7859, + "num_input_tokens_seen": 34186600, + "step": 58885 + }, + { + "epoch": 8.771224307417336, + "grad_norm": 0.026611328125, + "learning_rate": 0.020689237550936726, + "loss": 0.7839, + "num_input_tokens_seen": 34189160, + "step": 58890 + }, + { + "epoch": 8.771969019958297, + "grad_norm": 0.0255126953125, + "learning_rate": 0.020687433526278966, + "loss": 0.8052, + "num_input_tokens_seen": 34192072, + "step": 58895 + }, + { + "epoch": 8.772713732499255, + "grad_norm": 0.03173828125, + "learning_rate": 0.02068562940553781, + "loss": 0.7952, + "num_input_tokens_seen": 34194920, + "step": 58900 + }, + { + "epoch": 8.773458445040214, + "grad_norm": 0.025146484375, + "learning_rate": 0.020683825188743715, + "loss": 0.7963, + "num_input_tokens_seen": 34197672, + "step": 58905 + }, + { + "epoch": 8.774203157581173, + "grad_norm": 0.028076171875, + "learning_rate": 0.020682020875927173, + "loss": 0.8005, + "num_input_tokens_seen": 34200648, + "step": 58910 + }, + { + "epoch": 8.774947870122134, + "grad_norm": 0.022705078125, + "learning_rate": 0.020680216467118664, + "loss": 0.8179, + "num_input_tokens_seen": 34203848, + "step": 58915 + }, + { + "epoch": 8.775692582663092, + "grad_norm": 0.03466796875, + "learning_rate": 0.020678411962348667, + "loss": 0.8055, + "num_input_tokens_seen": 34206888, + "step": 58920 + }, + { + "epoch": 8.776437295204051, + "grad_norm": 0.0257568359375, + "learning_rate": 0.020676607361647673, + "loss": 0.8018, + "num_input_tokens_seen": 34209768, + "step": 58925 + }, + { + "epoch": 8.77718200774501, + "grad_norm": 0.024658203125, + "learning_rate": 0.020674802665046163, + "loss": 0.8018, + "num_input_tokens_seen": 34212648, + "step": 58930 + }, + { + "epoch": 8.77792672028597, + "grad_norm": 0.03759765625, + "learning_rate": 0.02067299787257464, + "loss": 0.7944, + "num_input_tokens_seen": 34215560, + "step": 58935 + }, + { + "epoch": 8.778671432826929, + "grad_norm": 0.0279541015625, + "learning_rate": 0.020671192984263575, + "loss": 0.7904, + "num_input_tokens_seen": 34218568, + "step": 58940 + }, + { + "epoch": 8.779416145367888, + "grad_norm": 0.03125, + "learning_rate": 0.02066938800014347, + "loss": 0.7975, + "num_input_tokens_seen": 34221288, + "step": 58945 + }, + { + "epoch": 8.780160857908847, + "grad_norm": 0.0244140625, + "learning_rate": 0.020667582920244815, + "loss": 0.7852, + "num_input_tokens_seen": 34224072, + "step": 58950 + }, + { + "epoch": 8.780905570449807, + "grad_norm": 0.02685546875, + "learning_rate": 0.020665777744598112, + "loss": 0.7867, + "num_input_tokens_seen": 34227240, + "step": 58955 + }, + { + "epoch": 8.781650282990766, + "grad_norm": 0.022216796875, + "learning_rate": 0.02066397247323385, + "loss": 0.8062, + "num_input_tokens_seen": 34230344, + "step": 58960 + }, + { + "epoch": 8.782394995531725, + "grad_norm": 0.0213623046875, + "learning_rate": 0.020662167106182534, + "loss": 0.7651, + "num_input_tokens_seen": 34233544, + "step": 58965 + }, + { + "epoch": 8.783139708072683, + "grad_norm": 0.02587890625, + "learning_rate": 0.02066036164347465, + "loss": 0.7789, + "num_input_tokens_seen": 34236296, + "step": 58970 + }, + { + "epoch": 8.783884420613644, + "grad_norm": 0.0341796875, + "learning_rate": 0.020658556085140715, + "loss": 0.8078, + "num_input_tokens_seen": 34239240, + "step": 58975 + }, + { + "epoch": 8.784629133154603, + "grad_norm": 0.0263671875, + "learning_rate": 0.020656750431211228, + "loss": 0.8026, + "num_input_tokens_seen": 34242024, + "step": 58980 + }, + { + "epoch": 8.785373845695561, + "grad_norm": 0.0235595703125, + "learning_rate": 0.020654944681716685, + "loss": 0.791, + "num_input_tokens_seen": 34245032, + "step": 58985 + }, + { + "epoch": 8.78611855823652, + "grad_norm": 0.03515625, + "learning_rate": 0.020653138836687603, + "loss": 0.8424, + "num_input_tokens_seen": 34247752, + "step": 58990 + }, + { + "epoch": 8.78686327077748, + "grad_norm": 0.044189453125, + "learning_rate": 0.02065133289615449, + "loss": 0.7714, + "num_input_tokens_seen": 34250856, + "step": 58995 + }, + { + "epoch": 8.78760798331844, + "grad_norm": 0.0269775390625, + "learning_rate": 0.02064952686014785, + "loss": 0.7765, + "num_input_tokens_seen": 34253768, + "step": 59000 + }, + { + "epoch": 8.788352695859398, + "grad_norm": 0.0361328125, + "learning_rate": 0.02064772072869819, + "loss": 0.8155, + "num_input_tokens_seen": 34256872, + "step": 59005 + }, + { + "epoch": 8.789097408400357, + "grad_norm": 0.024658203125, + "learning_rate": 0.020645914501836032, + "loss": 0.858, + "num_input_tokens_seen": 34259720, + "step": 59010 + }, + { + "epoch": 8.789842120941316, + "grad_norm": 0.0198974609375, + "learning_rate": 0.020644108179591884, + "loss": 0.8035, + "num_input_tokens_seen": 34262408, + "step": 59015 + }, + { + "epoch": 8.790586833482276, + "grad_norm": 0.0220947265625, + "learning_rate": 0.020642301761996266, + "loss": 0.7838, + "num_input_tokens_seen": 34265416, + "step": 59020 + }, + { + "epoch": 8.791331546023235, + "grad_norm": 0.018798828125, + "learning_rate": 0.020640495249079696, + "loss": 0.7827, + "num_input_tokens_seen": 34268776, + "step": 59025 + }, + { + "epoch": 8.792076258564194, + "grad_norm": 0.023681640625, + "learning_rate": 0.02063868864087269, + "loss": 0.84, + "num_input_tokens_seen": 34271752, + "step": 59030 + }, + { + "epoch": 8.792820971105153, + "grad_norm": 0.0225830078125, + "learning_rate": 0.020636881937405777, + "loss": 0.8118, + "num_input_tokens_seen": 34274568, + "step": 59035 + }, + { + "epoch": 8.793565683646113, + "grad_norm": 0.0255126953125, + "learning_rate": 0.020635075138709467, + "loss": 0.8167, + "num_input_tokens_seen": 34278088, + "step": 59040 + }, + { + "epoch": 8.794310396187072, + "grad_norm": 0.01409912109375, + "learning_rate": 0.020633268244814287, + "loss": 0.7898, + "num_input_tokens_seen": 34281032, + "step": 59045 + }, + { + "epoch": 8.79505510872803, + "grad_norm": 0.0257568359375, + "learning_rate": 0.020631461255750765, + "loss": 0.7883, + "num_input_tokens_seen": 34283976, + "step": 59050 + }, + { + "epoch": 8.79579982126899, + "grad_norm": 0.025390625, + "learning_rate": 0.02062965417154943, + "loss": 0.8093, + "num_input_tokens_seen": 34286952, + "step": 59055 + }, + { + "epoch": 8.79654453380995, + "grad_norm": 0.035400390625, + "learning_rate": 0.020627846992240812, + "loss": 0.7963, + "num_input_tokens_seen": 34289928, + "step": 59060 + }, + { + "epoch": 8.797289246350909, + "grad_norm": 0.0322265625, + "learning_rate": 0.02062603971785544, + "loss": 0.8102, + "num_input_tokens_seen": 34293224, + "step": 59065 + }, + { + "epoch": 8.798033958891867, + "grad_norm": 0.0311279296875, + "learning_rate": 0.020624232348423845, + "loss": 0.7958, + "num_input_tokens_seen": 34295880, + "step": 59070 + }, + { + "epoch": 8.798778671432826, + "grad_norm": 0.0225830078125, + "learning_rate": 0.02062242488397656, + "loss": 0.7942, + "num_input_tokens_seen": 34298792, + "step": 59075 + }, + { + "epoch": 8.799523383973787, + "grad_norm": 0.030029296875, + "learning_rate": 0.020620617324544118, + "loss": 0.7935, + "num_input_tokens_seen": 34301576, + "step": 59080 + }, + { + "epoch": 8.800268096514746, + "grad_norm": 0.021728515625, + "learning_rate": 0.02061880967015706, + "loss": 0.8073, + "num_input_tokens_seen": 34304744, + "step": 59085 + }, + { + "epoch": 8.801012809055704, + "grad_norm": 0.0289306640625, + "learning_rate": 0.020617001920845923, + "loss": 0.8329, + "num_input_tokens_seen": 34307784, + "step": 59090 + }, + { + "epoch": 8.801757521596663, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02061519407664125, + "loss": 0.8093, + "num_input_tokens_seen": 34310600, + "step": 59095 + }, + { + "epoch": 8.802502234137624, + "grad_norm": 0.0245361328125, + "learning_rate": 0.020613386137573575, + "loss": 0.8079, + "num_input_tokens_seen": 34313384, + "step": 59100 + }, + { + "epoch": 8.803246946678582, + "grad_norm": 0.03173828125, + "learning_rate": 0.020611578103673454, + "loss": 0.7715, + "num_input_tokens_seen": 34316392, + "step": 59105 + }, + { + "epoch": 8.803991659219541, + "grad_norm": 0.0279541015625, + "learning_rate": 0.020609769974971416, + "loss": 0.7877, + "num_input_tokens_seen": 34319112, + "step": 59110 + }, + { + "epoch": 8.8047363717605, + "grad_norm": 0.023681640625, + "learning_rate": 0.02060796175149802, + "loss": 0.8065, + "num_input_tokens_seen": 34322088, + "step": 59115 + }, + { + "epoch": 8.80548108430146, + "grad_norm": 0.026123046875, + "learning_rate": 0.020606153433283814, + "loss": 0.7818, + "num_input_tokens_seen": 34324712, + "step": 59120 + }, + { + "epoch": 8.80622579684242, + "grad_norm": 0.0244140625, + "learning_rate": 0.02060434502035934, + "loss": 0.8035, + "num_input_tokens_seen": 34327656, + "step": 59125 + }, + { + "epoch": 8.806970509383378, + "grad_norm": 0.024658203125, + "learning_rate": 0.020602536512755152, + "loss": 0.7854, + "num_input_tokens_seen": 34330472, + "step": 59130 + }, + { + "epoch": 8.807715221924337, + "grad_norm": 0.01708984375, + "learning_rate": 0.0206007279105018, + "loss": 0.8043, + "num_input_tokens_seen": 34333192, + "step": 59135 + }, + { + "epoch": 8.808459934465297, + "grad_norm": 0.0228271484375, + "learning_rate": 0.02059891921362985, + "loss": 0.8054, + "num_input_tokens_seen": 34336200, + "step": 59140 + }, + { + "epoch": 8.809204647006256, + "grad_norm": 0.0255126953125, + "learning_rate": 0.020597110422169843, + "loss": 0.8111, + "num_input_tokens_seen": 34339112, + "step": 59145 + }, + { + "epoch": 8.809949359547215, + "grad_norm": 0.0311279296875, + "learning_rate": 0.020595301536152348, + "loss": 0.8342, + "num_input_tokens_seen": 34341928, + "step": 59150 + }, + { + "epoch": 8.810694072088173, + "grad_norm": 0.030517578125, + "learning_rate": 0.020593492555607922, + "loss": 0.8307, + "num_input_tokens_seen": 34344776, + "step": 59155 + }, + { + "epoch": 8.811438784629132, + "grad_norm": 0.0218505859375, + "learning_rate": 0.020591683480567124, + "loss": 0.7883, + "num_input_tokens_seen": 34347816, + "step": 59160 + }, + { + "epoch": 8.812183497170093, + "grad_norm": 0.026611328125, + "learning_rate": 0.020589874311060515, + "loss": 0.7626, + "num_input_tokens_seen": 34350568, + "step": 59165 + }, + { + "epoch": 8.812928209711052, + "grad_norm": 0.0252685546875, + "learning_rate": 0.020588065047118662, + "loss": 0.7728, + "num_input_tokens_seen": 34353384, + "step": 59170 + }, + { + "epoch": 8.81367292225201, + "grad_norm": 0.0244140625, + "learning_rate": 0.020586255688772124, + "loss": 0.7991, + "num_input_tokens_seen": 34356200, + "step": 59175 + }, + { + "epoch": 8.81441763479297, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02058444623605148, + "loss": 0.7817, + "num_input_tokens_seen": 34359240, + "step": 59180 + }, + { + "epoch": 8.81516234733393, + "grad_norm": 0.0224609375, + "learning_rate": 0.02058263668898729, + "loss": 0.8096, + "num_input_tokens_seen": 34361992, + "step": 59185 + }, + { + "epoch": 8.815907059874888, + "grad_norm": 0.038818359375, + "learning_rate": 0.020580827047610127, + "loss": 0.831, + "num_input_tokens_seen": 34364776, + "step": 59190 + }, + { + "epoch": 8.816651772415847, + "grad_norm": 0.0322265625, + "learning_rate": 0.020579017311950564, + "loss": 0.8235, + "num_input_tokens_seen": 34367880, + "step": 59195 + }, + { + "epoch": 8.817396484956806, + "grad_norm": 0.0196533203125, + "learning_rate": 0.020577207482039176, + "loss": 0.8244, + "num_input_tokens_seen": 34371048, + "step": 59200 + }, + { + "epoch": 8.818141197497766, + "grad_norm": 0.0194091796875, + "learning_rate": 0.020575397557906532, + "loss": 0.7913, + "num_input_tokens_seen": 34373736, + "step": 59205 + }, + { + "epoch": 8.818885910038725, + "grad_norm": 0.022705078125, + "learning_rate": 0.020573587539583216, + "loss": 0.779, + "num_input_tokens_seen": 34376648, + "step": 59210 + }, + { + "epoch": 8.819630622579684, + "grad_norm": 0.026123046875, + "learning_rate": 0.020571777427099803, + "loss": 0.7768, + "num_input_tokens_seen": 34379368, + "step": 59215 + }, + { + "epoch": 8.820375335120643, + "grad_norm": 0.0225830078125, + "learning_rate": 0.02056996722048687, + "loss": 0.772, + "num_input_tokens_seen": 34382152, + "step": 59220 + }, + { + "epoch": 8.821120047661603, + "grad_norm": 0.03271484375, + "learning_rate": 0.020568156919775003, + "loss": 0.7998, + "num_input_tokens_seen": 34385032, + "step": 59225 + }, + { + "epoch": 8.821864760202562, + "grad_norm": 0.0205078125, + "learning_rate": 0.020566346524994783, + "loss": 0.7987, + "num_input_tokens_seen": 34388072, + "step": 59230 + }, + { + "epoch": 8.82260947274352, + "grad_norm": 0.0218505859375, + "learning_rate": 0.020564536036176806, + "loss": 0.7872, + "num_input_tokens_seen": 34390856, + "step": 59235 + }, + { + "epoch": 8.82335418528448, + "grad_norm": 0.033203125, + "learning_rate": 0.020562725453351638, + "loss": 0.8017, + "num_input_tokens_seen": 34393672, + "step": 59240 + }, + { + "epoch": 8.82409889782544, + "grad_norm": 0.034912109375, + "learning_rate": 0.020560914776549876, + "loss": 0.7969, + "num_input_tokens_seen": 34396616, + "step": 59245 + }, + { + "epoch": 8.824843610366399, + "grad_norm": 0.032470703125, + "learning_rate": 0.020559104005802114, + "loss": 0.8118, + "num_input_tokens_seen": 34399336, + "step": 59250 + }, + { + "epoch": 8.825588322907358, + "grad_norm": 0.0216064453125, + "learning_rate": 0.020557293141138938, + "loss": 0.7695, + "num_input_tokens_seen": 34402184, + "step": 59255 + }, + { + "epoch": 8.826333035448316, + "grad_norm": 0.0242919921875, + "learning_rate": 0.020555482182590946, + "loss": 0.7831, + "num_input_tokens_seen": 34405064, + "step": 59260 + }, + { + "epoch": 8.827077747989277, + "grad_norm": 0.01483154296875, + "learning_rate": 0.02055367113018873, + "loss": 0.8301, + "num_input_tokens_seen": 34407656, + "step": 59265 + }, + { + "epoch": 8.827822460530236, + "grad_norm": 0.03173828125, + "learning_rate": 0.02055185998396288, + "loss": 0.8108, + "num_input_tokens_seen": 34410376, + "step": 59270 + }, + { + "epoch": 8.828567173071194, + "grad_norm": 0.0267333984375, + "learning_rate": 0.020550048743944, + "loss": 0.7853, + "num_input_tokens_seen": 34413352, + "step": 59275 + }, + { + "epoch": 8.829311885612153, + "grad_norm": 0.045654296875, + "learning_rate": 0.02054823741016269, + "loss": 0.8227, + "num_input_tokens_seen": 34416424, + "step": 59280 + }, + { + "epoch": 8.830056598153114, + "grad_norm": 0.0252685546875, + "learning_rate": 0.02054642598264954, + "loss": 0.7935, + "num_input_tokens_seen": 34419624, + "step": 59285 + }, + { + "epoch": 8.830801310694072, + "grad_norm": 0.0267333984375, + "learning_rate": 0.020544614461435167, + "loss": 0.8093, + "num_input_tokens_seen": 34422280, + "step": 59290 + }, + { + "epoch": 8.831546023235031, + "grad_norm": 0.0279541015625, + "learning_rate": 0.02054280284655017, + "loss": 0.788, + "num_input_tokens_seen": 34425032, + "step": 59295 + }, + { + "epoch": 8.83229073577599, + "grad_norm": 0.0184326171875, + "learning_rate": 0.02054099113802515, + "loss": 0.8186, + "num_input_tokens_seen": 34427848, + "step": 59300 + }, + { + "epoch": 8.83303544831695, + "grad_norm": 0.02197265625, + "learning_rate": 0.020539179335890712, + "loss": 0.797, + "num_input_tokens_seen": 34430600, + "step": 59305 + }, + { + "epoch": 8.83378016085791, + "grad_norm": 0.03857421875, + "learning_rate": 0.020537367440177475, + "loss": 0.7893, + "num_input_tokens_seen": 34433384, + "step": 59310 + }, + { + "epoch": 8.834524873398868, + "grad_norm": 0.036376953125, + "learning_rate": 0.020535555450916038, + "loss": 0.809, + "num_input_tokens_seen": 34436264, + "step": 59315 + }, + { + "epoch": 8.835269585939827, + "grad_norm": 0.0252685546875, + "learning_rate": 0.02053374336813702, + "loss": 0.7976, + "num_input_tokens_seen": 34439112, + "step": 59320 + }, + { + "epoch": 8.836014298480787, + "grad_norm": 0.03173828125, + "learning_rate": 0.020531931191871033, + "loss": 0.7959, + "num_input_tokens_seen": 34442056, + "step": 59325 + }, + { + "epoch": 8.836759011021746, + "grad_norm": 0.03076171875, + "learning_rate": 0.02053011892214869, + "loss": 0.8253, + "num_input_tokens_seen": 34444808, + "step": 59330 + }, + { + "epoch": 8.837503723562705, + "grad_norm": 0.032470703125, + "learning_rate": 0.02052830655900061, + "loss": 0.7976, + "num_input_tokens_seen": 34448040, + "step": 59335 + }, + { + "epoch": 8.838248436103664, + "grad_norm": 0.0380859375, + "learning_rate": 0.020526494102457406, + "loss": 0.8177, + "num_input_tokens_seen": 34451016, + "step": 59340 + }, + { + "epoch": 8.838993148644622, + "grad_norm": 0.038818359375, + "learning_rate": 0.0205246815525497, + "loss": 0.7887, + "num_input_tokens_seen": 34454088, + "step": 59345 + }, + { + "epoch": 8.839737861185583, + "grad_norm": 0.0233154296875, + "learning_rate": 0.020522868909308115, + "loss": 0.7999, + "num_input_tokens_seen": 34456936, + "step": 59350 + }, + { + "epoch": 8.840482573726542, + "grad_norm": 0.0272216796875, + "learning_rate": 0.02052105617276328, + "loss": 0.826, + "num_input_tokens_seen": 34460168, + "step": 59355 + }, + { + "epoch": 8.8412272862675, + "grad_norm": 0.0213623046875, + "learning_rate": 0.020519243342945803, + "loss": 0.8226, + "num_input_tokens_seen": 34463112, + "step": 59360 + }, + { + "epoch": 8.84197199880846, + "grad_norm": 0.0233154296875, + "learning_rate": 0.020517430419886327, + "loss": 0.7939, + "num_input_tokens_seen": 34465896, + "step": 59365 + }, + { + "epoch": 8.84271671134942, + "grad_norm": 0.0302734375, + "learning_rate": 0.020515617403615467, + "loss": 0.8145, + "num_input_tokens_seen": 34468776, + "step": 59370 + }, + { + "epoch": 8.843461423890378, + "grad_norm": 0.02978515625, + "learning_rate": 0.020513804294163857, + "loss": 0.7874, + "num_input_tokens_seen": 34471624, + "step": 59375 + }, + { + "epoch": 8.844206136431337, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02051199109156213, + "loss": 0.8057, + "num_input_tokens_seen": 34474536, + "step": 59380 + }, + { + "epoch": 8.844950848972296, + "grad_norm": 0.027099609375, + "learning_rate": 0.02051017779584091, + "loss": 0.8294, + "num_input_tokens_seen": 34477512, + "step": 59385 + }, + { + "epoch": 8.845695561513256, + "grad_norm": 0.024658203125, + "learning_rate": 0.020508364407030844, + "loss": 0.8075, + "num_input_tokens_seen": 34480168, + "step": 59390 + }, + { + "epoch": 8.846440274054215, + "grad_norm": 0.018798828125, + "learning_rate": 0.020506550925162556, + "loss": 0.8117, + "num_input_tokens_seen": 34483144, + "step": 59395 + }, + { + "epoch": 8.847184986595174, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02050473735026669, + "loss": 0.8194, + "num_input_tokens_seen": 34485832, + "step": 59400 + }, + { + "epoch": 8.847929699136133, + "grad_norm": 0.045166015625, + "learning_rate": 0.02050292368237388, + "loss": 0.8148, + "num_input_tokens_seen": 34488872, + "step": 59405 + }, + { + "epoch": 8.848674411677093, + "grad_norm": 0.026611328125, + "learning_rate": 0.020501109921514764, + "loss": 0.7882, + "num_input_tokens_seen": 34491720, + "step": 59410 + }, + { + "epoch": 8.849419124218052, + "grad_norm": 0.029296875, + "learning_rate": 0.020499296067719986, + "loss": 0.8207, + "num_input_tokens_seen": 34494632, + "step": 59415 + }, + { + "epoch": 8.85016383675901, + "grad_norm": 0.02490234375, + "learning_rate": 0.02049748212102019, + "loss": 0.7999, + "num_input_tokens_seen": 34497672, + "step": 59420 + }, + { + "epoch": 8.85090854929997, + "grad_norm": 0.033447265625, + "learning_rate": 0.020495668081446025, + "loss": 0.7911, + "num_input_tokens_seen": 34500424, + "step": 59425 + }, + { + "epoch": 8.85165326184093, + "grad_norm": 0.02099609375, + "learning_rate": 0.020493853949028133, + "loss": 0.8016, + "num_input_tokens_seen": 34503208, + "step": 59430 + }, + { + "epoch": 8.852397974381889, + "grad_norm": 0.03515625, + "learning_rate": 0.02049203972379716, + "loss": 0.8075, + "num_input_tokens_seen": 34506024, + "step": 59435 + }, + { + "epoch": 8.853142686922848, + "grad_norm": 0.03955078125, + "learning_rate": 0.02049022540578376, + "loss": 0.7938, + "num_input_tokens_seen": 34508968, + "step": 59440 + }, + { + "epoch": 8.853887399463806, + "grad_norm": 0.022216796875, + "learning_rate": 0.02048841099501858, + "loss": 0.8097, + "num_input_tokens_seen": 34511656, + "step": 59445 + }, + { + "epoch": 8.854632112004767, + "grad_norm": 0.0303955078125, + "learning_rate": 0.020486596491532276, + "loss": 0.8192, + "num_input_tokens_seen": 34514696, + "step": 59450 + }, + { + "epoch": 8.855376824545726, + "grad_norm": 0.023681640625, + "learning_rate": 0.020484781895355495, + "loss": 0.7994, + "num_input_tokens_seen": 34517480, + "step": 59455 + }, + { + "epoch": 8.856121537086684, + "grad_norm": 0.024169921875, + "learning_rate": 0.020482967206518907, + "loss": 0.8188, + "num_input_tokens_seen": 34520488, + "step": 59460 + }, + { + "epoch": 8.856866249627643, + "grad_norm": 0.031494140625, + "learning_rate": 0.02048115242505316, + "loss": 0.7786, + "num_input_tokens_seen": 34523208, + "step": 59465 + }, + { + "epoch": 8.857610962168604, + "grad_norm": 0.01556396484375, + "learning_rate": 0.020479337550988904, + "loss": 0.8185, + "num_input_tokens_seen": 34526248, + "step": 59470 + }, + { + "epoch": 8.858355674709562, + "grad_norm": 0.04541015625, + "learning_rate": 0.02047752258435682, + "loss": 0.803, + "num_input_tokens_seen": 34529000, + "step": 59475 + }, + { + "epoch": 8.859100387250521, + "grad_norm": 0.01611328125, + "learning_rate": 0.020475707525187552, + "loss": 0.8, + "num_input_tokens_seen": 34531976, + "step": 59480 + }, + { + "epoch": 8.85984509979148, + "grad_norm": 0.021484375, + "learning_rate": 0.020473892373511774, + "loss": 0.7831, + "num_input_tokens_seen": 34534728, + "step": 59485 + }, + { + "epoch": 8.86058981233244, + "grad_norm": 0.0220947265625, + "learning_rate": 0.020472077129360146, + "loss": 0.8007, + "num_input_tokens_seen": 34537800, + "step": 59490 + }, + { + "epoch": 8.8613345248734, + "grad_norm": 0.025634765625, + "learning_rate": 0.020470261792763338, + "loss": 0.7768, + "num_input_tokens_seen": 34540392, + "step": 59495 + }, + { + "epoch": 8.862079237414358, + "grad_norm": 0.03662109375, + "learning_rate": 0.020468446363752016, + "loss": 0.7803, + "num_input_tokens_seen": 34543240, + "step": 59500 + }, + { + "epoch": 8.862823949955317, + "grad_norm": 0.027099609375, + "learning_rate": 0.02046663084235685, + "loss": 0.8179, + "num_input_tokens_seen": 34545960, + "step": 59505 + }, + { + "epoch": 8.863568662496277, + "grad_norm": 0.017822265625, + "learning_rate": 0.02046481522860851, + "loss": 0.8113, + "num_input_tokens_seen": 34548616, + "step": 59510 + }, + { + "epoch": 8.864313375037236, + "grad_norm": 0.025634765625, + "learning_rate": 0.02046299952253768, + "loss": 0.82, + "num_input_tokens_seen": 34551432, + "step": 59515 + }, + { + "epoch": 8.865058087578195, + "grad_norm": 0.0230712890625, + "learning_rate": 0.020461183724175016, + "loss": 0.7993, + "num_input_tokens_seen": 34554216, + "step": 59520 + }, + { + "epoch": 8.865802800119154, + "grad_norm": 0.0242919921875, + "learning_rate": 0.020459367833551206, + "loss": 0.8056, + "num_input_tokens_seen": 34557576, + "step": 59525 + }, + { + "epoch": 8.866547512660112, + "grad_norm": 0.0242919921875, + "learning_rate": 0.020457551850696927, + "loss": 0.8001, + "num_input_tokens_seen": 34560328, + "step": 59530 + }, + { + "epoch": 8.867292225201073, + "grad_norm": 0.0238037109375, + "learning_rate": 0.020455735775642855, + "loss": 0.7967, + "num_input_tokens_seen": 34563176, + "step": 59535 + }, + { + "epoch": 8.868036937742032, + "grad_norm": 0.0228271484375, + "learning_rate": 0.02045391960841967, + "loss": 0.813, + "num_input_tokens_seen": 34566184, + "step": 59540 + }, + { + "epoch": 8.86878165028299, + "grad_norm": 0.0260009765625, + "learning_rate": 0.020452103349058056, + "loss": 0.7998, + "num_input_tokens_seen": 34569000, + "step": 59545 + }, + { + "epoch": 8.86952636282395, + "grad_norm": 0.025634765625, + "learning_rate": 0.0204502869975887, + "loss": 0.8059, + "num_input_tokens_seen": 34571720, + "step": 59550 + }, + { + "epoch": 8.87027107536491, + "grad_norm": 0.0208740234375, + "learning_rate": 0.020448470554042284, + "loss": 0.8041, + "num_input_tokens_seen": 34574376, + "step": 59555 + }, + { + "epoch": 8.871015787905868, + "grad_norm": 0.037353515625, + "learning_rate": 0.020446654018449497, + "loss": 0.8027, + "num_input_tokens_seen": 34577032, + "step": 59560 + }, + { + "epoch": 8.871760500446827, + "grad_norm": 0.0263671875, + "learning_rate": 0.020444837390841023, + "loss": 0.7998, + "num_input_tokens_seen": 34579944, + "step": 59565 + }, + { + "epoch": 8.872505212987786, + "grad_norm": 0.022705078125, + "learning_rate": 0.020443020671247557, + "loss": 0.8048, + "num_input_tokens_seen": 34582504, + "step": 59570 + }, + { + "epoch": 8.873249925528746, + "grad_norm": 0.033203125, + "learning_rate": 0.02044120385969979, + "loss": 0.8005, + "num_input_tokens_seen": 34585288, + "step": 59575 + }, + { + "epoch": 8.873994638069705, + "grad_norm": 0.0267333984375, + "learning_rate": 0.02043938695622841, + "loss": 0.793, + "num_input_tokens_seen": 34588168, + "step": 59580 + }, + { + "epoch": 8.874739350610664, + "grad_norm": 0.019287109375, + "learning_rate": 0.02043756996086412, + "loss": 0.7969, + "num_input_tokens_seen": 34590792, + "step": 59585 + }, + { + "epoch": 8.875484063151623, + "grad_norm": 0.0240478515625, + "learning_rate": 0.020435752873637605, + "loss": 0.7834, + "num_input_tokens_seen": 34593576, + "step": 59590 + }, + { + "epoch": 8.876228775692583, + "grad_norm": 0.0263671875, + "learning_rate": 0.020433935694579578, + "loss": 0.8035, + "num_input_tokens_seen": 34596616, + "step": 59595 + }, + { + "epoch": 8.876973488233542, + "grad_norm": 0.01806640625, + "learning_rate": 0.02043211842372073, + "loss": 0.7933, + "num_input_tokens_seen": 34599656, + "step": 59600 + }, + { + "epoch": 8.8777182007745, + "grad_norm": 0.0147705078125, + "learning_rate": 0.020430301061091758, + "loss": 0.8042, + "num_input_tokens_seen": 34602408, + "step": 59605 + }, + { + "epoch": 8.87846291331546, + "grad_norm": 0.0262451171875, + "learning_rate": 0.020428483606723367, + "loss": 0.7989, + "num_input_tokens_seen": 34605448, + "step": 59610 + }, + { + "epoch": 8.87920762585642, + "grad_norm": 0.0257568359375, + "learning_rate": 0.02042666606064627, + "loss": 0.7982, + "num_input_tokens_seen": 34608520, + "step": 59615 + }, + { + "epoch": 8.879952338397379, + "grad_norm": 0.0213623046875, + "learning_rate": 0.02042484842289116, + "loss": 0.8085, + "num_input_tokens_seen": 34611752, + "step": 59620 + }, + { + "epoch": 8.880697050938338, + "grad_norm": 0.03515625, + "learning_rate": 0.02042303069348875, + "loss": 0.8166, + "num_input_tokens_seen": 34614568, + "step": 59625 + }, + { + "epoch": 8.881441763479296, + "grad_norm": 0.02001953125, + "learning_rate": 0.02042121287246975, + "loss": 0.7511, + "num_input_tokens_seen": 34617256, + "step": 59630 + }, + { + "epoch": 8.882186476020257, + "grad_norm": 0.0162353515625, + "learning_rate": 0.020419394959864865, + "loss": 0.7886, + "num_input_tokens_seen": 34620168, + "step": 59635 + }, + { + "epoch": 8.882931188561216, + "grad_norm": 0.0238037109375, + "learning_rate": 0.020417576955704814, + "loss": 0.8079, + "num_input_tokens_seen": 34623048, + "step": 59640 + }, + { + "epoch": 8.883675901102174, + "grad_norm": 0.03271484375, + "learning_rate": 0.020415758860020304, + "loss": 0.7828, + "num_input_tokens_seen": 34625896, + "step": 59645 + }, + { + "epoch": 8.884420613643133, + "grad_norm": 0.0284423828125, + "learning_rate": 0.02041394067284205, + "loss": 0.8304, + "num_input_tokens_seen": 34628744, + "step": 59650 + }, + { + "epoch": 8.885165326184094, + "grad_norm": 0.0269775390625, + "learning_rate": 0.020412122394200777, + "loss": 0.8491, + "num_input_tokens_seen": 34631464, + "step": 59655 + }, + { + "epoch": 8.885910038725052, + "grad_norm": 0.02392578125, + "learning_rate": 0.020410304024127197, + "loss": 0.8016, + "num_input_tokens_seen": 34634376, + "step": 59660 + }, + { + "epoch": 8.886654751266011, + "grad_norm": 0.023193359375, + "learning_rate": 0.020408485562652023, + "loss": 0.8022, + "num_input_tokens_seen": 34637192, + "step": 59665 + }, + { + "epoch": 8.88739946380697, + "grad_norm": 0.022216796875, + "learning_rate": 0.020406667009805986, + "loss": 0.8152, + "num_input_tokens_seen": 34639848, + "step": 59670 + }, + { + "epoch": 8.88814417634793, + "grad_norm": 0.0228271484375, + "learning_rate": 0.020404848365619805, + "loss": 0.7878, + "num_input_tokens_seen": 34642472, + "step": 59675 + }, + { + "epoch": 8.88888888888889, + "grad_norm": 0.02294921875, + "learning_rate": 0.020403029630124206, + "loss": 0.8058, + "num_input_tokens_seen": 34645288, + "step": 59680 + }, + { + "epoch": 8.889633601429848, + "grad_norm": 0.0303955078125, + "learning_rate": 0.020401210803349907, + "loss": 0.7962, + "num_input_tokens_seen": 34648136, + "step": 59685 + }, + { + "epoch": 8.890378313970807, + "grad_norm": 0.0284423828125, + "learning_rate": 0.020399391885327647, + "loss": 0.8005, + "num_input_tokens_seen": 34650856, + "step": 59690 + }, + { + "epoch": 8.891123026511767, + "grad_norm": 0.0206298828125, + "learning_rate": 0.020397572876088146, + "loss": 0.8195, + "num_input_tokens_seen": 34653800, + "step": 59695 + }, + { + "epoch": 8.891867739052726, + "grad_norm": 0.02294921875, + "learning_rate": 0.020395753775662134, + "loss": 0.8248, + "num_input_tokens_seen": 34656712, + "step": 59700 + }, + { + "epoch": 8.892612451593685, + "grad_norm": 0.0179443359375, + "learning_rate": 0.02039393458408035, + "loss": 0.8184, + "num_input_tokens_seen": 34659528, + "step": 59705 + }, + { + "epoch": 8.893357164134644, + "grad_norm": 0.0164794921875, + "learning_rate": 0.020392115301373522, + "loss": 0.8179, + "num_input_tokens_seen": 34662216, + "step": 59710 + }, + { + "epoch": 8.894101876675602, + "grad_norm": 0.03125, + "learning_rate": 0.020390295927572386, + "loss": 0.8072, + "num_input_tokens_seen": 34665192, + "step": 59715 + }, + { + "epoch": 8.894846589216563, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02038847646270768, + "loss": 0.8162, + "num_input_tokens_seen": 34667784, + "step": 59720 + }, + { + "epoch": 8.895591301757522, + "grad_norm": 0.017822265625, + "learning_rate": 0.02038665690681014, + "loss": 0.8028, + "num_input_tokens_seen": 34670824, + "step": 59725 + }, + { + "epoch": 8.89633601429848, + "grad_norm": 0.0191650390625, + "learning_rate": 0.020384837259910513, + "loss": 0.7954, + "num_input_tokens_seen": 34673608, + "step": 59730 + }, + { + "epoch": 8.89708072683944, + "grad_norm": 0.0238037109375, + "learning_rate": 0.020383017522039526, + "loss": 0.8141, + "num_input_tokens_seen": 34676584, + "step": 59735 + }, + { + "epoch": 8.8978254393804, + "grad_norm": 0.0213623046875, + "learning_rate": 0.020381197693227927, + "loss": 0.7867, + "num_input_tokens_seen": 34679368, + "step": 59740 + }, + { + "epoch": 8.898570151921358, + "grad_norm": 0.0291748046875, + "learning_rate": 0.020379377773506464, + "loss": 0.8037, + "num_input_tokens_seen": 34682248, + "step": 59745 + }, + { + "epoch": 8.899314864462317, + "grad_norm": 0.0283203125, + "learning_rate": 0.020377557762905882, + "loss": 0.8207, + "num_input_tokens_seen": 34685032, + "step": 59750 + }, + { + "epoch": 8.900059577003276, + "grad_norm": 0.0267333984375, + "learning_rate": 0.020375737661456925, + "loss": 0.8029, + "num_input_tokens_seen": 34687688, + "step": 59755 + }, + { + "epoch": 8.900804289544237, + "grad_norm": 0.0203857421875, + "learning_rate": 0.02037391746919035, + "loss": 0.7822, + "num_input_tokens_seen": 34690728, + "step": 59760 + }, + { + "epoch": 8.901549002085195, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0203720971861369, + "loss": 0.8049, + "num_input_tokens_seen": 34693864, + "step": 59765 + }, + { + "epoch": 8.902293714626154, + "grad_norm": 0.021240234375, + "learning_rate": 0.020370276812327323, + "loss": 0.7944, + "num_input_tokens_seen": 34696904, + "step": 59770 + }, + { + "epoch": 8.903038427167113, + "grad_norm": 0.021240234375, + "learning_rate": 0.02036845634779238, + "loss": 0.7994, + "num_input_tokens_seen": 34699944, + "step": 59775 + }, + { + "epoch": 8.903783139708073, + "grad_norm": 0.0308837890625, + "learning_rate": 0.020366635792562822, + "loss": 0.8147, + "num_input_tokens_seen": 34703048, + "step": 59780 + }, + { + "epoch": 8.904527852249032, + "grad_norm": 0.0189208984375, + "learning_rate": 0.020364815146669404, + "loss": 0.8073, + "num_input_tokens_seen": 34705928, + "step": 59785 + }, + { + "epoch": 8.90527256478999, + "grad_norm": 0.0242919921875, + "learning_rate": 0.020362994410142894, + "loss": 0.8273, + "num_input_tokens_seen": 34708968, + "step": 59790 + }, + { + "epoch": 8.90601727733095, + "grad_norm": 0.0233154296875, + "learning_rate": 0.02036117358301404, + "loss": 0.7983, + "num_input_tokens_seen": 34711816, + "step": 59795 + }, + { + "epoch": 8.90676198987191, + "grad_norm": 0.041015625, + "learning_rate": 0.02035935266531361, + "loss": 0.8022, + "num_input_tokens_seen": 34714440, + "step": 59800 + }, + { + "epoch": 8.907506702412869, + "grad_norm": 0.026123046875, + "learning_rate": 0.020357531657072363, + "loss": 0.7953, + "num_input_tokens_seen": 34717320, + "step": 59805 + }, + { + "epoch": 8.908251414953828, + "grad_norm": 0.03271484375, + "learning_rate": 0.02035571055832106, + "loss": 0.8028, + "num_input_tokens_seen": 34720328, + "step": 59810 + }, + { + "epoch": 8.908996127494786, + "grad_norm": 0.0303955078125, + "learning_rate": 0.02035388936909047, + "loss": 0.8051, + "num_input_tokens_seen": 34723560, + "step": 59815 + }, + { + "epoch": 8.909740840035747, + "grad_norm": 0.0240478515625, + "learning_rate": 0.020352068089411365, + "loss": 0.7883, + "num_input_tokens_seen": 34726568, + "step": 59820 + }, + { + "epoch": 8.910485552576706, + "grad_norm": 0.0162353515625, + "learning_rate": 0.02035024671931451, + "loss": 0.8084, + "num_input_tokens_seen": 34729320, + "step": 59825 + }, + { + "epoch": 8.911230265117664, + "grad_norm": 0.0238037109375, + "learning_rate": 0.02034842525883067, + "loss": 0.8111, + "num_input_tokens_seen": 34732104, + "step": 59830 + }, + { + "epoch": 8.911974977658623, + "grad_norm": 0.015625, + "learning_rate": 0.020346603707990622, + "loss": 0.8007, + "num_input_tokens_seen": 34734856, + "step": 59835 + }, + { + "epoch": 8.912719690199584, + "grad_norm": 0.034912109375, + "learning_rate": 0.02034478206682514, + "loss": 0.7894, + "num_input_tokens_seen": 34737576, + "step": 59840 + }, + { + "epoch": 8.913464402740543, + "grad_norm": 0.0299072265625, + "learning_rate": 0.020342960335364996, + "loss": 0.8004, + "num_input_tokens_seen": 34740584, + "step": 59845 + }, + { + "epoch": 8.914209115281501, + "grad_norm": 0.0294189453125, + "learning_rate": 0.020341138513640968, + "loss": 0.7957, + "num_input_tokens_seen": 34743656, + "step": 59850 + }, + { + "epoch": 8.91495382782246, + "grad_norm": 0.0279541015625, + "learning_rate": 0.020339316601683834, + "loss": 0.8051, + "num_input_tokens_seen": 34746568, + "step": 59855 + }, + { + "epoch": 8.915698540363419, + "grad_norm": 0.0155029296875, + "learning_rate": 0.020337494599524372, + "loss": 0.7988, + "num_input_tokens_seen": 34749736, + "step": 59860 + }, + { + "epoch": 8.91644325290438, + "grad_norm": 0.0166015625, + "learning_rate": 0.020335672507193363, + "loss": 0.8016, + "num_input_tokens_seen": 34752648, + "step": 59865 + }, + { + "epoch": 8.917187965445338, + "grad_norm": 0.0223388671875, + "learning_rate": 0.02033385032472159, + "loss": 0.7935, + "num_input_tokens_seen": 34755880, + "step": 59870 + }, + { + "epoch": 8.917932677986297, + "grad_norm": 0.0233154296875, + "learning_rate": 0.020332028052139836, + "loss": 0.7853, + "num_input_tokens_seen": 34758696, + "step": 59875 + }, + { + "epoch": 8.918677390527257, + "grad_norm": 0.026611328125, + "learning_rate": 0.02033020568947889, + "loss": 0.802, + "num_input_tokens_seen": 34761480, + "step": 59880 + }, + { + "epoch": 8.919422103068216, + "grad_norm": 0.03857421875, + "learning_rate": 0.020328383236769533, + "loss": 0.8105, + "num_input_tokens_seen": 34764744, + "step": 59885 + }, + { + "epoch": 8.920166815609175, + "grad_norm": 0.029296875, + "learning_rate": 0.02032656069404256, + "loss": 0.8025, + "num_input_tokens_seen": 34767496, + "step": 59890 + }, + { + "epoch": 8.920911528150134, + "grad_norm": 0.03515625, + "learning_rate": 0.02032473806132876, + "loss": 0.7843, + "num_input_tokens_seen": 34771560, + "step": 59895 + }, + { + "epoch": 8.921656240691092, + "grad_norm": 0.0235595703125, + "learning_rate": 0.020322915338658914, + "loss": 0.7966, + "num_input_tokens_seen": 34774504, + "step": 59900 + }, + { + "epoch": 8.922400953232053, + "grad_norm": 0.0283203125, + "learning_rate": 0.020321092526063827, + "loss": 0.7725, + "num_input_tokens_seen": 34777352, + "step": 59905 + }, + { + "epoch": 8.923145665773012, + "grad_norm": 0.03759765625, + "learning_rate": 0.02031926962357429, + "loss": 0.7892, + "num_input_tokens_seen": 34780008, + "step": 59910 + }, + { + "epoch": 8.92389037831397, + "grad_norm": 0.046630859375, + "learning_rate": 0.0203174466312211, + "loss": 0.8186, + "num_input_tokens_seen": 34783144, + "step": 59915 + }, + { + "epoch": 8.92463509085493, + "grad_norm": 0.01556396484375, + "learning_rate": 0.020315623549035055, + "loss": 0.7918, + "num_input_tokens_seen": 34785960, + "step": 59920 + }, + { + "epoch": 8.92537980339589, + "grad_norm": 0.0166015625, + "learning_rate": 0.02031380037704695, + "loss": 0.8192, + "num_input_tokens_seen": 34788680, + "step": 59925 + }, + { + "epoch": 8.926124515936849, + "grad_norm": 0.0228271484375, + "learning_rate": 0.02031197711528759, + "loss": 0.7665, + "num_input_tokens_seen": 34791400, + "step": 59930 + }, + { + "epoch": 8.926869228477807, + "grad_norm": 0.0311279296875, + "learning_rate": 0.020310153763787777, + "loss": 0.7905, + "num_input_tokens_seen": 34794440, + "step": 59935 + }, + { + "epoch": 8.927613941018766, + "grad_norm": 0.03271484375, + "learning_rate": 0.020308330322578307, + "loss": 0.8169, + "num_input_tokens_seen": 34797768, + "step": 59940 + }, + { + "epoch": 8.928358653559727, + "grad_norm": 0.0296630859375, + "learning_rate": 0.020306506791689993, + "loss": 0.8287, + "num_input_tokens_seen": 34800456, + "step": 59945 + }, + { + "epoch": 8.929103366100685, + "grad_norm": 0.032958984375, + "learning_rate": 0.020304683171153647, + "loss": 0.8062, + "num_input_tokens_seen": 34803464, + "step": 59950 + }, + { + "epoch": 8.929848078641644, + "grad_norm": 0.0233154296875, + "learning_rate": 0.020302859461000064, + "loss": 0.8105, + "num_input_tokens_seen": 34806472, + "step": 59955 + }, + { + "epoch": 8.930592791182603, + "grad_norm": 0.045166015625, + "learning_rate": 0.020301035661260063, + "loss": 0.8127, + "num_input_tokens_seen": 34809480, + "step": 59960 + }, + { + "epoch": 8.931337503723563, + "grad_norm": 0.0274658203125, + "learning_rate": 0.02029921177196445, + "loss": 0.7942, + "num_input_tokens_seen": 34812328, + "step": 59965 + }, + { + "epoch": 8.932082216264522, + "grad_norm": 0.021240234375, + "learning_rate": 0.020297387793144044, + "loss": 0.8056, + "num_input_tokens_seen": 34814952, + "step": 59970 + }, + { + "epoch": 8.932826928805481, + "grad_norm": 0.03173828125, + "learning_rate": 0.020295563724829654, + "loss": 0.7992, + "num_input_tokens_seen": 34817512, + "step": 59975 + }, + { + "epoch": 8.93357164134644, + "grad_norm": 0.0546875, + "learning_rate": 0.020293739567052087, + "loss": 0.803, + "num_input_tokens_seen": 34820424, + "step": 59980 + }, + { + "epoch": 8.9343163538874, + "grad_norm": 0.042236328125, + "learning_rate": 0.02029191531984218, + "loss": 0.8042, + "num_input_tokens_seen": 34823112, + "step": 59985 + }, + { + "epoch": 8.935061066428359, + "grad_norm": 0.0291748046875, + "learning_rate": 0.020290090983230747, + "loss": 0.8158, + "num_input_tokens_seen": 34825800, + "step": 59990 + }, + { + "epoch": 8.935805778969318, + "grad_norm": 0.0615234375, + "learning_rate": 0.020288266557248594, + "loss": 0.7934, + "num_input_tokens_seen": 34828648, + "step": 59995 + }, + { + "epoch": 8.936550491510276, + "grad_norm": 0.03564453125, + "learning_rate": 0.020286442041926558, + "loss": 0.7917, + "num_input_tokens_seen": 34831496, + "step": 60000 + }, + { + "epoch": 8.937295204051237, + "grad_norm": 0.04248046875, + "learning_rate": 0.02028461743729545, + "loss": 0.7869, + "num_input_tokens_seen": 34834248, + "step": 60005 + }, + { + "epoch": 8.938039916592196, + "grad_norm": 0.020263671875, + "learning_rate": 0.020282792743386107, + "loss": 0.7965, + "num_input_tokens_seen": 34837064, + "step": 60010 + }, + { + "epoch": 8.938784629133155, + "grad_norm": 0.02587890625, + "learning_rate": 0.020280967960229348, + "loss": 0.8226, + "num_input_tokens_seen": 34839944, + "step": 60015 + }, + { + "epoch": 8.939529341674113, + "grad_norm": 0.0281982421875, + "learning_rate": 0.020279143087856, + "loss": 0.7746, + "num_input_tokens_seen": 34843176, + "step": 60020 + }, + { + "epoch": 8.940274054215074, + "grad_norm": 0.039794921875, + "learning_rate": 0.0202773181262969, + "loss": 0.7845, + "num_input_tokens_seen": 34845960, + "step": 60025 + }, + { + "epoch": 8.941018766756033, + "grad_norm": 0.041748046875, + "learning_rate": 0.020275493075582872, + "loss": 0.8116, + "num_input_tokens_seen": 34848872, + "step": 60030 + }, + { + "epoch": 8.941763479296991, + "grad_norm": 0.04638671875, + "learning_rate": 0.020273667935744748, + "loss": 0.8289, + "num_input_tokens_seen": 34852168, + "step": 60035 + }, + { + "epoch": 8.94250819183795, + "grad_norm": 0.259765625, + "learning_rate": 0.02027184270681337, + "loss": 0.8131, + "num_input_tokens_seen": 34855144, + "step": 60040 + }, + { + "epoch": 8.943252904378909, + "grad_norm": 0.11572265625, + "learning_rate": 0.02027001738881956, + "loss": 0.8203, + "num_input_tokens_seen": 34858152, + "step": 60045 + }, + { + "epoch": 8.94399761691987, + "grad_norm": 0.041748046875, + "learning_rate": 0.020268191981794168, + "loss": 0.8164, + "num_input_tokens_seen": 34860872, + "step": 60050 + }, + { + "epoch": 8.944742329460828, + "grad_norm": 0.04248046875, + "learning_rate": 0.02026636648576802, + "loss": 0.7982, + "num_input_tokens_seen": 34863560, + "step": 60055 + }, + { + "epoch": 8.945487042001787, + "grad_norm": 0.0478515625, + "learning_rate": 0.020264540900771972, + "loss": 0.8165, + "num_input_tokens_seen": 34866600, + "step": 60060 + }, + { + "epoch": 8.946231754542747, + "grad_norm": 0.037353515625, + "learning_rate": 0.020262715226836852, + "loss": 0.7972, + "num_input_tokens_seen": 34869608, + "step": 60065 + }, + { + "epoch": 8.946976467083706, + "grad_norm": 0.02392578125, + "learning_rate": 0.020260889463993504, + "loss": 0.8179, + "num_input_tokens_seen": 34872584, + "step": 60070 + }, + { + "epoch": 8.947721179624665, + "grad_norm": 0.033203125, + "learning_rate": 0.02025906361227278, + "loss": 0.8048, + "num_input_tokens_seen": 34875560, + "step": 60075 + }, + { + "epoch": 8.948465892165624, + "grad_norm": 0.03662109375, + "learning_rate": 0.02025723767170552, + "loss": 0.7948, + "num_input_tokens_seen": 34878216, + "step": 60080 + }, + { + "epoch": 8.949210604706582, + "grad_norm": 0.020263671875, + "learning_rate": 0.02025541164232257, + "loss": 0.7936, + "num_input_tokens_seen": 34880872, + "step": 60085 + }, + { + "epoch": 8.949955317247543, + "grad_norm": 0.03564453125, + "learning_rate": 0.020253585524154787, + "loss": 0.8135, + "num_input_tokens_seen": 34884072, + "step": 60090 + }, + { + "epoch": 8.950700029788502, + "grad_norm": 0.0303955078125, + "learning_rate": 0.020251759317233015, + "loss": 0.8079, + "num_input_tokens_seen": 34886952, + "step": 60095 + }, + { + "epoch": 8.95144474232946, + "grad_norm": 0.0189208984375, + "learning_rate": 0.020249933021588104, + "loss": 0.7943, + "num_input_tokens_seen": 34889928, + "step": 60100 + }, + { + "epoch": 8.95218945487042, + "grad_norm": 0.0291748046875, + "learning_rate": 0.02024810663725091, + "loss": 0.7951, + "num_input_tokens_seen": 34892584, + "step": 60105 + }, + { + "epoch": 8.95293416741138, + "grad_norm": 0.02197265625, + "learning_rate": 0.020246280164252287, + "loss": 0.7995, + "num_input_tokens_seen": 34895432, + "step": 60110 + }, + { + "epoch": 8.953678879952339, + "grad_norm": 0.045654296875, + "learning_rate": 0.020244453602623096, + "loss": 0.7962, + "num_input_tokens_seen": 34898280, + "step": 60115 + }, + { + "epoch": 8.954423592493297, + "grad_norm": 0.037353515625, + "learning_rate": 0.02024262695239419, + "loss": 0.8009, + "num_input_tokens_seen": 34901256, + "step": 60120 + }, + { + "epoch": 8.955168305034256, + "grad_norm": 0.0203857421875, + "learning_rate": 0.020240800213596435, + "loss": 0.8355, + "num_input_tokens_seen": 34904072, + "step": 60125 + }, + { + "epoch": 8.955913017575217, + "grad_norm": 0.0223388671875, + "learning_rate": 0.020238973386260684, + "loss": 0.8013, + "num_input_tokens_seen": 34906888, + "step": 60130 + }, + { + "epoch": 8.956657730116175, + "grad_norm": 0.03125, + "learning_rate": 0.0202371464704178, + "loss": 0.791, + "num_input_tokens_seen": 34909992, + "step": 60135 + }, + { + "epoch": 8.957402442657134, + "grad_norm": 0.031982421875, + "learning_rate": 0.02023531946609865, + "loss": 0.7823, + "num_input_tokens_seen": 34912840, + "step": 60140 + }, + { + "epoch": 8.958147155198093, + "grad_norm": 0.031494140625, + "learning_rate": 0.0202334923733341, + "loss": 0.7907, + "num_input_tokens_seen": 34915944, + "step": 60145 + }, + { + "epoch": 8.958891867739053, + "grad_norm": 0.02099609375, + "learning_rate": 0.02023166519215501, + "loss": 0.7782, + "num_input_tokens_seen": 34918696, + "step": 60150 + }, + { + "epoch": 8.959636580280012, + "grad_norm": 0.03466796875, + "learning_rate": 0.02022983792259226, + "loss": 0.8354, + "num_input_tokens_seen": 34921736, + "step": 60155 + }, + { + "epoch": 8.960381292820971, + "grad_norm": 0.045166015625, + "learning_rate": 0.020228010564676713, + "loss": 0.8216, + "num_input_tokens_seen": 34924872, + "step": 60160 + }, + { + "epoch": 8.96112600536193, + "grad_norm": 0.021484375, + "learning_rate": 0.020226183118439243, + "loss": 0.7935, + "num_input_tokens_seen": 34927656, + "step": 60165 + }, + { + "epoch": 8.96187071790289, + "grad_norm": 0.021728515625, + "learning_rate": 0.020224355583910718, + "loss": 0.7936, + "num_input_tokens_seen": 34930632, + "step": 60170 + }, + { + "epoch": 8.962615430443849, + "grad_norm": 0.0244140625, + "learning_rate": 0.020222527961122012, + "loss": 0.7799, + "num_input_tokens_seen": 34933736, + "step": 60175 + }, + { + "epoch": 8.963360142984808, + "grad_norm": 0.03125, + "learning_rate": 0.020220700250104005, + "loss": 0.8146, + "num_input_tokens_seen": 34936552, + "step": 60180 + }, + { + "epoch": 8.964104855525767, + "grad_norm": 0.0277099609375, + "learning_rate": 0.020218872450887576, + "loss": 0.8069, + "num_input_tokens_seen": 34939432, + "step": 60185 + }, + { + "epoch": 8.964849568066727, + "grad_norm": 0.0159912109375, + "learning_rate": 0.0202170445635036, + "loss": 0.821, + "num_input_tokens_seen": 34941992, + "step": 60190 + }, + { + "epoch": 8.965594280607686, + "grad_norm": 0.026123046875, + "learning_rate": 0.02021521658798296, + "loss": 0.823, + "num_input_tokens_seen": 34944808, + "step": 60195 + }, + { + "epoch": 8.966338993148645, + "grad_norm": 0.0201416015625, + "learning_rate": 0.020213388524356536, + "loss": 0.8038, + "num_input_tokens_seen": 34948072, + "step": 60200 + }, + { + "epoch": 8.967083705689603, + "grad_norm": 0.0303955078125, + "learning_rate": 0.02021156037265521, + "loss": 0.8173, + "num_input_tokens_seen": 34950696, + "step": 60205 + }, + { + "epoch": 8.967828418230564, + "grad_norm": 0.015869140625, + "learning_rate": 0.020209732132909868, + "loss": 0.8108, + "num_input_tokens_seen": 34953640, + "step": 60210 + }, + { + "epoch": 8.968573130771523, + "grad_norm": 0.040771484375, + "learning_rate": 0.0202079038051514, + "loss": 0.7986, + "num_input_tokens_seen": 34956392, + "step": 60215 + }, + { + "epoch": 8.969317843312481, + "grad_norm": 0.0419921875, + "learning_rate": 0.020206075389410685, + "loss": 0.8111, + "num_input_tokens_seen": 34959336, + "step": 60220 + }, + { + "epoch": 8.97006255585344, + "grad_norm": 0.02685546875, + "learning_rate": 0.020204246885718622, + "loss": 0.7977, + "num_input_tokens_seen": 34962024, + "step": 60225 + }, + { + "epoch": 8.970807268394399, + "grad_norm": 0.03466796875, + "learning_rate": 0.020202418294106097, + "loss": 0.8016, + "num_input_tokens_seen": 34964968, + "step": 60230 + }, + { + "epoch": 8.97155198093536, + "grad_norm": 0.043212890625, + "learning_rate": 0.020200589614604, + "loss": 0.8046, + "num_input_tokens_seen": 34967912, + "step": 60235 + }, + { + "epoch": 8.972296693476318, + "grad_norm": 0.031494140625, + "learning_rate": 0.020198760847243227, + "loss": 0.8155, + "num_input_tokens_seen": 34970632, + "step": 60240 + }, + { + "epoch": 8.973041406017277, + "grad_norm": 0.026123046875, + "learning_rate": 0.02019693199205468, + "loss": 0.8031, + "num_input_tokens_seen": 34973320, + "step": 60245 + }, + { + "epoch": 8.973786118558236, + "grad_norm": 0.024658203125, + "learning_rate": 0.020195103049069237, + "loss": 0.8052, + "num_input_tokens_seen": 34976264, + "step": 60250 + }, + { + "epoch": 8.974530831099196, + "grad_norm": 0.02294921875, + "learning_rate": 0.020193274018317817, + "loss": 0.8041, + "num_input_tokens_seen": 34978984, + "step": 60255 + }, + { + "epoch": 8.975275543640155, + "grad_norm": 0.0208740234375, + "learning_rate": 0.02019144489983131, + "loss": 0.801, + "num_input_tokens_seen": 34981928, + "step": 60260 + }, + { + "epoch": 8.976020256181114, + "grad_norm": 0.0322265625, + "learning_rate": 0.020189615693640615, + "loss": 0.7933, + "num_input_tokens_seen": 34984680, + "step": 60265 + }, + { + "epoch": 8.976764968722073, + "grad_norm": 0.052978515625, + "learning_rate": 0.020187786399776635, + "loss": 0.809, + "num_input_tokens_seen": 34987400, + "step": 60270 + }, + { + "epoch": 8.977509681263033, + "grad_norm": 0.023681640625, + "learning_rate": 0.02018595701827028, + "loss": 0.8017, + "num_input_tokens_seen": 34990376, + "step": 60275 + }, + { + "epoch": 8.978254393803992, + "grad_norm": 0.04052734375, + "learning_rate": 0.020184127549152452, + "loss": 0.8122, + "num_input_tokens_seen": 34993128, + "step": 60280 + }, + { + "epoch": 8.97899910634495, + "grad_norm": 0.031494140625, + "learning_rate": 0.020182297992454058, + "loss": 0.8087, + "num_input_tokens_seen": 34995848, + "step": 60285 + }, + { + "epoch": 8.97974381888591, + "grad_norm": 0.0145263671875, + "learning_rate": 0.020180468348206006, + "loss": 0.8032, + "num_input_tokens_seen": 34998568, + "step": 60290 + }, + { + "epoch": 8.98048853142687, + "grad_norm": 0.0230712890625, + "learning_rate": 0.020178638616439207, + "loss": 0.7947, + "num_input_tokens_seen": 35001512, + "step": 60295 + }, + { + "epoch": 8.981233243967829, + "grad_norm": 0.0289306640625, + "learning_rate": 0.020176808797184573, + "loss": 0.8168, + "num_input_tokens_seen": 35004680, + "step": 60300 + }, + { + "epoch": 8.981977956508787, + "grad_norm": 0.0380859375, + "learning_rate": 0.02017497889047301, + "loss": 0.7879, + "num_input_tokens_seen": 35007560, + "step": 60305 + }, + { + "epoch": 8.982722669049746, + "grad_norm": 0.0308837890625, + "learning_rate": 0.020173148896335445, + "loss": 0.8083, + "num_input_tokens_seen": 35010344, + "step": 60310 + }, + { + "epoch": 8.983467381590707, + "grad_norm": 0.0181884765625, + "learning_rate": 0.020171318814802785, + "loss": 0.7915, + "num_input_tokens_seen": 35013160, + "step": 60315 + }, + { + "epoch": 8.984212094131665, + "grad_norm": 0.0194091796875, + "learning_rate": 0.02016948864590595, + "loss": 0.8169, + "num_input_tokens_seen": 35016008, + "step": 60320 + }, + { + "epoch": 8.984956806672624, + "grad_norm": 0.0301513671875, + "learning_rate": 0.020167658389675856, + "loss": 0.7969, + "num_input_tokens_seen": 35019208, + "step": 60325 + }, + { + "epoch": 8.985701519213583, + "grad_norm": 0.02734375, + "learning_rate": 0.02016582804614343, + "loss": 0.8128, + "num_input_tokens_seen": 35021928, + "step": 60330 + }, + { + "epoch": 8.986446231754543, + "grad_norm": 0.0322265625, + "learning_rate": 0.020163997615339584, + "loss": 0.8117, + "num_input_tokens_seen": 35024904, + "step": 60335 + }, + { + "epoch": 8.987190944295502, + "grad_norm": 0.026611328125, + "learning_rate": 0.02016216709729525, + "loss": 0.8195, + "num_input_tokens_seen": 35027848, + "step": 60340 + }, + { + "epoch": 8.987935656836461, + "grad_norm": 0.03466796875, + "learning_rate": 0.02016033649204135, + "loss": 0.8014, + "num_input_tokens_seen": 35030856, + "step": 60345 + }, + { + "epoch": 8.98868036937742, + "grad_norm": 0.0240478515625, + "learning_rate": 0.020158505799608806, + "loss": 0.7912, + "num_input_tokens_seen": 35033800, + "step": 60350 + }, + { + "epoch": 8.98942508191838, + "grad_norm": 0.0308837890625, + "learning_rate": 0.020156675020028554, + "loss": 0.8042, + "num_input_tokens_seen": 35036712, + "step": 60355 + }, + { + "epoch": 8.990169794459339, + "grad_norm": 0.0155029296875, + "learning_rate": 0.020154844153331516, + "loss": 0.8192, + "num_input_tokens_seen": 35039336, + "step": 60360 + }, + { + "epoch": 8.990914507000298, + "grad_norm": 0.022705078125, + "learning_rate": 0.020153013199548625, + "loss": 0.8071, + "num_input_tokens_seen": 35042152, + "step": 60365 + }, + { + "epoch": 8.991659219541257, + "grad_norm": 0.0238037109375, + "learning_rate": 0.020151182158710812, + "loss": 0.7946, + "num_input_tokens_seen": 35044936, + "step": 60370 + }, + { + "epoch": 8.992403932082215, + "grad_norm": 0.021484375, + "learning_rate": 0.020149351030849017, + "loss": 0.7939, + "num_input_tokens_seen": 35047816, + "step": 60375 + }, + { + "epoch": 8.993148644623176, + "grad_norm": 0.0267333984375, + "learning_rate": 0.020147519815994164, + "loss": 0.82, + "num_input_tokens_seen": 35050856, + "step": 60380 + }, + { + "epoch": 8.993893357164135, + "grad_norm": 0.037353515625, + "learning_rate": 0.020145688514177196, + "loss": 0.7908, + "num_input_tokens_seen": 35053672, + "step": 60385 + }, + { + "epoch": 8.994638069705093, + "grad_norm": 0.022705078125, + "learning_rate": 0.020143857125429057, + "loss": 0.7995, + "num_input_tokens_seen": 35056360, + "step": 60390 + }, + { + "epoch": 8.995382782246054, + "grad_norm": 0.0299072265625, + "learning_rate": 0.020142025649780675, + "loss": 0.7864, + "num_input_tokens_seen": 35059336, + "step": 60395 + }, + { + "epoch": 8.996127494787013, + "grad_norm": 0.02197265625, + "learning_rate": 0.020140194087262996, + "loss": 0.7866, + "num_input_tokens_seen": 35062120, + "step": 60400 + }, + { + "epoch": 8.996872207327971, + "grad_norm": 0.0225830078125, + "learning_rate": 0.020138362437906964, + "loss": 0.7781, + "num_input_tokens_seen": 35064904, + "step": 60405 + }, + { + "epoch": 8.99761691986893, + "grad_norm": 0.02734375, + "learning_rate": 0.020136530701743517, + "loss": 0.7967, + "num_input_tokens_seen": 35068200, + "step": 60410 + }, + { + "epoch": 8.998361632409889, + "grad_norm": 0.0250244140625, + "learning_rate": 0.020134698878803606, + "loss": 0.8183, + "num_input_tokens_seen": 35071304, + "step": 60415 + }, + { + "epoch": 8.99910634495085, + "grad_norm": 0.0260009765625, + "learning_rate": 0.020132866969118177, + "loss": 0.7962, + "num_input_tokens_seen": 35074088, + "step": 60420 + }, + { + "epoch": 8.999851057491808, + "grad_norm": 0.023681640625, + "learning_rate": 0.02013103497271818, + "loss": 0.8132, + "num_input_tokens_seen": 35076936, + "step": 60425 + }, + { + "epoch": 9.0, + "eval_loss": 0.8010132908821106, + "eval_runtime": 70.6792, + "eval_samples_per_second": 42.219, + "eval_steps_per_second": 10.555, + "num_input_tokens_seen": 35077032, + "step": 60426 + }, + { + "epoch": 9.000595770032767, + "grad_norm": 0.025390625, + "learning_rate": 0.020129202889634556, + "loss": 0.7883, + "num_input_tokens_seen": 35079464, + "step": 60430 + }, + { + "epoch": 9.001340482573726, + "grad_norm": 0.012939453125, + "learning_rate": 0.020127370719898268, + "loss": 0.7973, + "num_input_tokens_seen": 35082408, + "step": 60435 + }, + { + "epoch": 9.002085195114686, + "grad_norm": 0.0308837890625, + "learning_rate": 0.020125538463540257, + "loss": 0.804, + "num_input_tokens_seen": 35085608, + "step": 60440 + }, + { + "epoch": 9.002829907655645, + "grad_norm": 0.032470703125, + "learning_rate": 0.02012370612059149, + "loss": 0.7779, + "num_input_tokens_seen": 35088904, + "step": 60445 + }, + { + "epoch": 9.003574620196604, + "grad_norm": 0.0269775390625, + "learning_rate": 0.020121873691082908, + "loss": 0.7915, + "num_input_tokens_seen": 35091816, + "step": 60450 + }, + { + "epoch": 9.004319332737563, + "grad_norm": 0.01434326171875, + "learning_rate": 0.02012004117504548, + "loss": 0.7941, + "num_input_tokens_seen": 35094760, + "step": 60455 + }, + { + "epoch": 9.005064045278523, + "grad_norm": 0.03515625, + "learning_rate": 0.020118208572510166, + "loss": 0.7911, + "num_input_tokens_seen": 35097512, + "step": 60460 + }, + { + "epoch": 9.005808757819482, + "grad_norm": 0.02783203125, + "learning_rate": 0.02011637588350791, + "loss": 0.8061, + "num_input_tokens_seen": 35100456, + "step": 60465 + }, + { + "epoch": 9.00655347036044, + "grad_norm": 0.036376953125, + "learning_rate": 0.020114543108069687, + "loss": 0.8059, + "num_input_tokens_seen": 35103304, + "step": 60470 + }, + { + "epoch": 9.0072981829014, + "grad_norm": 0.03857421875, + "learning_rate": 0.020112710246226455, + "loss": 0.8099, + "num_input_tokens_seen": 35106216, + "step": 60475 + }, + { + "epoch": 9.00804289544236, + "grad_norm": 0.02001953125, + "learning_rate": 0.020110877298009183, + "loss": 0.8109, + "num_input_tokens_seen": 35109384, + "step": 60480 + }, + { + "epoch": 9.008787607983319, + "grad_norm": 0.024658203125, + "learning_rate": 0.02010904426344883, + "loss": 0.778, + "num_input_tokens_seen": 35112328, + "step": 60485 + }, + { + "epoch": 9.009532320524277, + "grad_norm": 0.0234375, + "learning_rate": 0.02010721114257637, + "loss": 0.799, + "num_input_tokens_seen": 35115176, + "step": 60490 + }, + { + "epoch": 9.010277033065236, + "grad_norm": 0.0198974609375, + "learning_rate": 0.02010537793542277, + "loss": 0.792, + "num_input_tokens_seen": 35118088, + "step": 60495 + }, + { + "epoch": 9.011021745606197, + "grad_norm": 0.038330078125, + "learning_rate": 0.020103544642018994, + "loss": 0.7882, + "num_input_tokens_seen": 35120904, + "step": 60500 + }, + { + "epoch": 9.011766458147155, + "grad_norm": 0.029541015625, + "learning_rate": 0.02010171126239602, + "loss": 0.78, + "num_input_tokens_seen": 35123592, + "step": 60505 + }, + { + "epoch": 9.012511170688114, + "grad_norm": 0.019287109375, + "learning_rate": 0.02009987779658482, + "loss": 0.8028, + "num_input_tokens_seen": 35126408, + "step": 60510 + }, + { + "epoch": 9.013255883229073, + "grad_norm": 0.02392578125, + "learning_rate": 0.020098044244616364, + "loss": 0.8017, + "num_input_tokens_seen": 35129512, + "step": 60515 + }, + { + "epoch": 9.014000595770034, + "grad_norm": 0.0206298828125, + "learning_rate": 0.020096210606521638, + "loss": 0.7709, + "num_input_tokens_seen": 35132360, + "step": 60520 + }, + { + "epoch": 9.014745308310992, + "grad_norm": 0.03076171875, + "learning_rate": 0.020094376882331615, + "loss": 0.8211, + "num_input_tokens_seen": 35135272, + "step": 60525 + }, + { + "epoch": 9.015490020851951, + "grad_norm": 0.0205078125, + "learning_rate": 0.020092543072077265, + "loss": 0.8051, + "num_input_tokens_seen": 35138248, + "step": 60530 + }, + { + "epoch": 9.01623473339291, + "grad_norm": 0.0289306640625, + "learning_rate": 0.02009070917578958, + "loss": 0.7777, + "num_input_tokens_seen": 35140936, + "step": 60535 + }, + { + "epoch": 9.01697944593387, + "grad_norm": 0.029296875, + "learning_rate": 0.020088875193499535, + "loss": 0.8249, + "num_input_tokens_seen": 35143752, + "step": 60540 + }, + { + "epoch": 9.017724158474829, + "grad_norm": 0.029296875, + "learning_rate": 0.020087041125238115, + "loss": 0.7949, + "num_input_tokens_seen": 35146536, + "step": 60545 + }, + { + "epoch": 9.018468871015788, + "grad_norm": 0.0218505859375, + "learning_rate": 0.020085206971036308, + "loss": 0.7812, + "num_input_tokens_seen": 35149800, + "step": 60550 + }, + { + "epoch": 9.019213583556747, + "grad_norm": 0.018310546875, + "learning_rate": 0.020083372730925097, + "loss": 0.8142, + "num_input_tokens_seen": 35152872, + "step": 60555 + }, + { + "epoch": 9.019958296097707, + "grad_norm": 0.019775390625, + "learning_rate": 0.020081538404935467, + "loss": 0.7756, + "num_input_tokens_seen": 35155752, + "step": 60560 + }, + { + "epoch": 9.020703008638666, + "grad_norm": 0.0286865234375, + "learning_rate": 0.02007970399309842, + "loss": 0.8221, + "num_input_tokens_seen": 35158792, + "step": 60565 + }, + { + "epoch": 9.021447721179625, + "grad_norm": 0.0242919921875, + "learning_rate": 0.020077869495444926, + "loss": 0.7895, + "num_input_tokens_seen": 35161832, + "step": 60570 + }, + { + "epoch": 9.022192433720583, + "grad_norm": 0.0242919921875, + "learning_rate": 0.02007603491200599, + "loss": 0.7758, + "num_input_tokens_seen": 35164616, + "step": 60575 + }, + { + "epoch": 9.022937146261542, + "grad_norm": 0.0252685546875, + "learning_rate": 0.020074200242812604, + "loss": 0.7953, + "num_input_tokens_seen": 35167304, + "step": 60580 + }, + { + "epoch": 9.023681858802503, + "grad_norm": 0.03125, + "learning_rate": 0.020072365487895764, + "loss": 0.7821, + "num_input_tokens_seen": 35170312, + "step": 60585 + }, + { + "epoch": 9.024426571343461, + "grad_norm": 0.021728515625, + "learning_rate": 0.020070530647286467, + "loss": 0.7882, + "num_input_tokens_seen": 35173288, + "step": 60590 + }, + { + "epoch": 9.02517128388442, + "grad_norm": 0.0283203125, + "learning_rate": 0.020068695721015706, + "loss": 0.8193, + "num_input_tokens_seen": 35176424, + "step": 60595 + }, + { + "epoch": 9.025915996425379, + "grad_norm": 0.030029296875, + "learning_rate": 0.020066860709114478, + "loss": 0.826, + "num_input_tokens_seen": 35179656, + "step": 60600 + }, + { + "epoch": 9.02666070896634, + "grad_norm": 0.0247802734375, + "learning_rate": 0.020065025611613795, + "loss": 0.7762, + "num_input_tokens_seen": 35182568, + "step": 60605 + }, + { + "epoch": 9.027405421507298, + "grad_norm": 0.0218505859375, + "learning_rate": 0.02006319042854465, + "loss": 0.8104, + "num_input_tokens_seen": 35185768, + "step": 60610 + }, + { + "epoch": 9.028150134048257, + "grad_norm": 0.0306396484375, + "learning_rate": 0.02006135515993804, + "loss": 0.798, + "num_input_tokens_seen": 35188456, + "step": 60615 + }, + { + "epoch": 9.028894846589216, + "grad_norm": 0.041015625, + "learning_rate": 0.020059519805824988, + "loss": 0.8186, + "num_input_tokens_seen": 35191720, + "step": 60620 + }, + { + "epoch": 9.029639559130176, + "grad_norm": 0.0235595703125, + "learning_rate": 0.02005768436623649, + "loss": 0.8091, + "num_input_tokens_seen": 35194664, + "step": 60625 + }, + { + "epoch": 9.030384271671135, + "grad_norm": 0.029541015625, + "learning_rate": 0.020055848841203556, + "loss": 0.803, + "num_input_tokens_seen": 35197576, + "step": 60630 + }, + { + "epoch": 9.031128984212094, + "grad_norm": 0.033203125, + "learning_rate": 0.02005401323075719, + "loss": 0.8121, + "num_input_tokens_seen": 35200552, + "step": 60635 + }, + { + "epoch": 9.031873696753053, + "grad_norm": 0.031005859375, + "learning_rate": 0.020052177534928408, + "loss": 0.815, + "num_input_tokens_seen": 35203432, + "step": 60640 + }, + { + "epoch": 9.032618409294013, + "grad_norm": 0.02587890625, + "learning_rate": 0.020050341753748223, + "loss": 0.817, + "num_input_tokens_seen": 35206088, + "step": 60645 + }, + { + "epoch": 9.033363121834972, + "grad_norm": 0.022216796875, + "learning_rate": 0.020048505887247648, + "loss": 0.8041, + "num_input_tokens_seen": 35208968, + "step": 60650 + }, + { + "epoch": 9.03410783437593, + "grad_norm": 0.0262451171875, + "learning_rate": 0.020046669935457697, + "loss": 0.818, + "num_input_tokens_seen": 35211496, + "step": 60655 + }, + { + "epoch": 9.03485254691689, + "grad_norm": 0.016357421875, + "learning_rate": 0.020044833898409383, + "loss": 0.7812, + "num_input_tokens_seen": 35214184, + "step": 60660 + }, + { + "epoch": 9.03559725945785, + "grad_norm": 0.01470947265625, + "learning_rate": 0.020042997776133734, + "loss": 0.7901, + "num_input_tokens_seen": 35216968, + "step": 60665 + }, + { + "epoch": 9.036341971998809, + "grad_norm": 0.0260009765625, + "learning_rate": 0.020041161568661755, + "loss": 0.8049, + "num_input_tokens_seen": 35219592, + "step": 60670 + }, + { + "epoch": 9.037086684539767, + "grad_norm": 0.039794921875, + "learning_rate": 0.020039325276024478, + "loss": 0.8173, + "num_input_tokens_seen": 35222568, + "step": 60675 + }, + { + "epoch": 9.037831397080726, + "grad_norm": 0.024169921875, + "learning_rate": 0.020037488898252926, + "loss": 0.8119, + "num_input_tokens_seen": 35225384, + "step": 60680 + }, + { + "epoch": 9.038576109621687, + "grad_norm": 0.0205078125, + "learning_rate": 0.020035652435378117, + "loss": 0.8074, + "num_input_tokens_seen": 35228200, + "step": 60685 + }, + { + "epoch": 9.039320822162646, + "grad_norm": 0.0189208984375, + "learning_rate": 0.02003381588743108, + "loss": 0.8071, + "num_input_tokens_seen": 35231016, + "step": 60690 + }, + { + "epoch": 9.040065534703604, + "grad_norm": 0.034423828125, + "learning_rate": 0.020031979254442837, + "loss": 0.7914, + "num_input_tokens_seen": 35233896, + "step": 60695 + }, + { + "epoch": 9.040810247244563, + "grad_norm": 0.029541015625, + "learning_rate": 0.020030142536444414, + "loss": 0.8156, + "num_input_tokens_seen": 35236552, + "step": 60700 + }, + { + "epoch": 9.041554959785524, + "grad_norm": 0.0303955078125, + "learning_rate": 0.020028305733466853, + "loss": 0.8108, + "num_input_tokens_seen": 35239304, + "step": 60705 + }, + { + "epoch": 9.042299672326482, + "grad_norm": 0.0247802734375, + "learning_rate": 0.020026468845541174, + "loss": 0.7958, + "num_input_tokens_seen": 35241864, + "step": 60710 + }, + { + "epoch": 9.043044384867441, + "grad_norm": 0.0252685546875, + "learning_rate": 0.02002463187269841, + "loss": 0.8193, + "num_input_tokens_seen": 35244744, + "step": 60715 + }, + { + "epoch": 9.0437890974084, + "grad_norm": 0.037841796875, + "learning_rate": 0.020022794814969602, + "loss": 0.8037, + "num_input_tokens_seen": 35247656, + "step": 60720 + }, + { + "epoch": 9.04453380994936, + "grad_norm": 0.0155029296875, + "learning_rate": 0.020020957672385778, + "loss": 0.788, + "num_input_tokens_seen": 35250408, + "step": 60725 + }, + { + "epoch": 9.04527852249032, + "grad_norm": 0.0242919921875, + "learning_rate": 0.02001912044497798, + "loss": 0.7965, + "num_input_tokens_seen": 35253192, + "step": 60730 + }, + { + "epoch": 9.046023235031278, + "grad_norm": 0.0224609375, + "learning_rate": 0.020017283132777244, + "loss": 0.8028, + "num_input_tokens_seen": 35256136, + "step": 60735 + }, + { + "epoch": 9.046767947572237, + "grad_norm": 0.025146484375, + "learning_rate": 0.0200154457358146, + "loss": 0.7951, + "num_input_tokens_seen": 35259112, + "step": 60740 + }, + { + "epoch": 9.047512660113195, + "grad_norm": 0.033447265625, + "learning_rate": 0.020013608254121105, + "loss": 0.8034, + "num_input_tokens_seen": 35262184, + "step": 60745 + }, + { + "epoch": 9.048257372654156, + "grad_norm": 0.0341796875, + "learning_rate": 0.020011770687727793, + "loss": 0.8355, + "num_input_tokens_seen": 35264968, + "step": 60750 + }, + { + "epoch": 9.049002085195115, + "grad_norm": 0.02587890625, + "learning_rate": 0.02000993303666571, + "loss": 0.7989, + "num_input_tokens_seen": 35267976, + "step": 60755 + }, + { + "epoch": 9.049746797736073, + "grad_norm": 0.031494140625, + "learning_rate": 0.020008095300965898, + "loss": 0.7946, + "num_input_tokens_seen": 35271400, + "step": 60760 + }, + { + "epoch": 9.050491510277032, + "grad_norm": 0.02294921875, + "learning_rate": 0.020006257480659408, + "loss": 0.8043, + "num_input_tokens_seen": 35274312, + "step": 60765 + }, + { + "epoch": 9.051236222817993, + "grad_norm": 0.040771484375, + "learning_rate": 0.020004419575777286, + "loss": 0.8188, + "num_input_tokens_seen": 35277096, + "step": 60770 + }, + { + "epoch": 9.051980935358952, + "grad_norm": 0.0245361328125, + "learning_rate": 0.02000258158635058, + "loss": 0.8089, + "num_input_tokens_seen": 35280072, + "step": 60775 + }, + { + "epoch": 9.05272564789991, + "grad_norm": 0.0233154296875, + "learning_rate": 0.02000074351241034, + "loss": 0.8037, + "num_input_tokens_seen": 35282920, + "step": 60780 + }, + { + "epoch": 9.053470360440869, + "grad_norm": 0.02197265625, + "learning_rate": 0.019998905353987625, + "loss": 0.7914, + "num_input_tokens_seen": 35285864, + "step": 60785 + }, + { + "epoch": 9.05421507298183, + "grad_norm": 0.0225830078125, + "learning_rate": 0.019997067111113483, + "loss": 0.7996, + "num_input_tokens_seen": 35288552, + "step": 60790 + }, + { + "epoch": 9.054959785522788, + "grad_norm": 0.0235595703125, + "learning_rate": 0.019995228783818972, + "loss": 0.7865, + "num_input_tokens_seen": 35291176, + "step": 60795 + }, + { + "epoch": 9.055704498063747, + "grad_norm": 0.0234375, + "learning_rate": 0.019993390372135145, + "loss": 0.8349, + "num_input_tokens_seen": 35294120, + "step": 60800 + }, + { + "epoch": 9.056449210604706, + "grad_norm": 0.0218505859375, + "learning_rate": 0.019991551876093066, + "loss": 0.8014, + "num_input_tokens_seen": 35297128, + "step": 60805 + }, + { + "epoch": 9.057193923145666, + "grad_norm": 0.0213623046875, + "learning_rate": 0.019989713295723792, + "loss": 0.8096, + "num_input_tokens_seen": 35300104, + "step": 60810 + }, + { + "epoch": 9.057938635686625, + "grad_norm": 0.030517578125, + "learning_rate": 0.019987874631058382, + "loss": 0.8217, + "num_input_tokens_seen": 35303048, + "step": 60815 + }, + { + "epoch": 9.058683348227584, + "grad_norm": 0.033203125, + "learning_rate": 0.019986035882127898, + "loss": 0.7914, + "num_input_tokens_seen": 35306216, + "step": 60820 + }, + { + "epoch": 9.059428060768543, + "grad_norm": 0.0181884765625, + "learning_rate": 0.01998419704896341, + "loss": 0.811, + "num_input_tokens_seen": 35308904, + "step": 60825 + }, + { + "epoch": 9.060172773309503, + "grad_norm": 0.03173828125, + "learning_rate": 0.019982358131595972, + "loss": 0.7971, + "num_input_tokens_seen": 35311880, + "step": 60830 + }, + { + "epoch": 9.060917485850462, + "grad_norm": 0.0230712890625, + "learning_rate": 0.01998051913005666, + "loss": 0.8008, + "num_input_tokens_seen": 35314984, + "step": 60835 + }, + { + "epoch": 9.06166219839142, + "grad_norm": 0.0213623046875, + "learning_rate": 0.01997868004437654, + "loss": 0.7849, + "num_input_tokens_seen": 35317832, + "step": 60840 + }, + { + "epoch": 9.06240691093238, + "grad_norm": 0.0224609375, + "learning_rate": 0.019976840874586677, + "loss": 0.8059, + "num_input_tokens_seen": 35320776, + "step": 60845 + }, + { + "epoch": 9.06315162347334, + "grad_norm": 0.03173828125, + "learning_rate": 0.01997500162071815, + "loss": 0.8168, + "num_input_tokens_seen": 35323592, + "step": 60850 + }, + { + "epoch": 9.063896336014299, + "grad_norm": 0.0172119140625, + "learning_rate": 0.019973162282802027, + "loss": 0.7908, + "num_input_tokens_seen": 35326856, + "step": 60855 + }, + { + "epoch": 9.064641048555258, + "grad_norm": 0.0234375, + "learning_rate": 0.019971322860869382, + "loss": 0.8126, + "num_input_tokens_seen": 35329544, + "step": 60860 + }, + { + "epoch": 9.065385761096216, + "grad_norm": 0.0279541015625, + "learning_rate": 0.01996948335495129, + "loss": 0.8021, + "num_input_tokens_seen": 35333000, + "step": 60865 + }, + { + "epoch": 9.066130473637177, + "grad_norm": 0.0167236328125, + "learning_rate": 0.019967643765078823, + "loss": 0.7884, + "num_input_tokens_seen": 35335816, + "step": 60870 + }, + { + "epoch": 9.066875186178136, + "grad_norm": 0.037109375, + "learning_rate": 0.019965804091283065, + "loss": 0.8148, + "num_input_tokens_seen": 35339112, + "step": 60875 + }, + { + "epoch": 9.067619898719094, + "grad_norm": 0.0186767578125, + "learning_rate": 0.019963964333595093, + "loss": 0.8085, + "num_input_tokens_seen": 35342344, + "step": 60880 + }, + { + "epoch": 9.068364611260053, + "grad_norm": 0.02783203125, + "learning_rate": 0.019962124492045995, + "loss": 0.8074, + "num_input_tokens_seen": 35345352, + "step": 60885 + }, + { + "epoch": 9.069109323801014, + "grad_norm": 0.0224609375, + "learning_rate": 0.019960284566666844, + "loss": 0.7851, + "num_input_tokens_seen": 35348232, + "step": 60890 + }, + { + "epoch": 9.069854036341972, + "grad_norm": 0.0322265625, + "learning_rate": 0.019958444557488724, + "loss": 0.8094, + "num_input_tokens_seen": 35351400, + "step": 60895 + }, + { + "epoch": 9.070598748882931, + "grad_norm": 0.02734375, + "learning_rate": 0.019956604464542727, + "loss": 0.8075, + "num_input_tokens_seen": 35354184, + "step": 60900 + }, + { + "epoch": 9.07134346142389, + "grad_norm": 0.031494140625, + "learning_rate": 0.019954764287859927, + "loss": 0.7935, + "num_input_tokens_seen": 35357192, + "step": 60905 + }, + { + "epoch": 9.07208817396485, + "grad_norm": 0.0216064453125, + "learning_rate": 0.019952924027471425, + "loss": 0.7864, + "num_input_tokens_seen": 35360072, + "step": 60910 + }, + { + "epoch": 9.07283288650581, + "grad_norm": 0.034912109375, + "learning_rate": 0.019951083683408307, + "loss": 0.7849, + "num_input_tokens_seen": 35362728, + "step": 60915 + }, + { + "epoch": 9.073577599046768, + "grad_norm": 0.04443359375, + "learning_rate": 0.019949243255701663, + "loss": 0.8052, + "num_input_tokens_seen": 35365512, + "step": 60920 + }, + { + "epoch": 9.074322311587727, + "grad_norm": 0.0185546875, + "learning_rate": 0.019947402744382583, + "loss": 0.7972, + "num_input_tokens_seen": 35368296, + "step": 60925 + }, + { + "epoch": 9.075067024128685, + "grad_norm": 0.0361328125, + "learning_rate": 0.019945562149482166, + "loss": 0.8055, + "num_input_tokens_seen": 35371496, + "step": 60930 + }, + { + "epoch": 9.075811736669646, + "grad_norm": 0.0216064453125, + "learning_rate": 0.019943721471031496, + "loss": 0.7887, + "num_input_tokens_seen": 35374184, + "step": 60935 + }, + { + "epoch": 9.076556449210605, + "grad_norm": 0.035400390625, + "learning_rate": 0.019941880709061676, + "loss": 0.7998, + "num_input_tokens_seen": 35377064, + "step": 60940 + }, + { + "epoch": 9.077301161751564, + "grad_norm": 0.036865234375, + "learning_rate": 0.019940039863603806, + "loss": 0.8286, + "num_input_tokens_seen": 35380040, + "step": 60945 + }, + { + "epoch": 9.078045874292522, + "grad_norm": 0.0201416015625, + "learning_rate": 0.019938198934688983, + "loss": 0.8042, + "num_input_tokens_seen": 35382920, + "step": 60950 + }, + { + "epoch": 9.078790586833483, + "grad_norm": 0.0225830078125, + "learning_rate": 0.019936357922348314, + "loss": 0.8022, + "num_input_tokens_seen": 35385672, + "step": 60955 + }, + { + "epoch": 9.079535299374442, + "grad_norm": 0.0181884765625, + "learning_rate": 0.01993451682661289, + "loss": 0.8011, + "num_input_tokens_seen": 35388712, + "step": 60960 + }, + { + "epoch": 9.0802800119154, + "grad_norm": 0.036376953125, + "learning_rate": 0.01993267564751382, + "loss": 0.8178, + "num_input_tokens_seen": 35391912, + "step": 60965 + }, + { + "epoch": 9.081024724456359, + "grad_norm": 0.0157470703125, + "learning_rate": 0.01993083438508221, + "loss": 0.802, + "num_input_tokens_seen": 35394600, + "step": 60970 + }, + { + "epoch": 9.08176943699732, + "grad_norm": 0.021240234375, + "learning_rate": 0.019928993039349162, + "loss": 0.811, + "num_input_tokens_seen": 35397352, + "step": 60975 + }, + { + "epoch": 9.082514149538278, + "grad_norm": 0.0277099609375, + "learning_rate": 0.019927151610345785, + "loss": 0.8032, + "num_input_tokens_seen": 35399912, + "step": 60980 + }, + { + "epoch": 9.083258862079237, + "grad_norm": 0.03662109375, + "learning_rate": 0.01992531009810319, + "loss": 0.8097, + "num_input_tokens_seen": 35402600, + "step": 60985 + }, + { + "epoch": 9.084003574620196, + "grad_norm": 0.0322265625, + "learning_rate": 0.01992346850265249, + "loss": 0.7956, + "num_input_tokens_seen": 35405512, + "step": 60990 + }, + { + "epoch": 9.084748287161156, + "grad_norm": 0.041748046875, + "learning_rate": 0.019921626824024787, + "loss": 0.8042, + "num_input_tokens_seen": 35408200, + "step": 60995 + }, + { + "epoch": 9.085492999702115, + "grad_norm": 0.03173828125, + "learning_rate": 0.019919785062251207, + "loss": 0.81, + "num_input_tokens_seen": 35411272, + "step": 61000 + }, + { + "epoch": 9.086237712243074, + "grad_norm": 0.0177001953125, + "learning_rate": 0.019917943217362857, + "loss": 0.8011, + "num_input_tokens_seen": 35414152, + "step": 61005 + }, + { + "epoch": 9.086982424784033, + "grad_norm": 0.024658203125, + "learning_rate": 0.019916101289390857, + "loss": 0.7945, + "num_input_tokens_seen": 35416776, + "step": 61010 + }, + { + "epoch": 9.087727137324993, + "grad_norm": 0.041015625, + "learning_rate": 0.019914259278366316, + "loss": 0.7999, + "num_input_tokens_seen": 35419624, + "step": 61015 + }, + { + "epoch": 9.088471849865952, + "grad_norm": 0.038818359375, + "learning_rate": 0.019912417184320366, + "loss": 0.8113, + "num_input_tokens_seen": 35422760, + "step": 61020 + }, + { + "epoch": 9.08921656240691, + "grad_norm": 0.0224609375, + "learning_rate": 0.019910575007284118, + "loss": 0.8017, + "num_input_tokens_seen": 35425736, + "step": 61025 + }, + { + "epoch": 9.08996127494787, + "grad_norm": 0.0250244140625, + "learning_rate": 0.019908732747288698, + "loss": 0.7989, + "num_input_tokens_seen": 35428552, + "step": 61030 + }, + { + "epoch": 9.09070598748883, + "grad_norm": 0.0286865234375, + "learning_rate": 0.019906890404365222, + "loss": 0.7841, + "num_input_tokens_seen": 35431496, + "step": 61035 + }, + { + "epoch": 9.091450700029789, + "grad_norm": 0.027099609375, + "learning_rate": 0.019905047978544824, + "loss": 0.8095, + "num_input_tokens_seen": 35434248, + "step": 61040 + }, + { + "epoch": 9.092195412570748, + "grad_norm": 0.0245361328125, + "learning_rate": 0.01990320546985862, + "loss": 0.8155, + "num_input_tokens_seen": 35437032, + "step": 61045 + }, + { + "epoch": 9.092940125111706, + "grad_norm": 0.0234375, + "learning_rate": 0.019901362878337752, + "loss": 0.7998, + "num_input_tokens_seen": 35439976, + "step": 61050 + }, + { + "epoch": 9.093684837652667, + "grad_norm": 0.033935546875, + "learning_rate": 0.019899520204013334, + "loss": 0.7793, + "num_input_tokens_seen": 35442824, + "step": 61055 + }, + { + "epoch": 9.094429550193626, + "grad_norm": 0.022705078125, + "learning_rate": 0.019897677446916503, + "loss": 0.8039, + "num_input_tokens_seen": 35445736, + "step": 61060 + }, + { + "epoch": 9.095174262734584, + "grad_norm": 0.0224609375, + "learning_rate": 0.01989583460707839, + "loss": 0.7938, + "num_input_tokens_seen": 35448520, + "step": 61065 + }, + { + "epoch": 9.095918975275543, + "grad_norm": 0.028564453125, + "learning_rate": 0.019893991684530123, + "loss": 0.8051, + "num_input_tokens_seen": 35451432, + "step": 61070 + }, + { + "epoch": 9.096663687816504, + "grad_norm": 0.040283203125, + "learning_rate": 0.01989214867930284, + "loss": 0.8127, + "num_input_tokens_seen": 35454504, + "step": 61075 + }, + { + "epoch": 9.097408400357462, + "grad_norm": 0.040771484375, + "learning_rate": 0.01989030559142768, + "loss": 0.8087, + "num_input_tokens_seen": 35457640, + "step": 61080 + }, + { + "epoch": 9.098153112898421, + "grad_norm": 0.041015625, + "learning_rate": 0.019888462420935782, + "loss": 0.8038, + "num_input_tokens_seen": 35460872, + "step": 61085 + }, + { + "epoch": 9.09889782543938, + "grad_norm": 0.0181884765625, + "learning_rate": 0.01988661916785828, + "loss": 0.7933, + "num_input_tokens_seen": 35463688, + "step": 61090 + }, + { + "epoch": 9.099642537980339, + "grad_norm": 0.03271484375, + "learning_rate": 0.01988477583222631, + "loss": 0.7852, + "num_input_tokens_seen": 35466440, + "step": 61095 + }, + { + "epoch": 9.1003872505213, + "grad_norm": 0.0361328125, + "learning_rate": 0.019882932414071016, + "loss": 0.8338, + "num_input_tokens_seen": 35469448, + "step": 61100 + }, + { + "epoch": 9.101131963062258, + "grad_norm": 0.033203125, + "learning_rate": 0.019881088913423543, + "loss": 0.8003, + "num_input_tokens_seen": 35472552, + "step": 61105 + }, + { + "epoch": 9.101876675603217, + "grad_norm": 0.028564453125, + "learning_rate": 0.019879245330315033, + "loss": 0.8161, + "num_input_tokens_seen": 35475944, + "step": 61110 + }, + { + "epoch": 9.102621388144176, + "grad_norm": 0.0361328125, + "learning_rate": 0.019877401664776632, + "loss": 0.8119, + "num_input_tokens_seen": 35479336, + "step": 61115 + }, + { + "epoch": 9.103366100685136, + "grad_norm": 0.043701171875, + "learning_rate": 0.01987555791683949, + "loss": 0.8004, + "num_input_tokens_seen": 35482088, + "step": 61120 + }, + { + "epoch": 9.104110813226095, + "grad_norm": 0.03125, + "learning_rate": 0.019873714086534745, + "loss": 0.8077, + "num_input_tokens_seen": 35485000, + "step": 61125 + }, + { + "epoch": 9.104855525767054, + "grad_norm": 0.0233154296875, + "learning_rate": 0.019871870173893563, + "loss": 0.8032, + "num_input_tokens_seen": 35487944, + "step": 61130 + }, + { + "epoch": 9.105600238308012, + "grad_norm": 0.032470703125, + "learning_rate": 0.01987002617894708, + "loss": 0.7903, + "num_input_tokens_seen": 35490760, + "step": 61135 + }, + { + "epoch": 9.106344950848973, + "grad_norm": 0.030029296875, + "learning_rate": 0.019868182101726455, + "loss": 0.789, + "num_input_tokens_seen": 35493768, + "step": 61140 + }, + { + "epoch": 9.107089663389932, + "grad_norm": 0.034912109375, + "learning_rate": 0.019866337942262842, + "loss": 0.7993, + "num_input_tokens_seen": 35496584, + "step": 61145 + }, + { + "epoch": 9.10783437593089, + "grad_norm": 0.034912109375, + "learning_rate": 0.019864493700587394, + "loss": 0.7829, + "num_input_tokens_seen": 35499144, + "step": 61150 + }, + { + "epoch": 9.10857908847185, + "grad_norm": 0.031494140625, + "learning_rate": 0.019862649376731274, + "loss": 0.8127, + "num_input_tokens_seen": 35501864, + "step": 61155 + }, + { + "epoch": 9.10932380101281, + "grad_norm": 0.0301513671875, + "learning_rate": 0.01986080497072563, + "loss": 0.8004, + "num_input_tokens_seen": 35504808, + "step": 61160 + }, + { + "epoch": 9.110068513553768, + "grad_norm": 0.0289306640625, + "learning_rate": 0.019858960482601627, + "loss": 0.7889, + "num_input_tokens_seen": 35507816, + "step": 61165 + }, + { + "epoch": 9.110813226094727, + "grad_norm": 0.036376953125, + "learning_rate": 0.01985711591239043, + "loss": 0.7985, + "num_input_tokens_seen": 35510824, + "step": 61170 + }, + { + "epoch": 9.111557938635686, + "grad_norm": 0.058837890625, + "learning_rate": 0.01985527126012319, + "loss": 0.7963, + "num_input_tokens_seen": 35513800, + "step": 61175 + }, + { + "epoch": 9.112302651176647, + "grad_norm": 0.0185546875, + "learning_rate": 0.019853426525831077, + "loss": 0.7986, + "num_input_tokens_seen": 35516520, + "step": 61180 + }, + { + "epoch": 9.113047363717605, + "grad_norm": 0.03173828125, + "learning_rate": 0.01985158170954526, + "loss": 0.813, + "num_input_tokens_seen": 35519336, + "step": 61185 + }, + { + "epoch": 9.113792076258564, + "grad_norm": 0.028076171875, + "learning_rate": 0.0198497368112969, + "loss": 0.7891, + "num_input_tokens_seen": 35522216, + "step": 61190 + }, + { + "epoch": 9.114536788799523, + "grad_norm": 0.0311279296875, + "learning_rate": 0.01984789183111716, + "loss": 0.8084, + "num_input_tokens_seen": 35524904, + "step": 61195 + }, + { + "epoch": 9.115281501340483, + "grad_norm": 0.0279541015625, + "learning_rate": 0.019846046769037222, + "loss": 0.8089, + "num_input_tokens_seen": 35527720, + "step": 61200 + }, + { + "epoch": 9.116026213881442, + "grad_norm": 0.0260009765625, + "learning_rate": 0.019844201625088244, + "loss": 0.797, + "num_input_tokens_seen": 35530856, + "step": 61205 + }, + { + "epoch": 9.1167709264224, + "grad_norm": 0.0498046875, + "learning_rate": 0.019842356399301406, + "loss": 0.8123, + "num_input_tokens_seen": 35533832, + "step": 61210 + }, + { + "epoch": 9.11751563896336, + "grad_norm": 0.041015625, + "learning_rate": 0.01984051109170788, + "loss": 0.8397, + "num_input_tokens_seen": 35536552, + "step": 61215 + }, + { + "epoch": 9.11826035150432, + "grad_norm": 0.0279541015625, + "learning_rate": 0.019838665702338835, + "loss": 0.7967, + "num_input_tokens_seen": 35539336, + "step": 61220 + }, + { + "epoch": 9.119005064045279, + "grad_norm": 0.02294921875, + "learning_rate": 0.01983682023122545, + "loss": 0.8167, + "num_input_tokens_seen": 35542184, + "step": 61225 + }, + { + "epoch": 9.119749776586238, + "grad_norm": 0.028076171875, + "learning_rate": 0.019834974678398905, + "loss": 0.8066, + "num_input_tokens_seen": 35545032, + "step": 61230 + }, + { + "epoch": 9.120494489127196, + "grad_norm": 0.03271484375, + "learning_rate": 0.019833129043890375, + "loss": 0.7882, + "num_input_tokens_seen": 35548008, + "step": 61235 + }, + { + "epoch": 9.121239201668157, + "grad_norm": 0.023681640625, + "learning_rate": 0.019831283327731042, + "loss": 0.7867, + "num_input_tokens_seen": 35550728, + "step": 61240 + }, + { + "epoch": 9.121983914209116, + "grad_norm": 0.0302734375, + "learning_rate": 0.019829437529952088, + "loss": 0.7989, + "num_input_tokens_seen": 35553544, + "step": 61245 + }, + { + "epoch": 9.122728626750074, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0198275916505847, + "loss": 0.8055, + "num_input_tokens_seen": 35556712, + "step": 61250 + }, + { + "epoch": 9.123473339291033, + "grad_norm": 0.0245361328125, + "learning_rate": 0.019825745689660057, + "loss": 0.7952, + "num_input_tokens_seen": 35559432, + "step": 61255 + }, + { + "epoch": 9.124218051831992, + "grad_norm": 0.034912109375, + "learning_rate": 0.01982389964720934, + "loss": 0.8028, + "num_input_tokens_seen": 35562248, + "step": 61260 + }, + { + "epoch": 9.124962764372953, + "grad_norm": 0.0203857421875, + "learning_rate": 0.01982205352326374, + "loss": 0.8009, + "num_input_tokens_seen": 35565256, + "step": 61265 + }, + { + "epoch": 9.125707476913911, + "grad_norm": 0.0211181640625, + "learning_rate": 0.01982020731785445, + "loss": 0.782, + "num_input_tokens_seen": 35568200, + "step": 61270 + }, + { + "epoch": 9.12645218945487, + "grad_norm": 0.0159912109375, + "learning_rate": 0.01981836103101266, + "loss": 0.7952, + "num_input_tokens_seen": 35570984, + "step": 61275 + }, + { + "epoch": 9.127196901995829, + "grad_norm": 0.0225830078125, + "learning_rate": 0.01981651466276955, + "loss": 0.8182, + "num_input_tokens_seen": 35573992, + "step": 61280 + }, + { + "epoch": 9.12794161453679, + "grad_norm": 0.022705078125, + "learning_rate": 0.01981466821315633, + "loss": 0.7993, + "num_input_tokens_seen": 35576872, + "step": 61285 + }, + { + "epoch": 9.128686327077748, + "grad_norm": 0.02734375, + "learning_rate": 0.01981282168220418, + "loss": 0.7842, + "num_input_tokens_seen": 35579944, + "step": 61290 + }, + { + "epoch": 9.129431039618707, + "grad_norm": 0.0267333984375, + "learning_rate": 0.019810975069944295, + "loss": 0.7821, + "num_input_tokens_seen": 35582568, + "step": 61295 + }, + { + "epoch": 9.130175752159666, + "grad_norm": 0.0255126953125, + "learning_rate": 0.019809128376407876, + "loss": 0.7777, + "num_input_tokens_seen": 35585544, + "step": 61300 + }, + { + "epoch": 9.130920464700626, + "grad_norm": 0.0220947265625, + "learning_rate": 0.019807281601626123, + "loss": 0.8177, + "num_input_tokens_seen": 35588648, + "step": 61305 + }, + { + "epoch": 9.131665177241585, + "grad_norm": 0.0234375, + "learning_rate": 0.01980543474563023, + "loss": 0.8109, + "num_input_tokens_seen": 35591688, + "step": 61310 + }, + { + "epoch": 9.132409889782544, + "grad_norm": 0.0299072265625, + "learning_rate": 0.019803587808451407, + "loss": 0.8109, + "num_input_tokens_seen": 35594760, + "step": 61315 + }, + { + "epoch": 9.133154602323502, + "grad_norm": 0.0211181640625, + "learning_rate": 0.01980174079012085, + "loss": 0.7845, + "num_input_tokens_seen": 35597576, + "step": 61320 + }, + { + "epoch": 9.133899314864463, + "grad_norm": 0.036376953125, + "learning_rate": 0.019799893690669756, + "loss": 0.8088, + "num_input_tokens_seen": 35600392, + "step": 61325 + }, + { + "epoch": 9.134644027405422, + "grad_norm": 0.0291748046875, + "learning_rate": 0.01979804651012934, + "loss": 0.8136, + "num_input_tokens_seen": 35603432, + "step": 61330 + }, + { + "epoch": 9.13538873994638, + "grad_norm": 0.0277099609375, + "learning_rate": 0.01979619924853081, + "loss": 0.8136, + "num_input_tokens_seen": 35606120, + "step": 61335 + }, + { + "epoch": 9.13613345248734, + "grad_norm": 0.03271484375, + "learning_rate": 0.019794351905905363, + "loss": 0.7746, + "num_input_tokens_seen": 35609256, + "step": 61340 + }, + { + "epoch": 9.1368781650283, + "grad_norm": 0.0211181640625, + "learning_rate": 0.01979250448228421, + "loss": 0.7907, + "num_input_tokens_seen": 35612360, + "step": 61345 + }, + { + "epoch": 9.137622877569259, + "grad_norm": 0.0250244140625, + "learning_rate": 0.019790656977698575, + "loss": 0.821, + "num_input_tokens_seen": 35615304, + "step": 61350 + }, + { + "epoch": 9.138367590110217, + "grad_norm": 0.041748046875, + "learning_rate": 0.01978880939217965, + "loss": 0.7934, + "num_input_tokens_seen": 35618376, + "step": 61355 + }, + { + "epoch": 9.139112302651176, + "grad_norm": 0.0284423828125, + "learning_rate": 0.019786961725758664, + "loss": 0.7653, + "num_input_tokens_seen": 35621256, + "step": 61360 + }, + { + "epoch": 9.139857015192137, + "grad_norm": 0.02685546875, + "learning_rate": 0.019785113978466818, + "loss": 0.7963, + "num_input_tokens_seen": 35624040, + "step": 61365 + }, + { + "epoch": 9.140601727733095, + "grad_norm": 0.0250244140625, + "learning_rate": 0.019783266150335335, + "loss": 0.7957, + "num_input_tokens_seen": 35626856, + "step": 61370 + }, + { + "epoch": 9.141346440274054, + "grad_norm": 0.0301513671875, + "learning_rate": 0.01978141824139544, + "loss": 0.817, + "num_input_tokens_seen": 35630152, + "step": 61375 + }, + { + "epoch": 9.142091152815013, + "grad_norm": 0.020751953125, + "learning_rate": 0.019779570251678338, + "loss": 0.7741, + "num_input_tokens_seen": 35633128, + "step": 61380 + }, + { + "epoch": 9.142835865355973, + "grad_norm": 0.0238037109375, + "learning_rate": 0.019777722181215256, + "loss": 0.8076, + "num_input_tokens_seen": 35636360, + "step": 61385 + }, + { + "epoch": 9.143580577896932, + "grad_norm": 0.015869140625, + "learning_rate": 0.01977587403003741, + "loss": 0.8083, + "num_input_tokens_seen": 35639144, + "step": 61390 + }, + { + "epoch": 9.14432529043789, + "grad_norm": 0.0303955078125, + "learning_rate": 0.01977402579817603, + "loss": 0.8022, + "num_input_tokens_seen": 35641928, + "step": 61395 + }, + { + "epoch": 9.14507000297885, + "grad_norm": 0.023193359375, + "learning_rate": 0.01977217748566233, + "loss": 0.8172, + "num_input_tokens_seen": 35644680, + "step": 61400 + }, + { + "epoch": 9.14581471551981, + "grad_norm": 0.0208740234375, + "learning_rate": 0.019770329092527544, + "loss": 0.7945, + "num_input_tokens_seen": 35647720, + "step": 61405 + }, + { + "epoch": 9.146559428060769, + "grad_norm": 0.0223388671875, + "learning_rate": 0.0197684806188029, + "loss": 0.7905, + "num_input_tokens_seen": 35650472, + "step": 61410 + }, + { + "epoch": 9.147304140601728, + "grad_norm": 0.0225830078125, + "learning_rate": 0.019766632064519625, + "loss": 0.7916, + "num_input_tokens_seen": 35653160, + "step": 61415 + }, + { + "epoch": 9.148048853142686, + "grad_norm": 0.0166015625, + "learning_rate": 0.019764783429708943, + "loss": 0.7817, + "num_input_tokens_seen": 35656456, + "step": 61420 + }, + { + "epoch": 9.148793565683647, + "grad_norm": 0.022705078125, + "learning_rate": 0.019762934714402085, + "loss": 0.7955, + "num_input_tokens_seen": 35659240, + "step": 61425 + }, + { + "epoch": 9.149538278224606, + "grad_norm": 0.0196533203125, + "learning_rate": 0.019761085918630288, + "loss": 0.7838, + "num_input_tokens_seen": 35662056, + "step": 61430 + }, + { + "epoch": 9.150282990765565, + "grad_norm": 0.030029296875, + "learning_rate": 0.01975923704242478, + "loss": 0.8079, + "num_input_tokens_seen": 35664968, + "step": 61435 + }, + { + "epoch": 9.151027703306523, + "grad_norm": 0.02587890625, + "learning_rate": 0.019757388085816806, + "loss": 0.7745, + "num_input_tokens_seen": 35667624, + "step": 61440 + }, + { + "epoch": 9.151772415847482, + "grad_norm": 0.02490234375, + "learning_rate": 0.01975553904883759, + "loss": 0.7861, + "num_input_tokens_seen": 35670504, + "step": 61445 + }, + { + "epoch": 9.152517128388443, + "grad_norm": 0.01953125, + "learning_rate": 0.019753689931518374, + "loss": 0.8043, + "num_input_tokens_seen": 35673448, + "step": 61450 + }, + { + "epoch": 9.153261840929401, + "grad_norm": 0.032470703125, + "learning_rate": 0.019751840733890406, + "loss": 0.7926, + "num_input_tokens_seen": 35676360, + "step": 61455 + }, + { + "epoch": 9.15400655347036, + "grad_norm": 0.0225830078125, + "learning_rate": 0.01974999145598491, + "loss": 0.802, + "num_input_tokens_seen": 35679304, + "step": 61460 + }, + { + "epoch": 9.154751266011319, + "grad_norm": 0.032958984375, + "learning_rate": 0.01974814209783314, + "loss": 0.7824, + "num_input_tokens_seen": 35682440, + "step": 61465 + }, + { + "epoch": 9.15549597855228, + "grad_norm": 0.044189453125, + "learning_rate": 0.01974629265946633, + "loss": 0.8042, + "num_input_tokens_seen": 35685256, + "step": 61470 + }, + { + "epoch": 9.156240691093238, + "grad_norm": 0.02880859375, + "learning_rate": 0.019744443140915734, + "loss": 0.8305, + "num_input_tokens_seen": 35688264, + "step": 61475 + }, + { + "epoch": 9.156985403634197, + "grad_norm": 0.030029296875, + "learning_rate": 0.019742593542212594, + "loss": 0.8101, + "num_input_tokens_seen": 35691272, + "step": 61480 + }, + { + "epoch": 9.157730116175156, + "grad_norm": 0.036376953125, + "learning_rate": 0.019740743863388158, + "loss": 0.8288, + "num_input_tokens_seen": 35694344, + "step": 61485 + }, + { + "epoch": 9.158474828716116, + "grad_norm": 0.0272216796875, + "learning_rate": 0.019738894104473666, + "loss": 0.7793, + "num_input_tokens_seen": 35697384, + "step": 61490 + }, + { + "epoch": 9.159219541257075, + "grad_norm": 0.0291748046875, + "learning_rate": 0.01973704426550038, + "loss": 0.8119, + "num_input_tokens_seen": 35700296, + "step": 61495 + }, + { + "epoch": 9.159964253798034, + "grad_norm": 0.0260009765625, + "learning_rate": 0.019735194346499545, + "loss": 0.8068, + "num_input_tokens_seen": 35702920, + "step": 61500 + }, + { + "epoch": 9.160708966338992, + "grad_norm": 0.018798828125, + "learning_rate": 0.01973334434750241, + "loss": 0.7814, + "num_input_tokens_seen": 35705672, + "step": 61505 + }, + { + "epoch": 9.161453678879953, + "grad_norm": 0.033447265625, + "learning_rate": 0.019731494268540234, + "loss": 0.8173, + "num_input_tokens_seen": 35708744, + "step": 61510 + }, + { + "epoch": 9.162198391420912, + "grad_norm": 0.03515625, + "learning_rate": 0.01972964410964428, + "loss": 0.7806, + "num_input_tokens_seen": 35711752, + "step": 61515 + }, + { + "epoch": 9.16294310396187, + "grad_norm": 0.034912109375, + "learning_rate": 0.019727793870845785, + "loss": 0.7936, + "num_input_tokens_seen": 35714696, + "step": 61520 + }, + { + "epoch": 9.16368781650283, + "grad_norm": 0.045654296875, + "learning_rate": 0.019725943552176023, + "loss": 0.8381, + "num_input_tokens_seen": 35717576, + "step": 61525 + }, + { + "epoch": 9.16443252904379, + "grad_norm": 0.0208740234375, + "learning_rate": 0.019724093153666244, + "loss": 0.8144, + "num_input_tokens_seen": 35720616, + "step": 61530 + }, + { + "epoch": 9.165177241584749, + "grad_norm": 0.02587890625, + "learning_rate": 0.019722242675347716, + "loss": 0.8123, + "num_input_tokens_seen": 35723368, + "step": 61535 + }, + { + "epoch": 9.165921954125707, + "grad_norm": 0.0162353515625, + "learning_rate": 0.019720392117251698, + "loss": 0.8002, + "num_input_tokens_seen": 35726120, + "step": 61540 + }, + { + "epoch": 9.166666666666666, + "grad_norm": 0.052001953125, + "learning_rate": 0.01971854147940945, + "loss": 0.8093, + "num_input_tokens_seen": 35728872, + "step": 61545 + }, + { + "epoch": 9.167411379207627, + "grad_norm": 0.04296875, + "learning_rate": 0.019716690761852243, + "loss": 0.8124, + "num_input_tokens_seen": 35731656, + "step": 61550 + }, + { + "epoch": 9.168156091748585, + "grad_norm": 0.0301513671875, + "learning_rate": 0.019714839964611337, + "loss": 0.8002, + "num_input_tokens_seen": 35734504, + "step": 61555 + }, + { + "epoch": 9.168900804289544, + "grad_norm": 0.053466796875, + "learning_rate": 0.019712989087718003, + "loss": 0.8026, + "num_input_tokens_seen": 35737320, + "step": 61560 + }, + { + "epoch": 9.169645516830503, + "grad_norm": 0.02783203125, + "learning_rate": 0.019711138131203502, + "loss": 0.799, + "num_input_tokens_seen": 35740392, + "step": 61565 + }, + { + "epoch": 9.170390229371463, + "grad_norm": 0.034912109375, + "learning_rate": 0.019709287095099115, + "loss": 0.8032, + "num_input_tokens_seen": 35743272, + "step": 61570 + }, + { + "epoch": 9.171134941912422, + "grad_norm": 0.034423828125, + "learning_rate": 0.019707435979436114, + "loss": 0.8, + "num_input_tokens_seen": 35746216, + "step": 61575 + }, + { + "epoch": 9.171879654453381, + "grad_norm": 0.0302734375, + "learning_rate": 0.01970558478424576, + "loss": 0.8046, + "num_input_tokens_seen": 35749032, + "step": 61580 + }, + { + "epoch": 9.17262436699434, + "grad_norm": 0.041748046875, + "learning_rate": 0.01970373350955934, + "loss": 0.8051, + "num_input_tokens_seen": 35752200, + "step": 61585 + }, + { + "epoch": 9.1733690795353, + "grad_norm": 0.03515625, + "learning_rate": 0.019701882155408117, + "loss": 0.8026, + "num_input_tokens_seen": 35754984, + "step": 61590 + }, + { + "epoch": 9.174113792076259, + "grad_norm": 0.026123046875, + "learning_rate": 0.019700030721823372, + "loss": 0.8013, + "num_input_tokens_seen": 35757768, + "step": 61595 + }, + { + "epoch": 9.174858504617218, + "grad_norm": 0.033447265625, + "learning_rate": 0.01969817920883639, + "loss": 0.8036, + "num_input_tokens_seen": 35760360, + "step": 61600 + }, + { + "epoch": 9.175603217158177, + "grad_norm": 0.0478515625, + "learning_rate": 0.01969632761647844, + "loss": 0.7962, + "num_input_tokens_seen": 35762824, + "step": 61605 + }, + { + "epoch": 9.176347929699135, + "grad_norm": 0.03857421875, + "learning_rate": 0.019694475944780812, + "loss": 0.8074, + "num_input_tokens_seen": 35765736, + "step": 61610 + }, + { + "epoch": 9.177092642240096, + "grad_norm": 0.0322265625, + "learning_rate": 0.019692624193774787, + "loss": 0.7936, + "num_input_tokens_seen": 35768968, + "step": 61615 + }, + { + "epoch": 9.177837354781055, + "grad_norm": 0.0272216796875, + "learning_rate": 0.019690772363491645, + "loss": 0.8014, + "num_input_tokens_seen": 35771880, + "step": 61620 + }, + { + "epoch": 9.178582067322013, + "grad_norm": 0.038330078125, + "learning_rate": 0.019688920453962668, + "loss": 0.8103, + "num_input_tokens_seen": 35774760, + "step": 61625 + }, + { + "epoch": 9.179326779862972, + "grad_norm": 0.0306396484375, + "learning_rate": 0.019687068465219146, + "loss": 0.8038, + "num_input_tokens_seen": 35777960, + "step": 61630 + }, + { + "epoch": 9.180071492403933, + "grad_norm": 0.01953125, + "learning_rate": 0.019685216397292364, + "loss": 0.8045, + "num_input_tokens_seen": 35780936, + "step": 61635 + }, + { + "epoch": 9.180816204944891, + "grad_norm": 0.037109375, + "learning_rate": 0.019683364250213616, + "loss": 0.7929, + "num_input_tokens_seen": 35783688, + "step": 61640 + }, + { + "epoch": 9.18156091748585, + "grad_norm": 0.0234375, + "learning_rate": 0.019681512024014188, + "loss": 0.8092, + "num_input_tokens_seen": 35786632, + "step": 61645 + }, + { + "epoch": 9.182305630026809, + "grad_norm": 0.0303955078125, + "learning_rate": 0.019679659718725373, + "loss": 0.8254, + "num_input_tokens_seen": 35789640, + "step": 61650 + }, + { + "epoch": 9.18305034256777, + "grad_norm": 0.017578125, + "learning_rate": 0.019677807334378468, + "loss": 0.7956, + "num_input_tokens_seen": 35792392, + "step": 61655 + }, + { + "epoch": 9.183795055108728, + "grad_norm": 0.0257568359375, + "learning_rate": 0.019675954871004757, + "loss": 0.795, + "num_input_tokens_seen": 35795496, + "step": 61660 + }, + { + "epoch": 9.184539767649687, + "grad_norm": 0.0260009765625, + "learning_rate": 0.019674102328635534, + "loss": 0.8099, + "num_input_tokens_seen": 35798472, + "step": 61665 + }, + { + "epoch": 9.185284480190646, + "grad_norm": 0.037109375, + "learning_rate": 0.019672249707302112, + "loss": 0.7846, + "num_input_tokens_seen": 35801160, + "step": 61670 + }, + { + "epoch": 9.186029192731606, + "grad_norm": 0.0263671875, + "learning_rate": 0.019670397007035777, + "loss": 0.8074, + "num_input_tokens_seen": 35804552, + "step": 61675 + }, + { + "epoch": 9.186773905272565, + "grad_norm": 0.021484375, + "learning_rate": 0.019668544227867835, + "loss": 0.8105, + "num_input_tokens_seen": 35807688, + "step": 61680 + }, + { + "epoch": 9.187518617813524, + "grad_norm": 0.041748046875, + "learning_rate": 0.019666691369829577, + "loss": 0.8087, + "num_input_tokens_seen": 35810568, + "step": 61685 + }, + { + "epoch": 9.188263330354483, + "grad_norm": 0.035400390625, + "learning_rate": 0.019664838432952317, + "loss": 0.7839, + "num_input_tokens_seen": 35813640, + "step": 61690 + }, + { + "epoch": 9.189008042895443, + "grad_norm": 0.0263671875, + "learning_rate": 0.01966298541726735, + "loss": 0.7883, + "num_input_tokens_seen": 35816424, + "step": 61695 + }, + { + "epoch": 9.189752755436402, + "grad_norm": 0.034423828125, + "learning_rate": 0.019661132322805985, + "loss": 0.7883, + "num_input_tokens_seen": 35819208, + "step": 61700 + }, + { + "epoch": 9.19049746797736, + "grad_norm": 0.020263671875, + "learning_rate": 0.019659279149599525, + "loss": 0.8077, + "num_input_tokens_seen": 35822120, + "step": 61705 + }, + { + "epoch": 9.19124218051832, + "grad_norm": 0.036865234375, + "learning_rate": 0.01965742589767928, + "loss": 0.8215, + "num_input_tokens_seen": 35825320, + "step": 61710 + }, + { + "epoch": 9.19198689305928, + "grad_norm": 0.03564453125, + "learning_rate": 0.01965557256707656, + "loss": 0.8124, + "num_input_tokens_seen": 35828232, + "step": 61715 + }, + { + "epoch": 9.192731605600239, + "grad_norm": 0.033935546875, + "learning_rate": 0.019653719157822668, + "loss": 0.8002, + "num_input_tokens_seen": 35830856, + "step": 61720 + }, + { + "epoch": 9.193476318141197, + "grad_norm": 0.025390625, + "learning_rate": 0.019651865669948924, + "loss": 0.7794, + "num_input_tokens_seen": 35833672, + "step": 61725 + }, + { + "epoch": 9.194221030682156, + "grad_norm": 0.03759765625, + "learning_rate": 0.019650012103486637, + "loss": 0.7801, + "num_input_tokens_seen": 35836776, + "step": 61730 + }, + { + "epoch": 9.194965743223117, + "grad_norm": 0.027587890625, + "learning_rate": 0.019648158458467124, + "loss": 0.8123, + "num_input_tokens_seen": 35839592, + "step": 61735 + }, + { + "epoch": 9.195710455764075, + "grad_norm": 0.03076171875, + "learning_rate": 0.019646304734921696, + "loss": 0.8141, + "num_input_tokens_seen": 35842280, + "step": 61740 + }, + { + "epoch": 9.196455168305034, + "grad_norm": 0.040283203125, + "learning_rate": 0.01964445093288167, + "loss": 0.7972, + "num_input_tokens_seen": 35845288, + "step": 61745 + }, + { + "epoch": 9.197199880845993, + "grad_norm": 0.02978515625, + "learning_rate": 0.01964259705237837, + "loss": 0.7974, + "num_input_tokens_seen": 35848008, + "step": 61750 + }, + { + "epoch": 9.197944593386953, + "grad_norm": 0.046630859375, + "learning_rate": 0.019640743093443106, + "loss": 0.7987, + "num_input_tokens_seen": 35850952, + "step": 61755 + }, + { + "epoch": 9.198689305927912, + "grad_norm": 0.0206298828125, + "learning_rate": 0.01963888905610721, + "loss": 0.7987, + "num_input_tokens_seen": 35853736, + "step": 61760 + }, + { + "epoch": 9.199434018468871, + "grad_norm": 0.037353515625, + "learning_rate": 0.019637034940401994, + "loss": 0.8314, + "num_input_tokens_seen": 35856872, + "step": 61765 + }, + { + "epoch": 9.20017873100983, + "grad_norm": 0.0242919921875, + "learning_rate": 0.019635180746358786, + "loss": 0.8019, + "num_input_tokens_seen": 35859880, + "step": 61770 + }, + { + "epoch": 9.200923443550789, + "grad_norm": 0.0224609375, + "learning_rate": 0.01963332647400891, + "loss": 0.8099, + "num_input_tokens_seen": 35862792, + "step": 61775 + }, + { + "epoch": 9.201668156091749, + "grad_norm": 0.022705078125, + "learning_rate": 0.019631472123383693, + "loss": 0.8031, + "num_input_tokens_seen": 35865992, + "step": 61780 + }, + { + "epoch": 9.202412868632708, + "grad_norm": 0.0289306640625, + "learning_rate": 0.019629617694514465, + "loss": 0.7935, + "num_input_tokens_seen": 35869032, + "step": 61785 + }, + { + "epoch": 9.203157581173667, + "grad_norm": 0.0299072265625, + "learning_rate": 0.019627763187432548, + "loss": 0.8159, + "num_input_tokens_seen": 35872104, + "step": 61790 + }, + { + "epoch": 9.203902293714625, + "grad_norm": 0.0263671875, + "learning_rate": 0.019625908602169277, + "loss": 0.7916, + "num_input_tokens_seen": 35874856, + "step": 61795 + }, + { + "epoch": 9.204647006255586, + "grad_norm": 0.0286865234375, + "learning_rate": 0.019624053938755978, + "loss": 0.8094, + "num_input_tokens_seen": 35877896, + "step": 61800 + }, + { + "epoch": 9.205391718796545, + "grad_norm": 0.033447265625, + "learning_rate": 0.01962219919722399, + "loss": 0.8101, + "num_input_tokens_seen": 35880936, + "step": 61805 + }, + { + "epoch": 9.206136431337503, + "grad_norm": 0.025146484375, + "learning_rate": 0.019620344377604643, + "loss": 0.8024, + "num_input_tokens_seen": 35883688, + "step": 61810 + }, + { + "epoch": 9.206881143878462, + "grad_norm": 0.0262451171875, + "learning_rate": 0.019618489479929276, + "loss": 0.8016, + "num_input_tokens_seen": 35886472, + "step": 61815 + }, + { + "epoch": 9.207625856419423, + "grad_norm": 0.0299072265625, + "learning_rate": 0.019616634504229225, + "loss": 0.799, + "num_input_tokens_seen": 35889288, + "step": 61820 + }, + { + "epoch": 9.208370568960381, + "grad_norm": 0.0228271484375, + "learning_rate": 0.019614779450535823, + "loss": 0.7819, + "num_input_tokens_seen": 35892168, + "step": 61825 + }, + { + "epoch": 9.20911528150134, + "grad_norm": 0.0260009765625, + "learning_rate": 0.019612924318880412, + "loss": 0.8344, + "num_input_tokens_seen": 35895208, + "step": 61830 + }, + { + "epoch": 9.209859994042299, + "grad_norm": 0.024658203125, + "learning_rate": 0.019611069109294332, + "loss": 0.7934, + "num_input_tokens_seen": 35897864, + "step": 61835 + }, + { + "epoch": 9.21060470658326, + "grad_norm": 0.0206298828125, + "learning_rate": 0.019609213821808927, + "loss": 0.7936, + "num_input_tokens_seen": 35900712, + "step": 61840 + }, + { + "epoch": 9.211349419124218, + "grad_norm": 0.021484375, + "learning_rate": 0.019607358456455542, + "loss": 0.7954, + "num_input_tokens_seen": 35903272, + "step": 61845 + }, + { + "epoch": 9.212094131665177, + "grad_norm": 0.03173828125, + "learning_rate": 0.019605503013265517, + "loss": 0.806, + "num_input_tokens_seen": 35906056, + "step": 61850 + }, + { + "epoch": 9.212838844206136, + "grad_norm": 0.03271484375, + "learning_rate": 0.0196036474922702, + "loss": 0.8176, + "num_input_tokens_seen": 35908968, + "step": 61855 + }, + { + "epoch": 9.213583556747096, + "grad_norm": 0.033447265625, + "learning_rate": 0.019601791893500937, + "loss": 0.7855, + "num_input_tokens_seen": 35911784, + "step": 61860 + }, + { + "epoch": 9.214328269288055, + "grad_norm": 0.0162353515625, + "learning_rate": 0.019599936216989074, + "loss": 0.8224, + "num_input_tokens_seen": 35914568, + "step": 61865 + }, + { + "epoch": 9.215072981829014, + "grad_norm": 0.039306640625, + "learning_rate": 0.019598080462765965, + "loss": 0.8126, + "num_input_tokens_seen": 35917512, + "step": 61870 + }, + { + "epoch": 9.215817694369973, + "grad_norm": 0.0162353515625, + "learning_rate": 0.01959622463086296, + "loss": 0.7921, + "num_input_tokens_seen": 35920200, + "step": 61875 + }, + { + "epoch": 9.216562406910933, + "grad_norm": 0.0263671875, + "learning_rate": 0.019594368721311418, + "loss": 0.8336, + "num_input_tokens_seen": 35923368, + "step": 61880 + }, + { + "epoch": 9.217307119451892, + "grad_norm": 0.0220947265625, + "learning_rate": 0.019592512734142677, + "loss": 0.7898, + "num_input_tokens_seen": 35926120, + "step": 61885 + }, + { + "epoch": 9.21805183199285, + "grad_norm": 0.03173828125, + "learning_rate": 0.019590656669388102, + "loss": 0.7898, + "num_input_tokens_seen": 35929160, + "step": 61890 + }, + { + "epoch": 9.21879654453381, + "grad_norm": 0.0274658203125, + "learning_rate": 0.019588800527079054, + "loss": 0.7934, + "num_input_tokens_seen": 35931848, + "step": 61895 + }, + { + "epoch": 9.21954125707477, + "grad_norm": 0.0206298828125, + "learning_rate": 0.019586944307246883, + "loss": 0.806, + "num_input_tokens_seen": 35934792, + "step": 61900 + }, + { + "epoch": 9.220285969615729, + "grad_norm": 0.0272216796875, + "learning_rate": 0.019585088009922945, + "loss": 0.7994, + "num_input_tokens_seen": 35937896, + "step": 61905 + }, + { + "epoch": 9.221030682156687, + "grad_norm": 0.0262451171875, + "learning_rate": 0.019583231635138608, + "loss": 0.7874, + "num_input_tokens_seen": 35940776, + "step": 61910 + }, + { + "epoch": 9.221775394697646, + "grad_norm": 0.024169921875, + "learning_rate": 0.019581375182925232, + "loss": 0.8013, + "num_input_tokens_seen": 35943592, + "step": 61915 + }, + { + "epoch": 9.222520107238607, + "grad_norm": 0.02978515625, + "learning_rate": 0.019579518653314178, + "loss": 0.7769, + "num_input_tokens_seen": 35946280, + "step": 61920 + }, + { + "epoch": 9.223264819779565, + "grad_norm": 0.022705078125, + "learning_rate": 0.019577662046336812, + "loss": 0.8345, + "num_input_tokens_seen": 35949096, + "step": 61925 + }, + { + "epoch": 9.224009532320524, + "grad_norm": 0.0240478515625, + "learning_rate": 0.019575805362024492, + "loss": 0.7758, + "num_input_tokens_seen": 35952040, + "step": 61930 + }, + { + "epoch": 9.224754244861483, + "grad_norm": 0.034423828125, + "learning_rate": 0.019573948600408595, + "loss": 0.786, + "num_input_tokens_seen": 35954760, + "step": 61935 + }, + { + "epoch": 9.225498957402444, + "grad_norm": 0.033935546875, + "learning_rate": 0.019572091761520492, + "loss": 0.7888, + "num_input_tokens_seen": 35957864, + "step": 61940 + }, + { + "epoch": 9.226243669943402, + "grad_norm": 0.0230712890625, + "learning_rate": 0.019570234845391537, + "loss": 0.8095, + "num_input_tokens_seen": 35960712, + "step": 61945 + }, + { + "epoch": 9.226988382484361, + "grad_norm": 0.03271484375, + "learning_rate": 0.019568377852053117, + "loss": 0.8003, + "num_input_tokens_seen": 35963656, + "step": 61950 + }, + { + "epoch": 9.22773309502532, + "grad_norm": 0.02490234375, + "learning_rate": 0.019566520781536586, + "loss": 0.784, + "num_input_tokens_seen": 35966472, + "step": 61955 + }, + { + "epoch": 9.228477807566279, + "grad_norm": 0.037353515625, + "learning_rate": 0.019564663633873332, + "loss": 0.797, + "num_input_tokens_seen": 35969384, + "step": 61960 + }, + { + "epoch": 9.229222520107239, + "grad_norm": 0.029052734375, + "learning_rate": 0.019562806409094725, + "loss": 0.8061, + "num_input_tokens_seen": 35972520, + "step": 61965 + }, + { + "epoch": 9.229967232648198, + "grad_norm": 0.0264892578125, + "learning_rate": 0.019560949107232144, + "loss": 0.8254, + "num_input_tokens_seen": 35975880, + "step": 61970 + }, + { + "epoch": 9.230711945189157, + "grad_norm": 0.033447265625, + "learning_rate": 0.01955909172831696, + "loss": 0.7936, + "num_input_tokens_seen": 35978632, + "step": 61975 + }, + { + "epoch": 9.231456657730115, + "grad_norm": 0.01409912109375, + "learning_rate": 0.01955723427238056, + "loss": 0.8052, + "num_input_tokens_seen": 35981416, + "step": 61980 + }, + { + "epoch": 9.232201370271076, + "grad_norm": 0.031494140625, + "learning_rate": 0.019555376739454313, + "loss": 0.7726, + "num_input_tokens_seen": 35984328, + "step": 61985 + }, + { + "epoch": 9.232946082812035, + "grad_norm": 0.01708984375, + "learning_rate": 0.01955351912956961, + "loss": 0.8194, + "num_input_tokens_seen": 35987016, + "step": 61990 + }, + { + "epoch": 9.233690795352993, + "grad_norm": 0.0274658203125, + "learning_rate": 0.019551661442757825, + "loss": 0.8286, + "num_input_tokens_seen": 35989800, + "step": 61995 + }, + { + "epoch": 9.234435507893952, + "grad_norm": 0.043701171875, + "learning_rate": 0.019549803679050348, + "loss": 0.8034, + "num_input_tokens_seen": 35992712, + "step": 62000 + }, + { + "epoch": 9.235180220434913, + "grad_norm": 0.02197265625, + "learning_rate": 0.01954794583847856, + "loss": 0.7827, + "num_input_tokens_seen": 35995816, + "step": 62005 + }, + { + "epoch": 9.235924932975871, + "grad_norm": 0.02587890625, + "learning_rate": 0.019546087921073853, + "loss": 0.8069, + "num_input_tokens_seen": 35998600, + "step": 62010 + }, + { + "epoch": 9.23666964551683, + "grad_norm": 0.0225830078125, + "learning_rate": 0.01954422992686761, + "loss": 0.838, + "num_input_tokens_seen": 36001352, + "step": 62015 + }, + { + "epoch": 9.237414358057789, + "grad_norm": 0.02783203125, + "learning_rate": 0.019542371855891225, + "loss": 0.812, + "num_input_tokens_seen": 36004456, + "step": 62020 + }, + { + "epoch": 9.23815907059875, + "grad_norm": 0.021484375, + "learning_rate": 0.019540513708176076, + "loss": 0.799, + "num_input_tokens_seen": 36007240, + "step": 62025 + }, + { + "epoch": 9.238903783139708, + "grad_norm": 0.031494140625, + "learning_rate": 0.019538655483753567, + "loss": 0.788, + "num_input_tokens_seen": 36009992, + "step": 62030 + }, + { + "epoch": 9.239648495680667, + "grad_norm": 0.021728515625, + "learning_rate": 0.019536797182655084, + "loss": 0.8282, + "num_input_tokens_seen": 36013000, + "step": 62035 + }, + { + "epoch": 9.240393208221626, + "grad_norm": 0.02294921875, + "learning_rate": 0.019534938804912024, + "loss": 0.831, + "num_input_tokens_seen": 36016200, + "step": 62040 + }, + { + "epoch": 9.241137920762586, + "grad_norm": 0.0262451171875, + "learning_rate": 0.019533080350555787, + "loss": 0.8077, + "num_input_tokens_seen": 36018920, + "step": 62045 + }, + { + "epoch": 9.241882633303545, + "grad_norm": 0.0255126953125, + "learning_rate": 0.01953122181961776, + "loss": 0.8293, + "num_input_tokens_seen": 36021576, + "step": 62050 + }, + { + "epoch": 9.242627345844504, + "grad_norm": 0.0267333984375, + "learning_rate": 0.019529363212129346, + "loss": 0.8149, + "num_input_tokens_seen": 36024776, + "step": 62055 + }, + { + "epoch": 9.243372058385463, + "grad_norm": 0.01556396484375, + "learning_rate": 0.019527504528121947, + "loss": 0.799, + "num_input_tokens_seen": 36027432, + "step": 62060 + }, + { + "epoch": 9.244116770926423, + "grad_norm": 0.0234375, + "learning_rate": 0.01952564576762696, + "loss": 0.8013, + "num_input_tokens_seen": 36030344, + "step": 62065 + }, + { + "epoch": 9.244861483467382, + "grad_norm": 0.03271484375, + "learning_rate": 0.019523786930675784, + "loss": 0.8067, + "num_input_tokens_seen": 36033672, + "step": 62070 + }, + { + "epoch": 9.24560619600834, + "grad_norm": 0.0242919921875, + "learning_rate": 0.019521928017299826, + "loss": 0.7882, + "num_input_tokens_seen": 36036616, + "step": 62075 + }, + { + "epoch": 9.2463509085493, + "grad_norm": 0.035400390625, + "learning_rate": 0.019520069027530493, + "loss": 0.7959, + "num_input_tokens_seen": 36039368, + "step": 62080 + }, + { + "epoch": 9.24709562109026, + "grad_norm": 0.030517578125, + "learning_rate": 0.019518209961399188, + "loss": 0.8021, + "num_input_tokens_seen": 36042152, + "step": 62085 + }, + { + "epoch": 9.247840333631219, + "grad_norm": 0.0242919921875, + "learning_rate": 0.019516350818937316, + "loss": 0.7798, + "num_input_tokens_seen": 36045096, + "step": 62090 + }, + { + "epoch": 9.248585046172177, + "grad_norm": 0.0279541015625, + "learning_rate": 0.019514491600176286, + "loss": 0.8054, + "num_input_tokens_seen": 36047976, + "step": 62095 + }, + { + "epoch": 9.249329758713136, + "grad_norm": 0.02978515625, + "learning_rate": 0.01951263230514751, + "loss": 0.7784, + "num_input_tokens_seen": 36050888, + "step": 62100 + }, + { + "epoch": 9.250074471254097, + "grad_norm": 0.0194091796875, + "learning_rate": 0.019510772933882398, + "loss": 0.7928, + "num_input_tokens_seen": 36053768, + "step": 62105 + }, + { + "epoch": 9.250819183795056, + "grad_norm": 0.0283203125, + "learning_rate": 0.019508913486412358, + "loss": 0.7981, + "num_input_tokens_seen": 36056680, + "step": 62110 + }, + { + "epoch": 9.251563896336014, + "grad_norm": 0.0257568359375, + "learning_rate": 0.019507053962768816, + "loss": 0.7996, + "num_input_tokens_seen": 36059560, + "step": 62115 + }, + { + "epoch": 9.252308608876973, + "grad_norm": 0.017578125, + "learning_rate": 0.01950519436298317, + "loss": 0.8184, + "num_input_tokens_seen": 36062312, + "step": 62120 + }, + { + "epoch": 9.253053321417934, + "grad_norm": 0.01446533203125, + "learning_rate": 0.019503334687086846, + "loss": 0.8074, + "num_input_tokens_seen": 36065128, + "step": 62125 + }, + { + "epoch": 9.253798033958892, + "grad_norm": 0.0264892578125, + "learning_rate": 0.01950147493511126, + "loss": 0.8195, + "num_input_tokens_seen": 36067912, + "step": 62130 + }, + { + "epoch": 9.254542746499851, + "grad_norm": 0.0244140625, + "learning_rate": 0.01949961510708783, + "loss": 0.7991, + "num_input_tokens_seen": 36070632, + "step": 62135 + }, + { + "epoch": 9.25528745904081, + "grad_norm": 0.020263671875, + "learning_rate": 0.019497755203047976, + "loss": 0.8108, + "num_input_tokens_seen": 36073512, + "step": 62140 + }, + { + "epoch": 9.256032171581769, + "grad_norm": 0.0301513671875, + "learning_rate": 0.019495895223023123, + "loss": 0.7858, + "num_input_tokens_seen": 36076328, + "step": 62145 + }, + { + "epoch": 9.25677688412273, + "grad_norm": 0.0286865234375, + "learning_rate": 0.019494035167044687, + "loss": 0.8027, + "num_input_tokens_seen": 36079432, + "step": 62150 + }, + { + "epoch": 9.257521596663688, + "grad_norm": 0.021484375, + "learning_rate": 0.019492175035144093, + "loss": 0.8345, + "num_input_tokens_seen": 36082312, + "step": 62155 + }, + { + "epoch": 9.258266309204647, + "grad_norm": 0.0213623046875, + "learning_rate": 0.019490314827352764, + "loss": 0.8142, + "num_input_tokens_seen": 36084904, + "step": 62160 + }, + { + "epoch": 9.259011021745605, + "grad_norm": 0.0234375, + "learning_rate": 0.019488454543702134, + "loss": 0.8198, + "num_input_tokens_seen": 36087784, + "step": 62165 + }, + { + "epoch": 9.259755734286566, + "grad_norm": 0.0194091796875, + "learning_rate": 0.019486594184223625, + "loss": 0.7971, + "num_input_tokens_seen": 36090600, + "step": 62170 + }, + { + "epoch": 9.260500446827525, + "grad_norm": 0.036376953125, + "learning_rate": 0.01948473374894867, + "loss": 0.8176, + "num_input_tokens_seen": 36093448, + "step": 62175 + }, + { + "epoch": 9.261245159368483, + "grad_norm": 0.023193359375, + "learning_rate": 0.019482873237908694, + "loss": 0.811, + "num_input_tokens_seen": 36096392, + "step": 62180 + }, + { + "epoch": 9.261989871909442, + "grad_norm": 0.021728515625, + "learning_rate": 0.019481012651135134, + "loss": 0.8103, + "num_input_tokens_seen": 36099368, + "step": 62185 + }, + { + "epoch": 9.262734584450403, + "grad_norm": 0.0400390625, + "learning_rate": 0.01947915198865942, + "loss": 0.8244, + "num_input_tokens_seen": 36102376, + "step": 62190 + }, + { + "epoch": 9.263479296991362, + "grad_norm": 0.022216796875, + "learning_rate": 0.01947729125051298, + "loss": 0.7913, + "num_input_tokens_seen": 36105128, + "step": 62195 + }, + { + "epoch": 9.26422400953232, + "grad_norm": 0.0250244140625, + "learning_rate": 0.019475430436727253, + "loss": 0.8008, + "num_input_tokens_seen": 36108296, + "step": 62200 + }, + { + "epoch": 9.264968722073279, + "grad_norm": 0.03466796875, + "learning_rate": 0.019473569547333685, + "loss": 0.803, + "num_input_tokens_seen": 36111656, + "step": 62205 + }, + { + "epoch": 9.26571343461424, + "grad_norm": 0.02734375, + "learning_rate": 0.019471708582363703, + "loss": 0.7973, + "num_input_tokens_seen": 36114536, + "step": 62210 + }, + { + "epoch": 9.266458147155198, + "grad_norm": 0.0255126953125, + "learning_rate": 0.01946984754184875, + "loss": 0.8109, + "num_input_tokens_seen": 36117672, + "step": 62215 + }, + { + "epoch": 9.267202859696157, + "grad_norm": 0.036376953125, + "learning_rate": 0.019467986425820268, + "loss": 0.7894, + "num_input_tokens_seen": 36120360, + "step": 62220 + }, + { + "epoch": 9.267947572237116, + "grad_norm": 0.02734375, + "learning_rate": 0.019466125234309694, + "loss": 0.8172, + "num_input_tokens_seen": 36123208, + "step": 62225 + }, + { + "epoch": 9.268692284778076, + "grad_norm": 0.0361328125, + "learning_rate": 0.01946426396734847, + "loss": 0.7859, + "num_input_tokens_seen": 36125992, + "step": 62230 + }, + { + "epoch": 9.269436997319035, + "grad_norm": 0.0289306640625, + "learning_rate": 0.019462402624968046, + "loss": 0.7869, + "num_input_tokens_seen": 36128936, + "step": 62235 + }, + { + "epoch": 9.270181709859994, + "grad_norm": 0.02587890625, + "learning_rate": 0.019460541207199863, + "loss": 0.8035, + "num_input_tokens_seen": 36132072, + "step": 62240 + }, + { + "epoch": 9.270926422400953, + "grad_norm": 0.018798828125, + "learning_rate": 0.019458679714075375, + "loss": 0.799, + "num_input_tokens_seen": 36135304, + "step": 62245 + }, + { + "epoch": 9.271671134941913, + "grad_norm": 0.040771484375, + "learning_rate": 0.019456818145626022, + "loss": 0.8246, + "num_input_tokens_seen": 36138312, + "step": 62250 + }, + { + "epoch": 9.272415847482872, + "grad_norm": 0.0267333984375, + "learning_rate": 0.019454956501883256, + "loss": 0.7887, + "num_input_tokens_seen": 36141192, + "step": 62255 + }, + { + "epoch": 9.27316056002383, + "grad_norm": 0.0216064453125, + "learning_rate": 0.019453094782878527, + "loss": 0.8093, + "num_input_tokens_seen": 36144232, + "step": 62260 + }, + { + "epoch": 9.27390527256479, + "grad_norm": 0.01953125, + "learning_rate": 0.01945123298864329, + "loss": 0.8103, + "num_input_tokens_seen": 36147016, + "step": 62265 + }, + { + "epoch": 9.27464998510575, + "grad_norm": 0.022216796875, + "learning_rate": 0.019449371119208993, + "loss": 0.7932, + "num_input_tokens_seen": 36149832, + "step": 62270 + }, + { + "epoch": 9.275394697646709, + "grad_norm": 0.0281982421875, + "learning_rate": 0.019447509174607094, + "loss": 0.807, + "num_input_tokens_seen": 36153000, + "step": 62275 + }, + { + "epoch": 9.276139410187668, + "grad_norm": 0.021240234375, + "learning_rate": 0.019445647154869048, + "loss": 0.8005, + "num_input_tokens_seen": 36156040, + "step": 62280 + }, + { + "epoch": 9.276884122728626, + "grad_norm": 0.03466796875, + "learning_rate": 0.019443785060026312, + "loss": 0.7872, + "num_input_tokens_seen": 36158568, + "step": 62285 + }, + { + "epoch": 9.277628835269585, + "grad_norm": 0.0302734375, + "learning_rate": 0.019441922890110343, + "loss": 0.7889, + "num_input_tokens_seen": 36161512, + "step": 62290 + }, + { + "epoch": 9.278373547810546, + "grad_norm": 0.03173828125, + "learning_rate": 0.019440060645152604, + "loss": 0.8124, + "num_input_tokens_seen": 36164328, + "step": 62295 + }, + { + "epoch": 9.279118260351504, + "grad_norm": 0.025390625, + "learning_rate": 0.019438198325184553, + "loss": 0.8014, + "num_input_tokens_seen": 36167272, + "step": 62300 + }, + { + "epoch": 9.279862972892463, + "grad_norm": 0.0216064453125, + "learning_rate": 0.019436335930237646, + "loss": 0.7899, + "num_input_tokens_seen": 36170120, + "step": 62305 + }, + { + "epoch": 9.280607685433422, + "grad_norm": 0.025634765625, + "learning_rate": 0.019434473460343354, + "loss": 0.7821, + "num_input_tokens_seen": 36173160, + "step": 62310 + }, + { + "epoch": 9.281352397974382, + "grad_norm": 0.041748046875, + "learning_rate": 0.019432610915533145, + "loss": 0.8096, + "num_input_tokens_seen": 36175816, + "step": 62315 + }, + { + "epoch": 9.282097110515341, + "grad_norm": 0.0274658203125, + "learning_rate": 0.019430748295838476, + "loss": 0.8089, + "num_input_tokens_seen": 36179304, + "step": 62320 + }, + { + "epoch": 9.2828418230563, + "grad_norm": 0.015625, + "learning_rate": 0.01942888560129082, + "loss": 0.8089, + "num_input_tokens_seen": 36182024, + "step": 62325 + }, + { + "epoch": 9.283586535597259, + "grad_norm": 0.020751953125, + "learning_rate": 0.01942702283192164, + "loss": 0.8057, + "num_input_tokens_seen": 36184904, + "step": 62330 + }, + { + "epoch": 9.28433124813822, + "grad_norm": 0.025390625, + "learning_rate": 0.01942515998776241, + "loss": 0.8041, + "num_input_tokens_seen": 36188040, + "step": 62335 + }, + { + "epoch": 9.285075960679178, + "grad_norm": 0.0213623046875, + "learning_rate": 0.019423297068844603, + "loss": 0.7907, + "num_input_tokens_seen": 36191048, + "step": 62340 + }, + { + "epoch": 9.285820673220137, + "grad_norm": 0.0166015625, + "learning_rate": 0.019421434075199685, + "loss": 0.7895, + "num_input_tokens_seen": 36193768, + "step": 62345 + }, + { + "epoch": 9.286565385761095, + "grad_norm": 0.027099609375, + "learning_rate": 0.019419571006859135, + "loss": 0.78, + "num_input_tokens_seen": 36196584, + "step": 62350 + }, + { + "epoch": 9.287310098302056, + "grad_norm": 0.0299072265625, + "learning_rate": 0.019417707863854418, + "loss": 0.8052, + "num_input_tokens_seen": 36199432, + "step": 62355 + }, + { + "epoch": 9.288054810843015, + "grad_norm": 0.0224609375, + "learning_rate": 0.019415844646217022, + "loss": 0.7966, + "num_input_tokens_seen": 36202280, + "step": 62360 + }, + { + "epoch": 9.288799523383974, + "grad_norm": 0.0233154296875, + "learning_rate": 0.019413981353978413, + "loss": 0.8148, + "num_input_tokens_seen": 36205064, + "step": 62365 + }, + { + "epoch": 9.289544235924932, + "grad_norm": 0.033447265625, + "learning_rate": 0.01941211798717008, + "loss": 0.8158, + "num_input_tokens_seen": 36207880, + "step": 62370 + }, + { + "epoch": 9.290288948465893, + "grad_norm": 0.0185546875, + "learning_rate": 0.0194102545458235, + "loss": 0.8013, + "num_input_tokens_seen": 36210664, + "step": 62375 + }, + { + "epoch": 9.291033661006852, + "grad_norm": 0.0218505859375, + "learning_rate": 0.019408391029970152, + "loss": 0.7741, + "num_input_tokens_seen": 36213192, + "step": 62380 + }, + { + "epoch": 9.29177837354781, + "grad_norm": 0.02294921875, + "learning_rate": 0.01940652743964151, + "loss": 0.8126, + "num_input_tokens_seen": 36215976, + "step": 62385 + }, + { + "epoch": 9.292523086088769, + "grad_norm": 0.03564453125, + "learning_rate": 0.01940466377486907, + "loss": 0.7956, + "num_input_tokens_seen": 36218888, + "step": 62390 + }, + { + "epoch": 9.29326779862973, + "grad_norm": 0.0311279296875, + "learning_rate": 0.01940280003568431, + "loss": 0.8245, + "num_input_tokens_seen": 36221640, + "step": 62395 + }, + { + "epoch": 9.294012511170688, + "grad_norm": 0.02490234375, + "learning_rate": 0.01940093622211872, + "loss": 0.7985, + "num_input_tokens_seen": 36224360, + "step": 62400 + }, + { + "epoch": 9.294757223711647, + "grad_norm": 0.021484375, + "learning_rate": 0.01939907233420379, + "loss": 0.7952, + "num_input_tokens_seen": 36227432, + "step": 62405 + }, + { + "epoch": 9.295501936252606, + "grad_norm": 0.0244140625, + "learning_rate": 0.019397208371970998, + "loss": 0.798, + "num_input_tokens_seen": 36230344, + "step": 62410 + }, + { + "epoch": 9.296246648793566, + "grad_norm": 0.0242919921875, + "learning_rate": 0.019395344335451842, + "loss": 0.7968, + "num_input_tokens_seen": 36233160, + "step": 62415 + }, + { + "epoch": 9.296991361334525, + "grad_norm": 0.0201416015625, + "learning_rate": 0.019393480224677812, + "loss": 0.815, + "num_input_tokens_seen": 36236008, + "step": 62420 + }, + { + "epoch": 9.297736073875484, + "grad_norm": 0.031982421875, + "learning_rate": 0.019391616039680395, + "loss": 0.8005, + "num_input_tokens_seen": 36239208, + "step": 62425 + }, + { + "epoch": 9.298480786416443, + "grad_norm": 0.0262451171875, + "learning_rate": 0.019389751780491092, + "loss": 0.7861, + "num_input_tokens_seen": 36242216, + "step": 62430 + }, + { + "epoch": 9.299225498957403, + "grad_norm": 0.026123046875, + "learning_rate": 0.019387887447141393, + "loss": 0.7867, + "num_input_tokens_seen": 36245320, + "step": 62435 + }, + { + "epoch": 9.299970211498362, + "grad_norm": 0.0296630859375, + "learning_rate": 0.019386023039662793, + "loss": 0.7974, + "num_input_tokens_seen": 36248104, + "step": 62440 + }, + { + "epoch": 9.30071492403932, + "grad_norm": 0.030517578125, + "learning_rate": 0.019384158558086798, + "loss": 0.7738, + "num_input_tokens_seen": 36250696, + "step": 62445 + }, + { + "epoch": 9.30145963658028, + "grad_norm": 0.0245361328125, + "learning_rate": 0.019382294002444893, + "loss": 0.8007, + "num_input_tokens_seen": 36253448, + "step": 62450 + }, + { + "epoch": 9.30220434912124, + "grad_norm": 0.0228271484375, + "learning_rate": 0.019380429372768588, + "loss": 0.8036, + "num_input_tokens_seen": 36256072, + "step": 62455 + }, + { + "epoch": 9.302949061662199, + "grad_norm": 0.0208740234375, + "learning_rate": 0.019378564669089382, + "loss": 0.8024, + "num_input_tokens_seen": 36258952, + "step": 62460 + }, + { + "epoch": 9.303693774203158, + "grad_norm": 0.0260009765625, + "learning_rate": 0.019376699891438774, + "loss": 0.7807, + "num_input_tokens_seen": 36261832, + "step": 62465 + }, + { + "epoch": 9.304438486744116, + "grad_norm": 0.0301513671875, + "learning_rate": 0.019374835039848267, + "loss": 0.8099, + "num_input_tokens_seen": 36264680, + "step": 62470 + }, + { + "epoch": 9.305183199285075, + "grad_norm": 0.023681640625, + "learning_rate": 0.019372970114349373, + "loss": 0.7778, + "num_input_tokens_seen": 36267656, + "step": 62475 + }, + { + "epoch": 9.305927911826036, + "grad_norm": 0.031494140625, + "learning_rate": 0.019371105114973593, + "loss": 0.8309, + "num_input_tokens_seen": 36270408, + "step": 62480 + }, + { + "epoch": 9.306672624366994, + "grad_norm": 0.0234375, + "learning_rate": 0.019369240041752434, + "loss": 0.7924, + "num_input_tokens_seen": 36273224, + "step": 62485 + }, + { + "epoch": 9.307417336907953, + "grad_norm": 0.0419921875, + "learning_rate": 0.0193673748947174, + "loss": 0.8065, + "num_input_tokens_seen": 36276232, + "step": 62490 + }, + { + "epoch": 9.308162049448912, + "grad_norm": 0.0216064453125, + "learning_rate": 0.01936550967390001, + "loss": 0.8126, + "num_input_tokens_seen": 36279272, + "step": 62495 + }, + { + "epoch": 9.308906761989872, + "grad_norm": 0.0244140625, + "learning_rate": 0.019363644379331777, + "loss": 0.7836, + "num_input_tokens_seen": 36281928, + "step": 62500 + }, + { + "epoch": 9.309651474530831, + "grad_norm": 0.02490234375, + "learning_rate": 0.0193617790110442, + "loss": 0.8314, + "num_input_tokens_seen": 36284744, + "step": 62505 + }, + { + "epoch": 9.31039618707179, + "grad_norm": 0.032470703125, + "learning_rate": 0.0193599135690688, + "loss": 0.8043, + "num_input_tokens_seen": 36287752, + "step": 62510 + }, + { + "epoch": 9.311140899612749, + "grad_norm": 0.0247802734375, + "learning_rate": 0.019358048053437094, + "loss": 0.8001, + "num_input_tokens_seen": 36290504, + "step": 62515 + }, + { + "epoch": 9.31188561215371, + "grad_norm": 0.02490234375, + "learning_rate": 0.019356182464180596, + "loss": 0.7777, + "num_input_tokens_seen": 36293384, + "step": 62520 + }, + { + "epoch": 9.312630324694668, + "grad_norm": 0.0218505859375, + "learning_rate": 0.01935431680133082, + "loss": 0.8026, + "num_input_tokens_seen": 36296328, + "step": 62525 + }, + { + "epoch": 9.313375037235627, + "grad_norm": 0.032958984375, + "learning_rate": 0.01935245106491929, + "loss": 0.8011, + "num_input_tokens_seen": 36299240, + "step": 62530 + }, + { + "epoch": 9.314119749776586, + "grad_norm": 0.029052734375, + "learning_rate": 0.019350585254977522, + "loss": 0.7947, + "num_input_tokens_seen": 36302024, + "step": 62535 + }, + { + "epoch": 9.314864462317546, + "grad_norm": 0.03271484375, + "learning_rate": 0.019348719371537034, + "loss": 0.8181, + "num_input_tokens_seen": 36305288, + "step": 62540 + }, + { + "epoch": 9.315609174858505, + "grad_norm": 0.0152587890625, + "learning_rate": 0.01934685341462936, + "loss": 0.8103, + "num_input_tokens_seen": 36308008, + "step": 62545 + }, + { + "epoch": 9.316353887399464, + "grad_norm": 0.0230712890625, + "learning_rate": 0.01934498738428601, + "loss": 0.7947, + "num_input_tokens_seen": 36310984, + "step": 62550 + }, + { + "epoch": 9.317098599940422, + "grad_norm": 0.0478515625, + "learning_rate": 0.019343121280538514, + "loss": 0.7806, + "num_input_tokens_seen": 36313960, + "step": 62555 + }, + { + "epoch": 9.317843312481383, + "grad_norm": 0.0341796875, + "learning_rate": 0.019341255103418397, + "loss": 0.8173, + "num_input_tokens_seen": 36316744, + "step": 62560 + }, + { + "epoch": 9.318588025022342, + "grad_norm": 0.02197265625, + "learning_rate": 0.019339388852957187, + "loss": 0.7888, + "num_input_tokens_seen": 36319496, + "step": 62565 + }, + { + "epoch": 9.3193327375633, + "grad_norm": 0.02783203125, + "learning_rate": 0.019337522529186413, + "loss": 0.8063, + "num_input_tokens_seen": 36322248, + "step": 62570 + }, + { + "epoch": 9.32007745010426, + "grad_norm": 0.0260009765625, + "learning_rate": 0.01933565613213761, + "loss": 0.8324, + "num_input_tokens_seen": 36325224, + "step": 62575 + }, + { + "epoch": 9.32082216264522, + "grad_norm": 0.02392578125, + "learning_rate": 0.0193337896618423, + "loss": 0.804, + "num_input_tokens_seen": 36327976, + "step": 62580 + }, + { + "epoch": 9.321566875186178, + "grad_norm": 0.0322265625, + "learning_rate": 0.01933192311833202, + "loss": 0.8075, + "num_input_tokens_seen": 36330792, + "step": 62585 + }, + { + "epoch": 9.322311587727137, + "grad_norm": 0.022216796875, + "learning_rate": 0.0193300565016383, + "loss": 0.7997, + "num_input_tokens_seen": 36333576, + "step": 62590 + }, + { + "epoch": 9.323056300268096, + "grad_norm": 0.015625, + "learning_rate": 0.019328189811792674, + "loss": 0.8262, + "num_input_tokens_seen": 36336424, + "step": 62595 + }, + { + "epoch": 9.323801012809056, + "grad_norm": 0.042236328125, + "learning_rate": 0.01932632304882668, + "loss": 0.8189, + "num_input_tokens_seen": 36339304, + "step": 62600 + }, + { + "epoch": 9.324545725350015, + "grad_norm": 0.0301513671875, + "learning_rate": 0.019324456212771856, + "loss": 0.821, + "num_input_tokens_seen": 36341992, + "step": 62605 + }, + { + "epoch": 9.325290437890974, + "grad_norm": 0.0269775390625, + "learning_rate": 0.019322589303659744, + "loss": 0.7991, + "num_input_tokens_seen": 36344968, + "step": 62610 + }, + { + "epoch": 9.326035150431933, + "grad_norm": 0.0205078125, + "learning_rate": 0.019320722321521874, + "loss": 0.8014, + "num_input_tokens_seen": 36348136, + "step": 62615 + }, + { + "epoch": 9.326779862972893, + "grad_norm": 0.02001953125, + "learning_rate": 0.01931885526638979, + "loss": 0.7946, + "num_input_tokens_seen": 36350824, + "step": 62620 + }, + { + "epoch": 9.327524575513852, + "grad_norm": 0.02880859375, + "learning_rate": 0.019316988138295046, + "loss": 0.7801, + "num_input_tokens_seen": 36353800, + "step": 62625 + }, + { + "epoch": 9.32826928805481, + "grad_norm": 0.018798828125, + "learning_rate": 0.019315120937269167, + "loss": 0.8206, + "num_input_tokens_seen": 36356584, + "step": 62630 + }, + { + "epoch": 9.32901400059577, + "grad_norm": 0.0242919921875, + "learning_rate": 0.019313253663343705, + "loss": 0.8112, + "num_input_tokens_seen": 36359368, + "step": 62635 + }, + { + "epoch": 9.32975871313673, + "grad_norm": 0.027099609375, + "learning_rate": 0.019311386316550206, + "loss": 0.7849, + "num_input_tokens_seen": 36362248, + "step": 62640 + }, + { + "epoch": 9.330503425677689, + "grad_norm": 0.0250244140625, + "learning_rate": 0.019309518896920223, + "loss": 0.8051, + "num_input_tokens_seen": 36364936, + "step": 62645 + }, + { + "epoch": 9.331248138218648, + "grad_norm": 0.0311279296875, + "learning_rate": 0.01930765140448529, + "loss": 0.8106, + "num_input_tokens_seen": 36368328, + "step": 62650 + }, + { + "epoch": 9.331992850759606, + "grad_norm": 0.0247802734375, + "learning_rate": 0.01930578383927697, + "loss": 0.804, + "num_input_tokens_seen": 36371144, + "step": 62655 + }, + { + "epoch": 9.332737563300565, + "grad_norm": 0.032958984375, + "learning_rate": 0.019303916201326803, + "loss": 0.805, + "num_input_tokens_seen": 36373896, + "step": 62660 + }, + { + "epoch": 9.333482275841526, + "grad_norm": 0.0211181640625, + "learning_rate": 0.019302048490666354, + "loss": 0.7896, + "num_input_tokens_seen": 36376840, + "step": 62665 + }, + { + "epoch": 9.334226988382484, + "grad_norm": 0.054443359375, + "learning_rate": 0.019300180707327166, + "loss": 0.8133, + "num_input_tokens_seen": 36379656, + "step": 62670 + }, + { + "epoch": 9.334971700923443, + "grad_norm": 0.017822265625, + "learning_rate": 0.019298312851340788, + "loss": 0.7925, + "num_input_tokens_seen": 36382472, + "step": 62675 + }, + { + "epoch": 9.335716413464402, + "grad_norm": 0.0322265625, + "learning_rate": 0.019296444922738794, + "loss": 0.7932, + "num_input_tokens_seen": 36385448, + "step": 62680 + }, + { + "epoch": 9.336461126005362, + "grad_norm": 0.027587890625, + "learning_rate": 0.019294576921552724, + "loss": 0.8085, + "num_input_tokens_seen": 36388520, + "step": 62685 + }, + { + "epoch": 9.337205838546321, + "grad_norm": 0.0262451171875, + "learning_rate": 0.01929270884781414, + "loss": 0.8076, + "num_input_tokens_seen": 36391272, + "step": 62690 + }, + { + "epoch": 9.33795055108728, + "grad_norm": 0.028076171875, + "learning_rate": 0.019290840701554606, + "loss": 0.7808, + "num_input_tokens_seen": 36394152, + "step": 62695 + }, + { + "epoch": 9.338695263628239, + "grad_norm": 0.0218505859375, + "learning_rate": 0.019288972482805675, + "loss": 0.8184, + "num_input_tokens_seen": 36397192, + "step": 62700 + }, + { + "epoch": 9.3394399761692, + "grad_norm": 0.025390625, + "learning_rate": 0.01928710419159892, + "loss": 0.7837, + "num_input_tokens_seen": 36400168, + "step": 62705 + }, + { + "epoch": 9.340184688710158, + "grad_norm": 0.02734375, + "learning_rate": 0.019285235827965895, + "loss": 0.7927, + "num_input_tokens_seen": 36402920, + "step": 62710 + }, + { + "epoch": 9.340929401251117, + "grad_norm": 0.0225830078125, + "learning_rate": 0.01928336739193816, + "loss": 0.7917, + "num_input_tokens_seen": 36406088, + "step": 62715 + }, + { + "epoch": 9.341674113792076, + "grad_norm": 0.0203857421875, + "learning_rate": 0.019281498883547295, + "loss": 0.7831, + "num_input_tokens_seen": 36409256, + "step": 62720 + }, + { + "epoch": 9.342418826333036, + "grad_norm": 0.0234375, + "learning_rate": 0.01927963030282485, + "loss": 0.8001, + "num_input_tokens_seen": 36411880, + "step": 62725 + }, + { + "epoch": 9.343163538873995, + "grad_norm": 0.023681640625, + "learning_rate": 0.019277761649802404, + "loss": 0.8091, + "num_input_tokens_seen": 36414760, + "step": 62730 + }, + { + "epoch": 9.343908251414954, + "grad_norm": 0.0172119140625, + "learning_rate": 0.019275892924511522, + "loss": 0.7794, + "num_input_tokens_seen": 36417608, + "step": 62735 + }, + { + "epoch": 9.344652963955912, + "grad_norm": 0.01904296875, + "learning_rate": 0.019274024126983775, + "loss": 0.8217, + "num_input_tokens_seen": 36420648, + "step": 62740 + }, + { + "epoch": 9.345397676496873, + "grad_norm": 0.037841796875, + "learning_rate": 0.019272155257250737, + "loss": 0.8135, + "num_input_tokens_seen": 36423432, + "step": 62745 + }, + { + "epoch": 9.346142389037832, + "grad_norm": 0.0322265625, + "learning_rate": 0.019270286315343974, + "loss": 0.8241, + "num_input_tokens_seen": 36426312, + "step": 62750 + }, + { + "epoch": 9.34688710157879, + "grad_norm": 0.030517578125, + "learning_rate": 0.019268417301295068, + "loss": 0.7762, + "num_input_tokens_seen": 36429256, + "step": 62755 + }, + { + "epoch": 9.34763181411975, + "grad_norm": 0.0400390625, + "learning_rate": 0.019266548215135588, + "loss": 0.7964, + "num_input_tokens_seen": 36432360, + "step": 62760 + }, + { + "epoch": 9.34837652666071, + "grad_norm": 0.022216796875, + "learning_rate": 0.01926467905689711, + "loss": 0.7887, + "num_input_tokens_seen": 36435400, + "step": 62765 + }, + { + "epoch": 9.349121239201668, + "grad_norm": 0.033203125, + "learning_rate": 0.019262809826611212, + "loss": 0.7966, + "num_input_tokens_seen": 36438312, + "step": 62770 + }, + { + "epoch": 9.349865951742627, + "grad_norm": 0.0223388671875, + "learning_rate": 0.019260940524309485, + "loss": 0.783, + "num_input_tokens_seen": 36441256, + "step": 62775 + }, + { + "epoch": 9.350610664283586, + "grad_norm": 0.02099609375, + "learning_rate": 0.01925907115002349, + "loss": 0.8173, + "num_input_tokens_seen": 36444168, + "step": 62780 + }, + { + "epoch": 9.351355376824547, + "grad_norm": 0.02392578125, + "learning_rate": 0.019257201703784817, + "loss": 0.7706, + "num_input_tokens_seen": 36446760, + "step": 62785 + }, + { + "epoch": 9.352100089365505, + "grad_norm": 0.0164794921875, + "learning_rate": 0.01925533218562505, + "loss": 0.8087, + "num_input_tokens_seen": 36449512, + "step": 62790 + }, + { + "epoch": 9.352844801906464, + "grad_norm": 0.0206298828125, + "learning_rate": 0.019253462595575768, + "loss": 0.7933, + "num_input_tokens_seen": 36452648, + "step": 62795 + }, + { + "epoch": 9.353589514447423, + "grad_norm": 0.035888671875, + "learning_rate": 0.01925159293366856, + "loss": 0.7851, + "num_input_tokens_seen": 36455432, + "step": 62800 + }, + { + "epoch": 9.354334226988382, + "grad_norm": 0.0341796875, + "learning_rate": 0.019249723199935013, + "loss": 0.7953, + "num_input_tokens_seen": 36458600, + "step": 62805 + }, + { + "epoch": 9.355078939529342, + "grad_norm": 0.0235595703125, + "learning_rate": 0.019247853394406707, + "loss": 0.8014, + "num_input_tokens_seen": 36461544, + "step": 62810 + }, + { + "epoch": 9.3558236520703, + "grad_norm": 0.0213623046875, + "learning_rate": 0.01924598351711524, + "loss": 0.7819, + "num_input_tokens_seen": 36464392, + "step": 62815 + }, + { + "epoch": 9.35656836461126, + "grad_norm": 0.030029296875, + "learning_rate": 0.019244113568092195, + "loss": 0.8081, + "num_input_tokens_seen": 36467080, + "step": 62820 + }, + { + "epoch": 9.357313077152218, + "grad_norm": 0.028564453125, + "learning_rate": 0.019242243547369163, + "loss": 0.8091, + "num_input_tokens_seen": 36469896, + "step": 62825 + }, + { + "epoch": 9.358057789693179, + "grad_norm": 0.02197265625, + "learning_rate": 0.01924037345497774, + "loss": 0.7906, + "num_input_tokens_seen": 36472744, + "step": 62830 + }, + { + "epoch": 9.358802502234138, + "grad_norm": 0.0179443359375, + "learning_rate": 0.019238503290949515, + "loss": 0.8221, + "num_input_tokens_seen": 36475368, + "step": 62835 + }, + { + "epoch": 9.359547214775096, + "grad_norm": 0.035888671875, + "learning_rate": 0.019236633055316088, + "loss": 0.7916, + "num_input_tokens_seen": 36478152, + "step": 62840 + }, + { + "epoch": 9.360291927316055, + "grad_norm": 0.0257568359375, + "learning_rate": 0.019234762748109053, + "loss": 0.7951, + "num_input_tokens_seen": 36480904, + "step": 62845 + }, + { + "epoch": 9.361036639857016, + "grad_norm": 0.0220947265625, + "learning_rate": 0.01923289236936, + "loss": 0.7914, + "num_input_tokens_seen": 36483784, + "step": 62850 + }, + { + "epoch": 9.361781352397974, + "grad_norm": 0.050048828125, + "learning_rate": 0.019231021919100533, + "loss": 0.8292, + "num_input_tokens_seen": 36486792, + "step": 62855 + }, + { + "epoch": 9.362526064938933, + "grad_norm": 0.038818359375, + "learning_rate": 0.019229151397362254, + "loss": 0.795, + "num_input_tokens_seen": 36489832, + "step": 62860 + }, + { + "epoch": 9.363270777479892, + "grad_norm": 0.0191650390625, + "learning_rate": 0.019227280804176763, + "loss": 0.8055, + "num_input_tokens_seen": 36492744, + "step": 62865 + }, + { + "epoch": 9.364015490020853, + "grad_norm": 0.0279541015625, + "learning_rate": 0.01922541013957565, + "loss": 0.7908, + "num_input_tokens_seen": 36495432, + "step": 62870 + }, + { + "epoch": 9.364760202561811, + "grad_norm": 0.0234375, + "learning_rate": 0.019223539403590537, + "loss": 0.8103, + "num_input_tokens_seen": 36497992, + "step": 62875 + }, + { + "epoch": 9.36550491510277, + "grad_norm": 0.02490234375, + "learning_rate": 0.019221668596253013, + "loss": 0.787, + "num_input_tokens_seen": 36500648, + "step": 62880 + }, + { + "epoch": 9.366249627643729, + "grad_norm": 0.0245361328125, + "learning_rate": 0.01921979771759469, + "loss": 0.7995, + "num_input_tokens_seen": 36503784, + "step": 62885 + }, + { + "epoch": 9.36699434018469, + "grad_norm": 0.027099609375, + "learning_rate": 0.01921792676764717, + "loss": 0.8071, + "num_input_tokens_seen": 36506376, + "step": 62890 + }, + { + "epoch": 9.367739052725648, + "grad_norm": 0.0546875, + "learning_rate": 0.019216055746442067, + "loss": 0.8026, + "num_input_tokens_seen": 36509480, + "step": 62895 + }, + { + "epoch": 9.368483765266607, + "grad_norm": 0.03564453125, + "learning_rate": 0.019214184654010986, + "loss": 0.8176, + "num_input_tokens_seen": 36512264, + "step": 62900 + }, + { + "epoch": 9.369228477807566, + "grad_norm": 0.03271484375, + "learning_rate": 0.019212313490385543, + "loss": 0.7757, + "num_input_tokens_seen": 36515080, + "step": 62905 + }, + { + "epoch": 9.369973190348526, + "grad_norm": 0.035888671875, + "learning_rate": 0.019210442255597347, + "loss": 0.811, + "num_input_tokens_seen": 36517896, + "step": 62910 + }, + { + "epoch": 9.370717902889485, + "grad_norm": 0.0235595703125, + "learning_rate": 0.019208570949677997, + "loss": 0.785, + "num_input_tokens_seen": 36520552, + "step": 62915 + }, + { + "epoch": 9.371462615430444, + "grad_norm": 0.0361328125, + "learning_rate": 0.019206699572659126, + "loss": 0.798, + "num_input_tokens_seen": 36523464, + "step": 62920 + }, + { + "epoch": 9.372207327971402, + "grad_norm": 0.0322265625, + "learning_rate": 0.01920482812457234, + "loss": 0.7972, + "num_input_tokens_seen": 36526216, + "step": 62925 + }, + { + "epoch": 9.372952040512363, + "grad_norm": 0.0203857421875, + "learning_rate": 0.019202956605449253, + "loss": 0.817, + "num_input_tokens_seen": 36529032, + "step": 62930 + }, + { + "epoch": 9.373696753053322, + "grad_norm": 0.032470703125, + "learning_rate": 0.01920108501532149, + "loss": 0.7951, + "num_input_tokens_seen": 36531592, + "step": 62935 + }, + { + "epoch": 9.37444146559428, + "grad_norm": 0.0281982421875, + "learning_rate": 0.019199213354220663, + "loss": 0.8098, + "num_input_tokens_seen": 36534760, + "step": 62940 + }, + { + "epoch": 9.37518617813524, + "grad_norm": 0.035888671875, + "learning_rate": 0.0191973416221784, + "loss": 0.8209, + "num_input_tokens_seen": 36537736, + "step": 62945 + }, + { + "epoch": 9.3759308906762, + "grad_norm": 0.040283203125, + "learning_rate": 0.01919546981922631, + "loss": 0.8069, + "num_input_tokens_seen": 36540680, + "step": 62950 + }, + { + "epoch": 9.376675603217159, + "grad_norm": 0.020263671875, + "learning_rate": 0.01919359794539602, + "loss": 0.7973, + "num_input_tokens_seen": 36543144, + "step": 62955 + }, + { + "epoch": 9.377420315758117, + "grad_norm": 0.01409912109375, + "learning_rate": 0.01919172600071916, + "loss": 0.8053, + "num_input_tokens_seen": 36546088, + "step": 62960 + }, + { + "epoch": 9.378165028299076, + "grad_norm": 0.0264892578125, + "learning_rate": 0.019189853985227344, + "loss": 0.7897, + "num_input_tokens_seen": 36549384, + "step": 62965 + }, + { + "epoch": 9.378909740840037, + "grad_norm": 0.0311279296875, + "learning_rate": 0.019187981898952208, + "loss": 0.7885, + "num_input_tokens_seen": 36552424, + "step": 62970 + }, + { + "epoch": 9.379654453380995, + "grad_norm": 0.0303955078125, + "learning_rate": 0.019186109741925375, + "loss": 0.8078, + "num_input_tokens_seen": 36555432, + "step": 62975 + }, + { + "epoch": 9.380399165921954, + "grad_norm": 0.0191650390625, + "learning_rate": 0.01918423751417847, + "loss": 0.8036, + "num_input_tokens_seen": 36558632, + "step": 62980 + }, + { + "epoch": 9.381143878462913, + "grad_norm": 0.035400390625, + "learning_rate": 0.019182365215743128, + "loss": 0.8151, + "num_input_tokens_seen": 36561768, + "step": 62985 + }, + { + "epoch": 9.381888591003872, + "grad_norm": 0.050048828125, + "learning_rate": 0.019180492846650972, + "loss": 0.8049, + "num_input_tokens_seen": 36564488, + "step": 62990 + }, + { + "epoch": 9.382633303544832, + "grad_norm": 0.018310546875, + "learning_rate": 0.01917862040693364, + "loss": 0.8171, + "num_input_tokens_seen": 36567240, + "step": 62995 + }, + { + "epoch": 9.383378016085791, + "grad_norm": 0.0220947265625, + "learning_rate": 0.019176747896622767, + "loss": 0.809, + "num_input_tokens_seen": 36570248, + "step": 63000 + }, + { + "epoch": 9.38412272862675, + "grad_norm": 0.0294189453125, + "learning_rate": 0.019174875315749976, + "loss": 0.8063, + "num_input_tokens_seen": 36573000, + "step": 63005 + }, + { + "epoch": 9.384867441167708, + "grad_norm": 0.04638671875, + "learning_rate": 0.019173002664346916, + "loss": 0.8122, + "num_input_tokens_seen": 36576008, + "step": 63010 + }, + { + "epoch": 9.385612153708669, + "grad_norm": 0.034423828125, + "learning_rate": 0.019171129942445214, + "loss": 0.817, + "num_input_tokens_seen": 36578696, + "step": 63015 + }, + { + "epoch": 9.386356866249628, + "grad_norm": 0.0228271484375, + "learning_rate": 0.01916925715007651, + "loss": 0.7889, + "num_input_tokens_seen": 36581448, + "step": 63020 + }, + { + "epoch": 9.387101578790586, + "grad_norm": 0.02880859375, + "learning_rate": 0.019167384287272448, + "loss": 0.8033, + "num_input_tokens_seen": 36584456, + "step": 63025 + }, + { + "epoch": 9.387846291331545, + "grad_norm": 0.0263671875, + "learning_rate": 0.01916551135406466, + "loss": 0.7882, + "num_input_tokens_seen": 36587528, + "step": 63030 + }, + { + "epoch": 9.388591003872506, + "grad_norm": 0.025634765625, + "learning_rate": 0.01916363835048479, + "loss": 0.7949, + "num_input_tokens_seen": 36590408, + "step": 63035 + }, + { + "epoch": 9.389335716413465, + "grad_norm": 0.0302734375, + "learning_rate": 0.019161765276564487, + "loss": 0.8034, + "num_input_tokens_seen": 36593160, + "step": 63040 + }, + { + "epoch": 9.390080428954423, + "grad_norm": 0.023681640625, + "learning_rate": 0.019159892132335387, + "loss": 0.8213, + "num_input_tokens_seen": 36596008, + "step": 63045 + }, + { + "epoch": 9.390825141495382, + "grad_norm": 0.031982421875, + "learning_rate": 0.019158018917829138, + "loss": 0.7948, + "num_input_tokens_seen": 36598696, + "step": 63050 + }, + { + "epoch": 9.391569854036343, + "grad_norm": 0.029541015625, + "learning_rate": 0.019156145633077385, + "loss": 0.808, + "num_input_tokens_seen": 36601640, + "step": 63055 + }, + { + "epoch": 9.392314566577301, + "grad_norm": 0.0234375, + "learning_rate": 0.019154272278111773, + "loss": 0.807, + "num_input_tokens_seen": 36604424, + "step": 63060 + }, + { + "epoch": 9.39305927911826, + "grad_norm": 0.0228271484375, + "learning_rate": 0.019152398852963957, + "loss": 0.8032, + "num_input_tokens_seen": 36607272, + "step": 63065 + }, + { + "epoch": 9.393803991659219, + "grad_norm": 0.03466796875, + "learning_rate": 0.019150525357665583, + "loss": 0.7987, + "num_input_tokens_seen": 36610120, + "step": 63070 + }, + { + "epoch": 9.39454870420018, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0191486517922483, + "loss": 0.7908, + "num_input_tokens_seen": 36612904, + "step": 63075 + }, + { + "epoch": 9.395293416741138, + "grad_norm": 0.0218505859375, + "learning_rate": 0.01914677815674376, + "loss": 0.7974, + "num_input_tokens_seen": 36615752, + "step": 63080 + }, + { + "epoch": 9.396038129282097, + "grad_norm": 0.054443359375, + "learning_rate": 0.01914490445118362, + "loss": 0.8066, + "num_input_tokens_seen": 36618888, + "step": 63085 + }, + { + "epoch": 9.396782841823056, + "grad_norm": 0.020751953125, + "learning_rate": 0.01914303067559953, + "loss": 0.802, + "num_input_tokens_seen": 36621896, + "step": 63090 + }, + { + "epoch": 9.397527554364016, + "grad_norm": 0.021728515625, + "learning_rate": 0.019141156830023147, + "loss": 0.8249, + "num_input_tokens_seen": 36624712, + "step": 63095 + }, + { + "epoch": 9.398272266904975, + "grad_norm": 0.01409912109375, + "learning_rate": 0.01913928291448613, + "loss": 0.7892, + "num_input_tokens_seen": 36627400, + "step": 63100 + }, + { + "epoch": 9.399016979445934, + "grad_norm": 0.0279541015625, + "learning_rate": 0.019137408929020137, + "loss": 0.8036, + "num_input_tokens_seen": 36630312, + "step": 63105 + }, + { + "epoch": 9.399761691986892, + "grad_norm": 0.0267333984375, + "learning_rate": 0.019135534873656823, + "loss": 0.8017, + "num_input_tokens_seen": 36633192, + "step": 63110 + }, + { + "epoch": 9.400506404527853, + "grad_norm": 0.033203125, + "learning_rate": 0.019133660748427848, + "loss": 0.8096, + "num_input_tokens_seen": 36636008, + "step": 63115 + }, + { + "epoch": 9.401251117068812, + "grad_norm": 0.0341796875, + "learning_rate": 0.01913178655336488, + "loss": 0.7897, + "num_input_tokens_seen": 36638984, + "step": 63120 + }, + { + "epoch": 9.40199582960977, + "grad_norm": 0.0174560546875, + "learning_rate": 0.019129912288499574, + "loss": 0.7907, + "num_input_tokens_seen": 36641960, + "step": 63125 + }, + { + "epoch": 9.40274054215073, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0191280379538636, + "loss": 0.8021, + "num_input_tokens_seen": 36644776, + "step": 63130 + }, + { + "epoch": 9.40348525469169, + "grad_norm": 0.0181884765625, + "learning_rate": 0.019126163549488616, + "loss": 0.8053, + "num_input_tokens_seen": 36647560, + "step": 63135 + }, + { + "epoch": 9.404229967232649, + "grad_norm": 0.02392578125, + "learning_rate": 0.019124289075406302, + "loss": 0.7993, + "num_input_tokens_seen": 36650536, + "step": 63140 + }, + { + "epoch": 9.404974679773607, + "grad_norm": 0.0213623046875, + "learning_rate": 0.019122414531648306, + "loss": 0.8071, + "num_input_tokens_seen": 36653288, + "step": 63145 + }, + { + "epoch": 9.405719392314566, + "grad_norm": 0.0242919921875, + "learning_rate": 0.019120539918246315, + "loss": 0.7905, + "num_input_tokens_seen": 36656424, + "step": 63150 + }, + { + "epoch": 9.406464104855527, + "grad_norm": 0.0296630859375, + "learning_rate": 0.019118665235231983, + "loss": 0.7852, + "num_input_tokens_seen": 36659368, + "step": 63155 + }, + { + "epoch": 9.407208817396485, + "grad_norm": 0.0220947265625, + "learning_rate": 0.01911679048263699, + "loss": 0.7966, + "num_input_tokens_seen": 36662408, + "step": 63160 + }, + { + "epoch": 9.407953529937444, + "grad_norm": 0.0220947265625, + "learning_rate": 0.019114915660493004, + "loss": 0.8062, + "num_input_tokens_seen": 36665576, + "step": 63165 + }, + { + "epoch": 9.408698242478403, + "grad_norm": 0.02197265625, + "learning_rate": 0.019113040768831704, + "loss": 0.7734, + "num_input_tokens_seen": 36668360, + "step": 63170 + }, + { + "epoch": 9.409442955019362, + "grad_norm": 0.026123046875, + "learning_rate": 0.01911116580768476, + "loss": 0.8047, + "num_input_tokens_seen": 36671400, + "step": 63175 + }, + { + "epoch": 9.410187667560322, + "grad_norm": 0.024169921875, + "learning_rate": 0.019109290777083843, + "loss": 0.8116, + "num_input_tokens_seen": 36674344, + "step": 63180 + }, + { + "epoch": 9.410932380101281, + "grad_norm": 0.02685546875, + "learning_rate": 0.01910741567706064, + "loss": 0.8021, + "num_input_tokens_seen": 36677544, + "step": 63185 + }, + { + "epoch": 9.41167709264224, + "grad_norm": 0.032470703125, + "learning_rate": 0.019105540507646825, + "loss": 0.8212, + "num_input_tokens_seen": 36680744, + "step": 63190 + }, + { + "epoch": 9.412421805183198, + "grad_norm": 0.021484375, + "learning_rate": 0.01910366526887407, + "loss": 0.7904, + "num_input_tokens_seen": 36683624, + "step": 63195 + }, + { + "epoch": 9.413166517724159, + "grad_norm": 0.04833984375, + "learning_rate": 0.019101789960774065, + "loss": 0.8394, + "num_input_tokens_seen": 36686376, + "step": 63200 + }, + { + "epoch": 9.413911230265118, + "grad_norm": 0.030029296875, + "learning_rate": 0.019099914583378487, + "loss": 0.822, + "num_input_tokens_seen": 36689544, + "step": 63205 + }, + { + "epoch": 9.414655942806077, + "grad_norm": 0.0230712890625, + "learning_rate": 0.01909803913671902, + "loss": 0.8194, + "num_input_tokens_seen": 36692200, + "step": 63210 + }, + { + "epoch": 9.415400655347035, + "grad_norm": 0.0255126953125, + "learning_rate": 0.019096163620827344, + "loss": 0.813, + "num_input_tokens_seen": 36695400, + "step": 63215 + }, + { + "epoch": 9.416145367887996, + "grad_norm": 0.0223388671875, + "learning_rate": 0.01909428803573515, + "loss": 0.7967, + "num_input_tokens_seen": 36698216, + "step": 63220 + }, + { + "epoch": 9.416890080428955, + "grad_norm": 0.02880859375, + "learning_rate": 0.019092412381474116, + "loss": 0.7931, + "num_input_tokens_seen": 36701288, + "step": 63225 + }, + { + "epoch": 9.417634792969913, + "grad_norm": 0.035888671875, + "learning_rate": 0.01909053665807594, + "loss": 0.8013, + "num_input_tokens_seen": 36704072, + "step": 63230 + }, + { + "epoch": 9.418379505510872, + "grad_norm": 0.02490234375, + "learning_rate": 0.0190886608655723, + "loss": 0.7851, + "num_input_tokens_seen": 36707080, + "step": 63235 + }, + { + "epoch": 9.419124218051833, + "grad_norm": 0.0208740234375, + "learning_rate": 0.019086785003994897, + "loss": 0.8048, + "num_input_tokens_seen": 36709800, + "step": 63240 + }, + { + "epoch": 9.419868930592791, + "grad_norm": 0.0228271484375, + "learning_rate": 0.019084909073375407, + "loss": 0.7994, + "num_input_tokens_seen": 36712328, + "step": 63245 + }, + { + "epoch": 9.42061364313375, + "grad_norm": 0.03173828125, + "learning_rate": 0.019083033073745533, + "loss": 0.8101, + "num_input_tokens_seen": 36715560, + "step": 63250 + }, + { + "epoch": 9.421358355674709, + "grad_norm": 0.0257568359375, + "learning_rate": 0.019081157005136965, + "loss": 0.8114, + "num_input_tokens_seen": 36718248, + "step": 63255 + }, + { + "epoch": 9.42210306821567, + "grad_norm": 0.0242919921875, + "learning_rate": 0.019079280867581396, + "loss": 0.8042, + "num_input_tokens_seen": 36720936, + "step": 63260 + }, + { + "epoch": 9.422847780756628, + "grad_norm": 0.036376953125, + "learning_rate": 0.019077404661110523, + "loss": 0.8081, + "num_input_tokens_seen": 36723688, + "step": 63265 + }, + { + "epoch": 9.423592493297587, + "grad_norm": 0.0286865234375, + "learning_rate": 0.019075528385756046, + "loss": 0.7806, + "num_input_tokens_seen": 36726664, + "step": 63270 + }, + { + "epoch": 9.424337205838546, + "grad_norm": 0.02685546875, + "learning_rate": 0.019073652041549656, + "loss": 0.7995, + "num_input_tokens_seen": 36729448, + "step": 63275 + }, + { + "epoch": 9.425081918379506, + "grad_norm": 0.0159912109375, + "learning_rate": 0.019071775628523056, + "loss": 0.7892, + "num_input_tokens_seen": 36732104, + "step": 63280 + }, + { + "epoch": 9.425826630920465, + "grad_norm": 0.0294189453125, + "learning_rate": 0.019069899146707946, + "loss": 0.7985, + "num_input_tokens_seen": 36735080, + "step": 63285 + }, + { + "epoch": 9.426571343461424, + "grad_norm": 0.054931640625, + "learning_rate": 0.019068022596136023, + "loss": 0.8101, + "num_input_tokens_seen": 36738024, + "step": 63290 + }, + { + "epoch": 9.427316056002383, + "grad_norm": 0.036376953125, + "learning_rate": 0.019066145976838995, + "loss": 0.7909, + "num_input_tokens_seen": 36740904, + "step": 63295 + }, + { + "epoch": 9.428060768543343, + "grad_norm": 0.0213623046875, + "learning_rate": 0.01906426928884856, + "loss": 0.8019, + "num_input_tokens_seen": 36744040, + "step": 63300 + }, + { + "epoch": 9.428805481084302, + "grad_norm": 0.0264892578125, + "learning_rate": 0.019062392532196434, + "loss": 0.7893, + "num_input_tokens_seen": 36746888, + "step": 63305 + }, + { + "epoch": 9.42955019362526, + "grad_norm": 0.0262451171875, + "learning_rate": 0.019060515706914312, + "loss": 0.8133, + "num_input_tokens_seen": 36749736, + "step": 63310 + }, + { + "epoch": 9.43029490616622, + "grad_norm": 0.0230712890625, + "learning_rate": 0.019058638813033903, + "loss": 0.7901, + "num_input_tokens_seen": 36752584, + "step": 63315 + }, + { + "epoch": 9.43103961870718, + "grad_norm": 0.0250244140625, + "learning_rate": 0.019056761850586915, + "loss": 0.7936, + "num_input_tokens_seen": 36755784, + "step": 63320 + }, + { + "epoch": 9.431784331248139, + "grad_norm": 0.015869140625, + "learning_rate": 0.01905488481960506, + "loss": 0.8057, + "num_input_tokens_seen": 36758600, + "step": 63325 + }, + { + "epoch": 9.432529043789097, + "grad_norm": 0.025146484375, + "learning_rate": 0.019053007720120044, + "loss": 0.807, + "num_input_tokens_seen": 36761672, + "step": 63330 + }, + { + "epoch": 9.433273756330056, + "grad_norm": 0.034423828125, + "learning_rate": 0.01905113055216358, + "loss": 0.8022, + "num_input_tokens_seen": 36764680, + "step": 63335 + }, + { + "epoch": 9.434018468871017, + "grad_norm": 0.028076171875, + "learning_rate": 0.019049253315767394, + "loss": 0.7789, + "num_input_tokens_seen": 36767304, + "step": 63340 + }, + { + "epoch": 9.434763181411975, + "grad_norm": 0.036376953125, + "learning_rate": 0.01904737601096318, + "loss": 0.8077, + "num_input_tokens_seen": 36770216, + "step": 63345 + }, + { + "epoch": 9.435507893952934, + "grad_norm": 0.0179443359375, + "learning_rate": 0.019045498637782664, + "loss": 0.8004, + "num_input_tokens_seen": 36773096, + "step": 63350 + }, + { + "epoch": 9.436252606493893, + "grad_norm": 0.031982421875, + "learning_rate": 0.01904362119625756, + "loss": 0.8048, + "num_input_tokens_seen": 36775976, + "step": 63355 + }, + { + "epoch": 9.436997319034852, + "grad_norm": 0.038818359375, + "learning_rate": 0.019041743686419584, + "loss": 0.8038, + "num_input_tokens_seen": 36779016, + "step": 63360 + }, + { + "epoch": 9.437742031575812, + "grad_norm": 0.02099609375, + "learning_rate": 0.019039866108300458, + "loss": 0.8182, + "num_input_tokens_seen": 36781768, + "step": 63365 + }, + { + "epoch": 9.438486744116771, + "grad_norm": 0.024169921875, + "learning_rate": 0.0190379884619319, + "loss": 0.8152, + "num_input_tokens_seen": 36784840, + "step": 63370 + }, + { + "epoch": 9.43923145665773, + "grad_norm": 0.042724609375, + "learning_rate": 0.019036110747345632, + "loss": 0.795, + "num_input_tokens_seen": 36787816, + "step": 63375 + }, + { + "epoch": 9.439976169198689, + "grad_norm": 0.0181884765625, + "learning_rate": 0.019034232964573374, + "loss": 0.8097, + "num_input_tokens_seen": 36790696, + "step": 63380 + }, + { + "epoch": 9.440720881739649, + "grad_norm": 0.0301513671875, + "learning_rate": 0.019032355113646846, + "loss": 0.8076, + "num_input_tokens_seen": 36793768, + "step": 63385 + }, + { + "epoch": 9.441465594280608, + "grad_norm": 0.0281982421875, + "learning_rate": 0.019030477194597786, + "loss": 0.8029, + "num_input_tokens_seen": 36796680, + "step": 63390 + }, + { + "epoch": 9.442210306821567, + "grad_norm": 0.0400390625, + "learning_rate": 0.019028599207457902, + "loss": 0.7792, + "num_input_tokens_seen": 36799656, + "step": 63395 + }, + { + "epoch": 9.442955019362525, + "grad_norm": 0.03515625, + "learning_rate": 0.01902672115225893, + "loss": 0.7934, + "num_input_tokens_seen": 36802376, + "step": 63400 + }, + { + "epoch": 9.443699731903486, + "grad_norm": 0.0245361328125, + "learning_rate": 0.019024843029032602, + "loss": 0.8352, + "num_input_tokens_seen": 36804968, + "step": 63405 + }, + { + "epoch": 9.444444444444445, + "grad_norm": 0.03271484375, + "learning_rate": 0.019022964837810637, + "loss": 0.8004, + "num_input_tokens_seen": 36808136, + "step": 63410 + }, + { + "epoch": 9.445189156985403, + "grad_norm": 0.025390625, + "learning_rate": 0.019021086578624773, + "loss": 0.8089, + "num_input_tokens_seen": 36810920, + "step": 63415 + }, + { + "epoch": 9.445933869526362, + "grad_norm": 0.037353515625, + "learning_rate": 0.01901920825150673, + "loss": 0.7938, + "num_input_tokens_seen": 36813992, + "step": 63420 + }, + { + "epoch": 9.446678582067323, + "grad_norm": 0.0274658203125, + "learning_rate": 0.01901732985648826, + "loss": 0.7788, + "num_input_tokens_seen": 36816744, + "step": 63425 + }, + { + "epoch": 9.447423294608281, + "grad_norm": 0.034912109375, + "learning_rate": 0.01901545139360108, + "loss": 0.8253, + "num_input_tokens_seen": 36819880, + "step": 63430 + }, + { + "epoch": 9.44816800714924, + "grad_norm": 0.031982421875, + "learning_rate": 0.019013572862876926, + "loss": 0.814, + "num_input_tokens_seen": 36822920, + "step": 63435 + }, + { + "epoch": 9.448912719690199, + "grad_norm": 0.0419921875, + "learning_rate": 0.019011694264347546, + "loss": 0.8166, + "num_input_tokens_seen": 36825960, + "step": 63440 + }, + { + "epoch": 9.44965743223116, + "grad_norm": 0.03759765625, + "learning_rate": 0.019009815598044662, + "loss": 0.7882, + "num_input_tokens_seen": 36829064, + "step": 63445 + }, + { + "epoch": 9.450402144772118, + "grad_norm": 0.03515625, + "learning_rate": 0.01900793686400002, + "loss": 0.8238, + "num_input_tokens_seen": 36832040, + "step": 63450 + }, + { + "epoch": 9.451146857313077, + "grad_norm": 0.03759765625, + "learning_rate": 0.01900605806224536, + "loss": 0.8262, + "num_input_tokens_seen": 36834952, + "step": 63455 + }, + { + "epoch": 9.451891569854036, + "grad_norm": 0.035888671875, + "learning_rate": 0.019004179192812415, + "loss": 0.8066, + "num_input_tokens_seen": 36837832, + "step": 63460 + }, + { + "epoch": 9.452636282394996, + "grad_norm": 0.042724609375, + "learning_rate": 0.019002300255732938, + "loss": 0.7961, + "num_input_tokens_seen": 36840904, + "step": 63465 + }, + { + "epoch": 9.453380994935955, + "grad_norm": 0.041748046875, + "learning_rate": 0.019000421251038666, + "loss": 0.7878, + "num_input_tokens_seen": 36843688, + "step": 63470 + }, + { + "epoch": 9.454125707476914, + "grad_norm": 0.036865234375, + "learning_rate": 0.018998542178761346, + "loss": 0.8049, + "num_input_tokens_seen": 36846440, + "step": 63475 + }, + { + "epoch": 9.454870420017873, + "grad_norm": 0.0281982421875, + "learning_rate": 0.01899666303893271, + "loss": 0.819, + "num_input_tokens_seen": 36849416, + "step": 63480 + }, + { + "epoch": 9.455615132558833, + "grad_norm": 0.03125, + "learning_rate": 0.01899478383158452, + "loss": 0.8117, + "num_input_tokens_seen": 36852456, + "step": 63485 + }, + { + "epoch": 9.456359845099792, + "grad_norm": 0.022705078125, + "learning_rate": 0.018992904556748517, + "loss": 0.8061, + "num_input_tokens_seen": 36855368, + "step": 63490 + }, + { + "epoch": 9.45710455764075, + "grad_norm": 0.0634765625, + "learning_rate": 0.018991025214456447, + "loss": 0.8045, + "num_input_tokens_seen": 36858088, + "step": 63495 + }, + { + "epoch": 9.45784927018171, + "grad_norm": 0.0225830078125, + "learning_rate": 0.01898914580474007, + "loss": 0.7927, + "num_input_tokens_seen": 36860808, + "step": 63500 + }, + { + "epoch": 9.458593982722668, + "grad_norm": 0.02685546875, + "learning_rate": 0.018987266327631123, + "loss": 0.7944, + "num_input_tokens_seen": 36863624, + "step": 63505 + }, + { + "epoch": 9.459338695263629, + "grad_norm": 0.0361328125, + "learning_rate": 0.018985386783161366, + "loss": 0.7962, + "num_input_tokens_seen": 36866248, + "step": 63510 + }, + { + "epoch": 9.460083407804587, + "grad_norm": 0.0257568359375, + "learning_rate": 0.018983507171362546, + "loss": 0.8012, + "num_input_tokens_seen": 36868936, + "step": 63515 + }, + { + "epoch": 9.460828120345546, + "grad_norm": 0.035400390625, + "learning_rate": 0.018981627492266423, + "loss": 0.7975, + "num_input_tokens_seen": 36871784, + "step": 63520 + }, + { + "epoch": 9.461572832886505, + "grad_norm": 0.02783203125, + "learning_rate": 0.018979747745904753, + "loss": 0.8131, + "num_input_tokens_seen": 36874984, + "step": 63525 + }, + { + "epoch": 9.462317545427466, + "grad_norm": 0.0303955078125, + "learning_rate": 0.018977867932309284, + "loss": 0.7894, + "num_input_tokens_seen": 36877736, + "step": 63530 + }, + { + "epoch": 9.463062257968424, + "grad_norm": 0.04296875, + "learning_rate": 0.018975988051511787, + "loss": 0.8109, + "num_input_tokens_seen": 36880360, + "step": 63535 + }, + { + "epoch": 9.463806970509383, + "grad_norm": 0.031982421875, + "learning_rate": 0.018974108103544007, + "loss": 0.8047, + "num_input_tokens_seen": 36883208, + "step": 63540 + }, + { + "epoch": 9.464551683050342, + "grad_norm": 0.04345703125, + "learning_rate": 0.018972228088437713, + "loss": 0.787, + "num_input_tokens_seen": 36886184, + "step": 63545 + }, + { + "epoch": 9.465296395591302, + "grad_norm": 0.03466796875, + "learning_rate": 0.018970348006224658, + "loss": 0.8085, + "num_input_tokens_seen": 36889160, + "step": 63550 + }, + { + "epoch": 9.466041108132261, + "grad_norm": 0.03564453125, + "learning_rate": 0.018968467856936615, + "loss": 0.8125, + "num_input_tokens_seen": 36892040, + "step": 63555 + }, + { + "epoch": 9.46678582067322, + "grad_norm": 0.0203857421875, + "learning_rate": 0.018966587640605336, + "loss": 0.8137, + "num_input_tokens_seen": 36894856, + "step": 63560 + }, + { + "epoch": 9.467530533214179, + "grad_norm": 0.01806640625, + "learning_rate": 0.01896470735726259, + "loss": 0.7997, + "num_input_tokens_seen": 36897768, + "step": 63565 + }, + { + "epoch": 9.46827524575514, + "grad_norm": 0.04443359375, + "learning_rate": 0.018962827006940144, + "loss": 0.7947, + "num_input_tokens_seen": 36900616, + "step": 63570 + }, + { + "epoch": 9.469019958296098, + "grad_norm": 0.04833984375, + "learning_rate": 0.018960946589669764, + "loss": 0.8133, + "num_input_tokens_seen": 36903592, + "step": 63575 + }, + { + "epoch": 9.469764670837057, + "grad_norm": 0.0380859375, + "learning_rate": 0.018959066105483216, + "loss": 0.8119, + "num_input_tokens_seen": 36906472, + "step": 63580 + }, + { + "epoch": 9.470509383378015, + "grad_norm": 0.02392578125, + "learning_rate": 0.01895718555441227, + "loss": 0.7932, + "num_input_tokens_seen": 36909896, + "step": 63585 + }, + { + "epoch": 9.471254095918976, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0189553049364887, + "loss": 0.7968, + "num_input_tokens_seen": 36912840, + "step": 63590 + }, + { + "epoch": 9.471998808459935, + "grad_norm": 0.04150390625, + "learning_rate": 0.018953424251744268, + "loss": 0.8125, + "num_input_tokens_seen": 36916040, + "step": 63595 + }, + { + "epoch": 9.472743521000893, + "grad_norm": 0.023193359375, + "learning_rate": 0.018951543500210753, + "loss": 0.7882, + "num_input_tokens_seen": 36918824, + "step": 63600 + }, + { + "epoch": 9.473488233541852, + "grad_norm": 0.032470703125, + "learning_rate": 0.01894966268191993, + "loss": 0.8044, + "num_input_tokens_seen": 36921864, + "step": 63605 + }, + { + "epoch": 9.474232946082813, + "grad_norm": 0.0294189453125, + "learning_rate": 0.018947781796903564, + "loss": 0.7806, + "num_input_tokens_seen": 36924456, + "step": 63610 + }, + { + "epoch": 9.474977658623772, + "grad_norm": 0.020751953125, + "learning_rate": 0.01894590084519344, + "loss": 0.784, + "num_input_tokens_seen": 36927368, + "step": 63615 + }, + { + "epoch": 9.47572237116473, + "grad_norm": 0.03759765625, + "learning_rate": 0.018944019826821333, + "loss": 0.8142, + "num_input_tokens_seen": 36930760, + "step": 63620 + }, + { + "epoch": 9.476467083705689, + "grad_norm": 0.0235595703125, + "learning_rate": 0.018942138741819017, + "loss": 0.8101, + "num_input_tokens_seen": 36933448, + "step": 63625 + }, + { + "epoch": 9.47721179624665, + "grad_norm": 0.03271484375, + "learning_rate": 0.018940257590218276, + "loss": 0.7943, + "num_input_tokens_seen": 36936232, + "step": 63630 + }, + { + "epoch": 9.477956508787608, + "grad_norm": 0.0281982421875, + "learning_rate": 0.018938376372050886, + "loss": 0.799, + "num_input_tokens_seen": 36939080, + "step": 63635 + }, + { + "epoch": 9.478701221328567, + "grad_norm": 0.050048828125, + "learning_rate": 0.018936495087348633, + "loss": 0.8113, + "num_input_tokens_seen": 36942184, + "step": 63640 + }, + { + "epoch": 9.479445933869526, + "grad_norm": 0.0233154296875, + "learning_rate": 0.01893461373614329, + "loss": 0.817, + "num_input_tokens_seen": 36945192, + "step": 63645 + }, + { + "epoch": 9.480190646410486, + "grad_norm": 0.03466796875, + "learning_rate": 0.018932732318466653, + "loss": 0.8108, + "num_input_tokens_seen": 36948264, + "step": 63650 + }, + { + "epoch": 9.480935358951445, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0189308508343505, + "loss": 0.8125, + "num_input_tokens_seen": 36951240, + "step": 63655 + }, + { + "epoch": 9.481680071492404, + "grad_norm": 0.038330078125, + "learning_rate": 0.018928969283826615, + "loss": 0.7913, + "num_input_tokens_seen": 36954152, + "step": 63660 + }, + { + "epoch": 9.482424784033363, + "grad_norm": 0.037841796875, + "learning_rate": 0.018927087666926783, + "loss": 0.8021, + "num_input_tokens_seen": 36957096, + "step": 63665 + }, + { + "epoch": 9.483169496574323, + "grad_norm": 0.041748046875, + "learning_rate": 0.018925205983682802, + "loss": 0.8108, + "num_input_tokens_seen": 36959976, + "step": 63670 + }, + { + "epoch": 9.483914209115282, + "grad_norm": 0.0625, + "learning_rate": 0.01892332423412646, + "loss": 0.8303, + "num_input_tokens_seen": 36962664, + "step": 63675 + }, + { + "epoch": 9.48465892165624, + "grad_norm": 0.0257568359375, + "learning_rate": 0.01892144241828954, + "loss": 0.8192, + "num_input_tokens_seen": 36965416, + "step": 63680 + }, + { + "epoch": 9.4854036341972, + "grad_norm": 0.0245361328125, + "learning_rate": 0.018919560536203826, + "loss": 0.7959, + "num_input_tokens_seen": 36968232, + "step": 63685 + }, + { + "epoch": 9.486148346738158, + "grad_norm": 0.0252685546875, + "learning_rate": 0.018917678587901127, + "loss": 0.8068, + "num_input_tokens_seen": 36970952, + "step": 63690 + }, + { + "epoch": 9.486893059279119, + "grad_norm": 0.028564453125, + "learning_rate": 0.018915796573413232, + "loss": 0.7919, + "num_input_tokens_seen": 36973768, + "step": 63695 + }, + { + "epoch": 9.487637771820078, + "grad_norm": 0.0238037109375, + "learning_rate": 0.018913914492771935, + "loss": 0.7779, + "num_input_tokens_seen": 36976712, + "step": 63700 + }, + { + "epoch": 9.488382484361036, + "grad_norm": 0.0380859375, + "learning_rate": 0.018912032346009026, + "loss": 0.8135, + "num_input_tokens_seen": 36979592, + "step": 63705 + }, + { + "epoch": 9.489127196901995, + "grad_norm": 0.0250244140625, + "learning_rate": 0.01891015013315631, + "loss": 0.7979, + "num_input_tokens_seen": 36982184, + "step": 63710 + }, + { + "epoch": 9.489871909442956, + "grad_norm": 0.027587890625, + "learning_rate": 0.01890826785424558, + "loss": 0.8054, + "num_input_tokens_seen": 36984840, + "step": 63715 + }, + { + "epoch": 9.490616621983914, + "grad_norm": 0.038330078125, + "learning_rate": 0.018906385509308635, + "loss": 0.8011, + "num_input_tokens_seen": 36987944, + "step": 63720 + }, + { + "epoch": 9.491361334524873, + "grad_norm": 0.031494140625, + "learning_rate": 0.018904503098377278, + "loss": 0.7925, + "num_input_tokens_seen": 36990728, + "step": 63725 + }, + { + "epoch": 9.492106047065832, + "grad_norm": 0.03515625, + "learning_rate": 0.018902620621483308, + "loss": 0.8078, + "num_input_tokens_seen": 36993480, + "step": 63730 + }, + { + "epoch": 9.492850759606792, + "grad_norm": 0.0341796875, + "learning_rate": 0.018900738078658535, + "loss": 0.8123, + "num_input_tokens_seen": 36996104, + "step": 63735 + }, + { + "epoch": 9.493595472147751, + "grad_norm": 0.019775390625, + "learning_rate": 0.01889885546993475, + "loss": 0.8062, + "num_input_tokens_seen": 36998856, + "step": 63740 + }, + { + "epoch": 9.49434018468871, + "grad_norm": 0.039306640625, + "learning_rate": 0.018896972795343768, + "loss": 0.8041, + "num_input_tokens_seen": 37001448, + "step": 63745 + }, + { + "epoch": 9.495084897229669, + "grad_norm": 0.0306396484375, + "learning_rate": 0.018895090054917386, + "loss": 0.8004, + "num_input_tokens_seen": 37004360, + "step": 63750 + }, + { + "epoch": 9.49582960977063, + "grad_norm": 0.031494140625, + "learning_rate": 0.018893207248687423, + "loss": 0.8096, + "num_input_tokens_seen": 37006984, + "step": 63755 + }, + { + "epoch": 9.496574322311588, + "grad_norm": 0.03955078125, + "learning_rate": 0.01889132437668568, + "loss": 0.7892, + "num_input_tokens_seen": 37009960, + "step": 63760 + }, + { + "epoch": 9.497319034852547, + "grad_norm": 0.027099609375, + "learning_rate": 0.018889441438943964, + "loss": 0.807, + "num_input_tokens_seen": 37013000, + "step": 63765 + }, + { + "epoch": 9.498063747393505, + "grad_norm": 0.0218505859375, + "learning_rate": 0.01888755843549409, + "loss": 0.7894, + "num_input_tokens_seen": 37015784, + "step": 63770 + }, + { + "epoch": 9.498808459934466, + "grad_norm": 0.03369140625, + "learning_rate": 0.018885675366367868, + "loss": 0.8195, + "num_input_tokens_seen": 37018344, + "step": 63775 + }, + { + "epoch": 9.499553172475425, + "grad_norm": 0.0233154296875, + "learning_rate": 0.018883792231597106, + "loss": 0.7783, + "num_input_tokens_seen": 37021416, + "step": 63780 + }, + { + "epoch": 9.500297885016384, + "grad_norm": 0.032470703125, + "learning_rate": 0.01888190903121362, + "loss": 0.7974, + "num_input_tokens_seen": 37024552, + "step": 63785 + }, + { + "epoch": 9.501042597557342, + "grad_norm": 0.0272216796875, + "learning_rate": 0.018880025765249234, + "loss": 0.8019, + "num_input_tokens_seen": 37027464, + "step": 63790 + }, + { + "epoch": 9.501787310098303, + "grad_norm": 0.03466796875, + "learning_rate": 0.018878142433735753, + "loss": 0.7913, + "num_input_tokens_seen": 37030312, + "step": 63795 + }, + { + "epoch": 9.502532022639262, + "grad_norm": 0.02490234375, + "learning_rate": 0.018876259036704996, + "loss": 0.7975, + "num_input_tokens_seen": 37033128, + "step": 63800 + }, + { + "epoch": 9.50327673518022, + "grad_norm": 0.0238037109375, + "learning_rate": 0.018874375574188786, + "loss": 0.794, + "num_input_tokens_seen": 37036040, + "step": 63805 + }, + { + "epoch": 9.504021447721179, + "grad_norm": 0.0167236328125, + "learning_rate": 0.018872492046218933, + "loss": 0.8022, + "num_input_tokens_seen": 37038952, + "step": 63810 + }, + { + "epoch": 9.50476616026214, + "grad_norm": 0.03369140625, + "learning_rate": 0.018870608452827262, + "loss": 0.7898, + "num_input_tokens_seen": 37041928, + "step": 63815 + }, + { + "epoch": 9.505510872803098, + "grad_norm": 0.031494140625, + "learning_rate": 0.018868724794045597, + "loss": 0.7914, + "num_input_tokens_seen": 37044680, + "step": 63820 + }, + { + "epoch": 9.506255585344057, + "grad_norm": 0.037109375, + "learning_rate": 0.01886684106990576, + "loss": 0.8005, + "num_input_tokens_seen": 37047784, + "step": 63825 + }, + { + "epoch": 9.507000297885016, + "grad_norm": 0.025390625, + "learning_rate": 0.018864957280439573, + "loss": 0.8032, + "num_input_tokens_seen": 37050664, + "step": 63830 + }, + { + "epoch": 9.507745010425975, + "grad_norm": 0.024658203125, + "learning_rate": 0.01886307342567886, + "loss": 0.7894, + "num_input_tokens_seen": 37053640, + "step": 63835 + }, + { + "epoch": 9.508489722966935, + "grad_norm": 0.0537109375, + "learning_rate": 0.018861189505655452, + "loss": 0.8149, + "num_input_tokens_seen": 37056648, + "step": 63840 + }, + { + "epoch": 9.509234435507894, + "grad_norm": 0.02490234375, + "learning_rate": 0.018859305520401166, + "loss": 0.827, + "num_input_tokens_seen": 37059560, + "step": 63845 + }, + { + "epoch": 9.509979148048853, + "grad_norm": 0.0238037109375, + "learning_rate": 0.018857421469947837, + "loss": 0.8016, + "num_input_tokens_seen": 37062728, + "step": 63850 + }, + { + "epoch": 9.510723860589813, + "grad_norm": 0.0262451171875, + "learning_rate": 0.018855537354327292, + "loss": 0.8101, + "num_input_tokens_seen": 37065672, + "step": 63855 + }, + { + "epoch": 9.511468573130772, + "grad_norm": 0.025390625, + "learning_rate": 0.018853653173571364, + "loss": 0.8072, + "num_input_tokens_seen": 37068488, + "step": 63860 + }, + { + "epoch": 9.51221328567173, + "grad_norm": 0.0234375, + "learning_rate": 0.018851768927711884, + "loss": 0.8175, + "num_input_tokens_seen": 37071240, + "step": 63865 + }, + { + "epoch": 9.51295799821269, + "grad_norm": 0.021484375, + "learning_rate": 0.01884988461678068, + "loss": 0.7892, + "num_input_tokens_seen": 37074472, + "step": 63870 + }, + { + "epoch": 9.513702710753648, + "grad_norm": 0.04052734375, + "learning_rate": 0.01884800024080959, + "loss": 0.7786, + "num_input_tokens_seen": 37077128, + "step": 63875 + }, + { + "epoch": 9.514447423294609, + "grad_norm": 0.04248046875, + "learning_rate": 0.018846115799830445, + "loss": 0.799, + "num_input_tokens_seen": 37080072, + "step": 63880 + }, + { + "epoch": 9.515192135835568, + "grad_norm": 0.03369140625, + "learning_rate": 0.01884423129387508, + "loss": 0.8072, + "num_input_tokens_seen": 37082856, + "step": 63885 + }, + { + "epoch": 9.515936848376526, + "grad_norm": 0.0255126953125, + "learning_rate": 0.01884234672297534, + "loss": 0.7868, + "num_input_tokens_seen": 37085992, + "step": 63890 + }, + { + "epoch": 9.516681560917485, + "grad_norm": 0.0252685546875, + "learning_rate": 0.018840462087163053, + "loss": 0.8159, + "num_input_tokens_seen": 37088968, + "step": 63895 + }, + { + "epoch": 9.517426273458446, + "grad_norm": 0.0269775390625, + "learning_rate": 0.018838577386470066, + "loss": 0.7895, + "num_input_tokens_seen": 37091976, + "step": 63900 + }, + { + "epoch": 9.518170985999404, + "grad_norm": 0.04443359375, + "learning_rate": 0.01883669262092821, + "loss": 0.8212, + "num_input_tokens_seen": 37095240, + "step": 63905 + }, + { + "epoch": 9.518915698540363, + "grad_norm": 0.031494140625, + "learning_rate": 0.018834807790569338, + "loss": 0.8292, + "num_input_tokens_seen": 37098376, + "step": 63910 + }, + { + "epoch": 9.519660411081322, + "grad_norm": 0.03857421875, + "learning_rate": 0.018832922895425284, + "loss": 0.8455, + "num_input_tokens_seen": 37101352, + "step": 63915 + }, + { + "epoch": 9.520405123622282, + "grad_norm": 0.029052734375, + "learning_rate": 0.01883103793552789, + "loss": 0.8092, + "num_input_tokens_seen": 37104104, + "step": 63920 + }, + { + "epoch": 9.521149836163241, + "grad_norm": 0.0303955078125, + "learning_rate": 0.018829152910909006, + "loss": 0.805, + "num_input_tokens_seen": 37106984, + "step": 63925 + }, + { + "epoch": 9.5218945487042, + "grad_norm": 0.0203857421875, + "learning_rate": 0.018827267821600473, + "loss": 0.8154, + "num_input_tokens_seen": 37110024, + "step": 63930 + }, + { + "epoch": 9.522639261245159, + "grad_norm": 0.0311279296875, + "learning_rate": 0.018825382667634144, + "loss": 0.7989, + "num_input_tokens_seen": 37112872, + "step": 63935 + }, + { + "epoch": 9.52338397378612, + "grad_norm": 0.0240478515625, + "learning_rate": 0.018823497449041864, + "loss": 0.8114, + "num_input_tokens_seen": 37115656, + "step": 63940 + }, + { + "epoch": 9.524128686327078, + "grad_norm": 0.02880859375, + "learning_rate": 0.018821612165855477, + "loss": 0.8035, + "num_input_tokens_seen": 37118568, + "step": 63945 + }, + { + "epoch": 9.524873398868037, + "grad_norm": 0.02099609375, + "learning_rate": 0.018819726818106834, + "loss": 0.8101, + "num_input_tokens_seen": 37121416, + "step": 63950 + }, + { + "epoch": 9.525618111408996, + "grad_norm": 0.034912109375, + "learning_rate": 0.018817841405827797, + "loss": 0.8223, + "num_input_tokens_seen": 37124488, + "step": 63955 + }, + { + "epoch": 9.526362823949956, + "grad_norm": 0.02734375, + "learning_rate": 0.018815955929050204, + "loss": 0.7969, + "num_input_tokens_seen": 37127400, + "step": 63960 + }, + { + "epoch": 9.527107536490915, + "grad_norm": 0.026123046875, + "learning_rate": 0.01881407038780591, + "loss": 0.8093, + "num_input_tokens_seen": 37130152, + "step": 63965 + }, + { + "epoch": 9.527852249031874, + "grad_norm": 0.053466796875, + "learning_rate": 0.018812184782126783, + "loss": 0.7845, + "num_input_tokens_seen": 37133096, + "step": 63970 + }, + { + "epoch": 9.528596961572832, + "grad_norm": 0.034423828125, + "learning_rate": 0.01881029911204466, + "loss": 0.8179, + "num_input_tokens_seen": 37136072, + "step": 63975 + }, + { + "epoch": 9.529341674113793, + "grad_norm": 0.029052734375, + "learning_rate": 0.01880841337759141, + "loss": 0.7788, + "num_input_tokens_seen": 37139240, + "step": 63980 + }, + { + "epoch": 9.530086386654752, + "grad_norm": 0.037841796875, + "learning_rate": 0.018806527578798887, + "loss": 0.7858, + "num_input_tokens_seen": 37142152, + "step": 63985 + }, + { + "epoch": 9.53083109919571, + "grad_norm": 0.0400390625, + "learning_rate": 0.018804641715698945, + "loss": 0.809, + "num_input_tokens_seen": 37145000, + "step": 63990 + }, + { + "epoch": 9.53157581173667, + "grad_norm": 0.029296875, + "learning_rate": 0.018802755788323456, + "loss": 0.7812, + "num_input_tokens_seen": 37148008, + "step": 63995 + }, + { + "epoch": 9.53232052427763, + "grad_norm": 0.0245361328125, + "learning_rate": 0.01880086979670427, + "loss": 0.822, + "num_input_tokens_seen": 37150664, + "step": 64000 + }, + { + "epoch": 9.533065236818588, + "grad_norm": 0.0216064453125, + "learning_rate": 0.01879898374087325, + "loss": 0.7929, + "num_input_tokens_seen": 37153768, + "step": 64005 + }, + { + "epoch": 9.533809949359547, + "grad_norm": 0.0174560546875, + "learning_rate": 0.01879709762086226, + "loss": 0.7854, + "num_input_tokens_seen": 37157000, + "step": 64010 + }, + { + "epoch": 9.534554661900506, + "grad_norm": 0.031494140625, + "learning_rate": 0.018795211436703166, + "loss": 0.7873, + "num_input_tokens_seen": 37160072, + "step": 64015 + }, + { + "epoch": 9.535299374441465, + "grad_norm": 0.0179443359375, + "learning_rate": 0.018793325188427832, + "loss": 0.8199, + "num_input_tokens_seen": 37162856, + "step": 64020 + }, + { + "epoch": 9.536044086982425, + "grad_norm": 0.033935546875, + "learning_rate": 0.01879143887606812, + "loss": 0.7688, + "num_input_tokens_seen": 37166056, + "step": 64025 + }, + { + "epoch": 9.536788799523384, + "grad_norm": 0.0279541015625, + "learning_rate": 0.01878955249965591, + "loss": 0.7716, + "num_input_tokens_seen": 37169128, + "step": 64030 + }, + { + "epoch": 9.537533512064343, + "grad_norm": 0.0244140625, + "learning_rate": 0.01878766605922306, + "loss": 0.7912, + "num_input_tokens_seen": 37172136, + "step": 64035 + }, + { + "epoch": 9.538278224605303, + "grad_norm": 0.02978515625, + "learning_rate": 0.01878577955480144, + "loss": 0.7916, + "num_input_tokens_seen": 37175272, + "step": 64040 + }, + { + "epoch": 9.539022937146262, + "grad_norm": 0.02099609375, + "learning_rate": 0.018783892986422922, + "loss": 0.8044, + "num_input_tokens_seen": 37178216, + "step": 64045 + }, + { + "epoch": 9.53976764968722, + "grad_norm": 0.0299072265625, + "learning_rate": 0.018782006354119377, + "loss": 0.7863, + "num_input_tokens_seen": 37181064, + "step": 64050 + }, + { + "epoch": 9.54051236222818, + "grad_norm": 0.044189453125, + "learning_rate": 0.01878011965792268, + "loss": 0.8003, + "num_input_tokens_seen": 37184104, + "step": 64055 + }, + { + "epoch": 9.541257074769138, + "grad_norm": 0.0322265625, + "learning_rate": 0.0187782328978647, + "loss": 0.8412, + "num_input_tokens_seen": 37186792, + "step": 64060 + }, + { + "epoch": 9.542001787310099, + "grad_norm": 0.044189453125, + "learning_rate": 0.01877634607397732, + "loss": 0.8121, + "num_input_tokens_seen": 37189704, + "step": 64065 + }, + { + "epoch": 9.542746499851058, + "grad_norm": 0.0263671875, + "learning_rate": 0.018774459186292405, + "loss": 0.8051, + "num_input_tokens_seen": 37192712, + "step": 64070 + }, + { + "epoch": 9.543491212392016, + "grad_norm": 0.031494140625, + "learning_rate": 0.01877257223484185, + "loss": 0.8014, + "num_input_tokens_seen": 37195656, + "step": 64075 + }, + { + "epoch": 9.544235924932975, + "grad_norm": 0.0269775390625, + "learning_rate": 0.018770685219657513, + "loss": 0.7714, + "num_input_tokens_seen": 37198376, + "step": 64080 + }, + { + "epoch": 9.544980637473936, + "grad_norm": 0.043701171875, + "learning_rate": 0.01876879814077128, + "loss": 0.7958, + "num_input_tokens_seen": 37201448, + "step": 64085 + }, + { + "epoch": 9.545725350014894, + "grad_norm": 0.0252685546875, + "learning_rate": 0.018766910998215033, + "loss": 0.7883, + "num_input_tokens_seen": 37204488, + "step": 64090 + }, + { + "epoch": 9.546470062555853, + "grad_norm": 0.02392578125, + "learning_rate": 0.018765023792020656, + "loss": 0.8056, + "num_input_tokens_seen": 37207624, + "step": 64095 + }, + { + "epoch": 9.547214775096812, + "grad_norm": 0.0255126953125, + "learning_rate": 0.018763136522220027, + "loss": 0.7871, + "num_input_tokens_seen": 37210664, + "step": 64100 + }, + { + "epoch": 9.547959487637772, + "grad_norm": 0.02880859375, + "learning_rate": 0.01876124918884503, + "loss": 0.8118, + "num_input_tokens_seen": 37213608, + "step": 64105 + }, + { + "epoch": 9.548704200178731, + "grad_norm": 0.031982421875, + "learning_rate": 0.018759361791927553, + "loss": 0.7905, + "num_input_tokens_seen": 37216584, + "step": 64110 + }, + { + "epoch": 9.54944891271969, + "grad_norm": 0.060791015625, + "learning_rate": 0.018757474331499484, + "loss": 0.7977, + "num_input_tokens_seen": 37219368, + "step": 64115 + }, + { + "epoch": 9.550193625260649, + "grad_norm": 0.0341796875, + "learning_rate": 0.018755586807592695, + "loss": 0.8285, + "num_input_tokens_seen": 37222280, + "step": 64120 + }, + { + "epoch": 9.55093833780161, + "grad_norm": 0.0260009765625, + "learning_rate": 0.01875369922023909, + "loss": 0.7824, + "num_input_tokens_seen": 37224968, + "step": 64125 + }, + { + "epoch": 9.551683050342568, + "grad_norm": 0.020263671875, + "learning_rate": 0.01875181156947055, + "loss": 0.8037, + "num_input_tokens_seen": 37227848, + "step": 64130 + }, + { + "epoch": 9.552427762883527, + "grad_norm": 0.034912109375, + "learning_rate": 0.01874992385531897, + "loss": 0.8171, + "num_input_tokens_seen": 37230568, + "step": 64135 + }, + { + "epoch": 9.553172475424486, + "grad_norm": 0.025390625, + "learning_rate": 0.018748036077816235, + "loss": 0.8047, + "num_input_tokens_seen": 37233512, + "step": 64140 + }, + { + "epoch": 9.553917187965446, + "grad_norm": 0.03125, + "learning_rate": 0.01874614823699424, + "loss": 0.7851, + "num_input_tokens_seen": 37236296, + "step": 64145 + }, + { + "epoch": 9.554661900506405, + "grad_norm": 0.03076171875, + "learning_rate": 0.018744260332884875, + "loss": 0.8113, + "num_input_tokens_seen": 37239336, + "step": 64150 + }, + { + "epoch": 9.555406613047364, + "grad_norm": 0.023193359375, + "learning_rate": 0.01874237236552004, + "loss": 0.8368, + "num_input_tokens_seen": 37242344, + "step": 64155 + }, + { + "epoch": 9.556151325588322, + "grad_norm": 0.027587890625, + "learning_rate": 0.01874048433493163, + "loss": 0.8119, + "num_input_tokens_seen": 37244904, + "step": 64160 + }, + { + "epoch": 9.556896038129283, + "grad_norm": 0.0400390625, + "learning_rate": 0.018738596241151536, + "loss": 0.7813, + "num_input_tokens_seen": 37247816, + "step": 64165 + }, + { + "epoch": 9.557640750670242, + "grad_norm": 0.0308837890625, + "learning_rate": 0.01873670808421166, + "loss": 0.8065, + "num_input_tokens_seen": 37250568, + "step": 64170 + }, + { + "epoch": 9.5583854632112, + "grad_norm": 0.0299072265625, + "learning_rate": 0.018734819864143895, + "loss": 0.7844, + "num_input_tokens_seen": 37253512, + "step": 64175 + }, + { + "epoch": 9.55913017575216, + "grad_norm": 0.024169921875, + "learning_rate": 0.018732931580980144, + "loss": 0.819, + "num_input_tokens_seen": 37256616, + "step": 64180 + }, + { + "epoch": 9.55987488829312, + "grad_norm": 0.029052734375, + "learning_rate": 0.018731043234752314, + "loss": 0.8021, + "num_input_tokens_seen": 37259752, + "step": 64185 + }, + { + "epoch": 9.560619600834078, + "grad_norm": 0.02978515625, + "learning_rate": 0.018729154825492294, + "loss": 0.8023, + "num_input_tokens_seen": 37262824, + "step": 64190 + }, + { + "epoch": 9.561364313375037, + "grad_norm": 0.0281982421875, + "learning_rate": 0.018727266353232, + "loss": 0.8079, + "num_input_tokens_seen": 37265544, + "step": 64195 + }, + { + "epoch": 9.562109025915996, + "grad_norm": 0.02880859375, + "learning_rate": 0.018725377818003326, + "loss": 0.8146, + "num_input_tokens_seen": 37268520, + "step": 64200 + }, + { + "epoch": 9.562853738456955, + "grad_norm": 0.0167236328125, + "learning_rate": 0.01872348921983818, + "loss": 0.7986, + "num_input_tokens_seen": 37271368, + "step": 64205 + }, + { + "epoch": 9.563598450997915, + "grad_norm": 0.026611328125, + "learning_rate": 0.018721600558768467, + "loss": 0.7772, + "num_input_tokens_seen": 37274216, + "step": 64210 + }, + { + "epoch": 9.564343163538874, + "grad_norm": 0.0201416015625, + "learning_rate": 0.018719711834826094, + "loss": 0.8076, + "num_input_tokens_seen": 37277128, + "step": 64215 + }, + { + "epoch": 9.565087876079833, + "grad_norm": 0.0235595703125, + "learning_rate": 0.018717823048042974, + "loss": 0.8075, + "num_input_tokens_seen": 37279912, + "step": 64220 + }, + { + "epoch": 9.565832588620792, + "grad_norm": 0.04248046875, + "learning_rate": 0.018715934198451006, + "loss": 0.7941, + "num_input_tokens_seen": 37282632, + "step": 64225 + }, + { + "epoch": 9.566577301161752, + "grad_norm": 0.0311279296875, + "learning_rate": 0.018714045286082115, + "loss": 0.7915, + "num_input_tokens_seen": 37285416, + "step": 64230 + }, + { + "epoch": 9.56732201370271, + "grad_norm": 0.031982421875, + "learning_rate": 0.0187121563109682, + "loss": 0.789, + "num_input_tokens_seen": 37288200, + "step": 64235 + }, + { + "epoch": 9.56806672624367, + "grad_norm": 0.0279541015625, + "learning_rate": 0.018710267273141176, + "loss": 0.795, + "num_input_tokens_seen": 37291048, + "step": 64240 + }, + { + "epoch": 9.568811438784628, + "grad_norm": 0.0264892578125, + "learning_rate": 0.018708378172632958, + "loss": 0.7997, + "num_input_tokens_seen": 37294152, + "step": 64245 + }, + { + "epoch": 9.569556151325589, + "grad_norm": 0.0203857421875, + "learning_rate": 0.01870648900947546, + "loss": 0.8082, + "num_input_tokens_seen": 37297320, + "step": 64250 + }, + { + "epoch": 9.570300863866548, + "grad_norm": 0.02392578125, + "learning_rate": 0.018704599783700593, + "loss": 0.7856, + "num_input_tokens_seen": 37300424, + "step": 64255 + }, + { + "epoch": 9.571045576407506, + "grad_norm": 0.01904296875, + "learning_rate": 0.01870271049534028, + "loss": 0.7998, + "num_input_tokens_seen": 37303144, + "step": 64260 + }, + { + "epoch": 9.571790288948465, + "grad_norm": 0.0252685546875, + "learning_rate": 0.01870082114442644, + "loss": 0.7815, + "num_input_tokens_seen": 37305928, + "step": 64265 + }, + { + "epoch": 9.572535001489426, + "grad_norm": 0.0283203125, + "learning_rate": 0.018698931730990985, + "loss": 0.7885, + "num_input_tokens_seen": 37308904, + "step": 64270 + }, + { + "epoch": 9.573279714030384, + "grad_norm": 0.032958984375, + "learning_rate": 0.01869704225506584, + "loss": 0.7821, + "num_input_tokens_seen": 37311688, + "step": 64275 + }, + { + "epoch": 9.574024426571343, + "grad_norm": 0.02392578125, + "learning_rate": 0.018695152716682924, + "loss": 0.7922, + "num_input_tokens_seen": 37314984, + "step": 64280 + }, + { + "epoch": 9.574769139112302, + "grad_norm": 0.0228271484375, + "learning_rate": 0.018693263115874156, + "loss": 0.8043, + "num_input_tokens_seen": 37317864, + "step": 64285 + }, + { + "epoch": 9.575513851653263, + "grad_norm": 0.028076171875, + "learning_rate": 0.018691373452671464, + "loss": 0.8257, + "num_input_tokens_seen": 37321096, + "step": 64290 + }, + { + "epoch": 9.576258564194221, + "grad_norm": 0.03662109375, + "learning_rate": 0.018689483727106763, + "loss": 0.82, + "num_input_tokens_seen": 37323784, + "step": 64295 + }, + { + "epoch": 9.57700327673518, + "grad_norm": 0.0208740234375, + "learning_rate": 0.018687593939211992, + "loss": 0.7789, + "num_input_tokens_seen": 37326632, + "step": 64300 + }, + { + "epoch": 9.577747989276139, + "grad_norm": 0.050048828125, + "learning_rate": 0.018685704089019067, + "loss": 0.8236, + "num_input_tokens_seen": 37329768, + "step": 64305 + }, + { + "epoch": 9.5784927018171, + "grad_norm": 0.02685546875, + "learning_rate": 0.018683814176559916, + "loss": 0.8048, + "num_input_tokens_seen": 37332488, + "step": 64310 + }, + { + "epoch": 9.579237414358058, + "grad_norm": 0.0291748046875, + "learning_rate": 0.018681924201866467, + "loss": 0.7919, + "num_input_tokens_seen": 37335304, + "step": 64315 + }, + { + "epoch": 9.579982126899017, + "grad_norm": 0.0233154296875, + "learning_rate": 0.018680034164970657, + "loss": 0.7808, + "num_input_tokens_seen": 37338312, + "step": 64320 + }, + { + "epoch": 9.580726839439976, + "grad_norm": 0.0250244140625, + "learning_rate": 0.018678144065904404, + "loss": 0.8008, + "num_input_tokens_seen": 37341256, + "step": 64325 + }, + { + "epoch": 9.581471551980936, + "grad_norm": 0.0308837890625, + "learning_rate": 0.018676253904699645, + "loss": 0.8056, + "num_input_tokens_seen": 37343816, + "step": 64330 + }, + { + "epoch": 9.582216264521895, + "grad_norm": 0.028076171875, + "learning_rate": 0.01867436368138832, + "loss": 0.7933, + "num_input_tokens_seen": 37346888, + "step": 64335 + }, + { + "epoch": 9.582960977062854, + "grad_norm": 0.043212890625, + "learning_rate": 0.018672473396002347, + "loss": 0.8127, + "num_input_tokens_seen": 37349704, + "step": 64340 + }, + { + "epoch": 9.583705689603812, + "grad_norm": 0.0245361328125, + "learning_rate": 0.01867058304857367, + "loss": 0.7907, + "num_input_tokens_seen": 37353000, + "step": 64345 + }, + { + "epoch": 9.584450402144771, + "grad_norm": 0.0198974609375, + "learning_rate": 0.018668692639134222, + "loss": 0.7895, + "num_input_tokens_seen": 37355912, + "step": 64350 + }, + { + "epoch": 9.585195114685732, + "grad_norm": 0.0252685546875, + "learning_rate": 0.018666802167715945, + "loss": 0.8086, + "num_input_tokens_seen": 37359016, + "step": 64355 + }, + { + "epoch": 9.58593982722669, + "grad_norm": 0.017578125, + "learning_rate": 0.01866491163435077, + "loss": 0.8214, + "num_input_tokens_seen": 37361640, + "step": 64360 + }, + { + "epoch": 9.58668453976765, + "grad_norm": 0.0252685546875, + "learning_rate": 0.018663021039070637, + "loss": 0.7995, + "num_input_tokens_seen": 37364840, + "step": 64365 + }, + { + "epoch": 9.58742925230861, + "grad_norm": 0.0181884765625, + "learning_rate": 0.01866113038190749, + "loss": 0.7741, + "num_input_tokens_seen": 37367656, + "step": 64370 + }, + { + "epoch": 9.588173964849569, + "grad_norm": 0.0284423828125, + "learning_rate": 0.018659239662893262, + "loss": 0.8259, + "num_input_tokens_seen": 37370824, + "step": 64375 + }, + { + "epoch": 9.588918677390527, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0186573488820599, + "loss": 0.8215, + "num_input_tokens_seen": 37373960, + "step": 64380 + }, + { + "epoch": 9.589663389931486, + "grad_norm": 0.02783203125, + "learning_rate": 0.018655458039439347, + "loss": 0.8121, + "num_input_tokens_seen": 37376584, + "step": 64385 + }, + { + "epoch": 9.590408102472445, + "grad_norm": 0.028564453125, + "learning_rate": 0.018653567135063543, + "loss": 0.7943, + "num_input_tokens_seen": 37379304, + "step": 64390 + }, + { + "epoch": 9.591152815013405, + "grad_norm": 0.03564453125, + "learning_rate": 0.01865167616896444, + "loss": 0.798, + "num_input_tokens_seen": 37382280, + "step": 64395 + }, + { + "epoch": 9.591897527554364, + "grad_norm": 0.0291748046875, + "learning_rate": 0.01864978514117398, + "loss": 0.8132, + "num_input_tokens_seen": 37384904, + "step": 64400 + }, + { + "epoch": 9.592642240095323, + "grad_norm": 0.0390625, + "learning_rate": 0.018647894051724106, + "loss": 0.8266, + "num_input_tokens_seen": 37387816, + "step": 64405 + }, + { + "epoch": 9.593386952636282, + "grad_norm": 0.0252685546875, + "learning_rate": 0.018646002900646767, + "loss": 0.7936, + "num_input_tokens_seen": 37390696, + "step": 64410 + }, + { + "epoch": 9.594131665177242, + "grad_norm": 0.027587890625, + "learning_rate": 0.018644111687973915, + "loss": 0.8146, + "num_input_tokens_seen": 37393576, + "step": 64415 + }, + { + "epoch": 9.594876377718201, + "grad_norm": 0.017822265625, + "learning_rate": 0.018642220413737507, + "loss": 0.7933, + "num_input_tokens_seen": 37396360, + "step": 64420 + }, + { + "epoch": 9.59562109025916, + "grad_norm": 0.017333984375, + "learning_rate": 0.01864032907796948, + "loss": 0.797, + "num_input_tokens_seen": 37399240, + "step": 64425 + }, + { + "epoch": 9.596365802800118, + "grad_norm": 0.027587890625, + "learning_rate": 0.018638437680701798, + "loss": 0.7989, + "num_input_tokens_seen": 37402312, + "step": 64430 + }, + { + "epoch": 9.597110515341079, + "grad_norm": 0.0233154296875, + "learning_rate": 0.018636546221966407, + "loss": 0.7912, + "num_input_tokens_seen": 37405032, + "step": 64435 + }, + { + "epoch": 9.597855227882038, + "grad_norm": 0.023681640625, + "learning_rate": 0.01863465470179526, + "loss": 0.789, + "num_input_tokens_seen": 37407912, + "step": 64440 + }, + { + "epoch": 9.598599940422996, + "grad_norm": 0.0279541015625, + "learning_rate": 0.01863276312022032, + "loss": 0.8061, + "num_input_tokens_seen": 37410920, + "step": 64445 + }, + { + "epoch": 9.599344652963955, + "grad_norm": 0.030517578125, + "learning_rate": 0.01863087147727354, + "loss": 0.7769, + "num_input_tokens_seen": 37413704, + "step": 64450 + }, + { + "epoch": 9.600089365504916, + "grad_norm": 0.03466796875, + "learning_rate": 0.018628979772986876, + "loss": 0.7906, + "num_input_tokens_seen": 37416808, + "step": 64455 + }, + { + "epoch": 9.600834078045875, + "grad_norm": 0.0269775390625, + "learning_rate": 0.018627088007392285, + "loss": 0.7962, + "num_input_tokens_seen": 37419656, + "step": 64460 + }, + { + "epoch": 9.601578790586833, + "grad_norm": 0.04541015625, + "learning_rate": 0.01862519618052173, + "loss": 0.7875, + "num_input_tokens_seen": 37422728, + "step": 64465 + }, + { + "epoch": 9.602323503127792, + "grad_norm": 0.01556396484375, + "learning_rate": 0.018623304292407168, + "loss": 0.7942, + "num_input_tokens_seen": 37425384, + "step": 64470 + }, + { + "epoch": 9.603068215668753, + "grad_norm": 0.02587890625, + "learning_rate": 0.018621412343080564, + "loss": 0.7774, + "num_input_tokens_seen": 37428232, + "step": 64475 + }, + { + "epoch": 9.603812928209711, + "grad_norm": 0.021484375, + "learning_rate": 0.018619520332573882, + "loss": 0.7993, + "num_input_tokens_seen": 37430952, + "step": 64480 + }, + { + "epoch": 9.60455764075067, + "grad_norm": 0.0302734375, + "learning_rate": 0.01861762826091908, + "loss": 0.8082, + "num_input_tokens_seen": 37433768, + "step": 64485 + }, + { + "epoch": 9.605302353291629, + "grad_norm": 0.0208740234375, + "learning_rate": 0.018615736128148125, + "loss": 0.8159, + "num_input_tokens_seen": 37436520, + "step": 64490 + }, + { + "epoch": 9.60604706583259, + "grad_norm": 0.0238037109375, + "learning_rate": 0.018613843934292985, + "loss": 0.8057, + "num_input_tokens_seen": 37439208, + "step": 64495 + }, + { + "epoch": 9.606791778373548, + "grad_norm": 0.0216064453125, + "learning_rate": 0.018611951679385626, + "loss": 0.8055, + "num_input_tokens_seen": 37441960, + "step": 64500 + }, + { + "epoch": 9.607536490914507, + "grad_norm": 0.0400390625, + "learning_rate": 0.01861005936345801, + "loss": 0.7984, + "num_input_tokens_seen": 37445096, + "step": 64505 + }, + { + "epoch": 9.608281203455466, + "grad_norm": 0.0277099609375, + "learning_rate": 0.018608166986542107, + "loss": 0.7952, + "num_input_tokens_seen": 37448008, + "step": 64510 + }, + { + "epoch": 9.609025915996426, + "grad_norm": 0.03515625, + "learning_rate": 0.018606274548669893, + "loss": 0.763, + "num_input_tokens_seen": 37451016, + "step": 64515 + }, + { + "epoch": 9.609770628537385, + "grad_norm": 0.0245361328125, + "learning_rate": 0.018604382049873344, + "loss": 0.794, + "num_input_tokens_seen": 37453864, + "step": 64520 + }, + { + "epoch": 9.610515341078344, + "grad_norm": 0.021240234375, + "learning_rate": 0.018602489490184414, + "loss": 0.7974, + "num_input_tokens_seen": 37456456, + "step": 64525 + }, + { + "epoch": 9.611260053619302, + "grad_norm": 0.0250244140625, + "learning_rate": 0.01860059686963509, + "loss": 0.7625, + "num_input_tokens_seen": 37459208, + "step": 64530 + }, + { + "epoch": 9.612004766160261, + "grad_norm": 0.020751953125, + "learning_rate": 0.01859870418825734, + "loss": 0.785, + "num_input_tokens_seen": 37462024, + "step": 64535 + }, + { + "epoch": 9.612749478701222, + "grad_norm": 0.0228271484375, + "learning_rate": 0.01859681144608314, + "loss": 0.8078, + "num_input_tokens_seen": 37464936, + "step": 64540 + }, + { + "epoch": 9.61349419124218, + "grad_norm": 0.0167236328125, + "learning_rate": 0.018594918643144463, + "loss": 0.7941, + "num_input_tokens_seen": 37467816, + "step": 64545 + }, + { + "epoch": 9.61423890378314, + "grad_norm": 0.021240234375, + "learning_rate": 0.018593025779473294, + "loss": 0.7817, + "num_input_tokens_seen": 37470792, + "step": 64550 + }, + { + "epoch": 9.6149836163241, + "grad_norm": 0.038818359375, + "learning_rate": 0.018591132855101604, + "loss": 0.7918, + "num_input_tokens_seen": 37473704, + "step": 64555 + }, + { + "epoch": 9.615728328865059, + "grad_norm": 0.0341796875, + "learning_rate": 0.018589239870061378, + "loss": 0.8524, + "num_input_tokens_seen": 37476392, + "step": 64560 + }, + { + "epoch": 9.616473041406017, + "grad_norm": 0.03076171875, + "learning_rate": 0.018587346824384593, + "loss": 0.7956, + "num_input_tokens_seen": 37479496, + "step": 64565 + }, + { + "epoch": 9.617217753946976, + "grad_norm": 0.03662109375, + "learning_rate": 0.01858545371810322, + "loss": 0.7987, + "num_input_tokens_seen": 37482248, + "step": 64570 + }, + { + "epoch": 9.617962466487935, + "grad_norm": 0.02978515625, + "learning_rate": 0.01858356055124926, + "loss": 0.8102, + "num_input_tokens_seen": 37485064, + "step": 64575 + }, + { + "epoch": 9.618707179028895, + "grad_norm": 0.03076171875, + "learning_rate": 0.01858166732385468, + "loss": 0.8583, + "num_input_tokens_seen": 37488136, + "step": 64580 + }, + { + "epoch": 9.619451891569854, + "grad_norm": 0.0380859375, + "learning_rate": 0.01857977403595147, + "loss": 0.8012, + "num_input_tokens_seen": 37491144, + "step": 64585 + }, + { + "epoch": 9.620196604110813, + "grad_norm": 0.021728515625, + "learning_rate": 0.018577880687571623, + "loss": 0.7928, + "num_input_tokens_seen": 37493864, + "step": 64590 + }, + { + "epoch": 9.620941316651772, + "grad_norm": 0.0224609375, + "learning_rate": 0.018575987278747118, + "loss": 0.7874, + "num_input_tokens_seen": 37496648, + "step": 64595 + }, + { + "epoch": 9.621686029192732, + "grad_norm": 0.037109375, + "learning_rate": 0.01857409380950994, + "loss": 0.8062, + "num_input_tokens_seen": 37499528, + "step": 64600 + }, + { + "epoch": 9.622430741733691, + "grad_norm": 0.0224609375, + "learning_rate": 0.01857220027989208, + "loss": 0.7785, + "num_input_tokens_seen": 37502152, + "step": 64605 + }, + { + "epoch": 9.62317545427465, + "grad_norm": 0.0341796875, + "learning_rate": 0.018570306689925525, + "loss": 0.8107, + "num_input_tokens_seen": 37504968, + "step": 64610 + }, + { + "epoch": 9.623920166815608, + "grad_norm": 0.0177001953125, + "learning_rate": 0.018568413039642268, + "loss": 0.7989, + "num_input_tokens_seen": 37507848, + "step": 64615 + }, + { + "epoch": 9.624664879356569, + "grad_norm": 0.035888671875, + "learning_rate": 0.018566519329074296, + "loss": 0.801, + "num_input_tokens_seen": 37510888, + "step": 64620 + }, + { + "epoch": 9.625409591897528, + "grad_norm": 0.025146484375, + "learning_rate": 0.01856462555825361, + "loss": 0.7977, + "num_input_tokens_seen": 37513768, + "step": 64625 + }, + { + "epoch": 9.626154304438487, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0185627317272122, + "loss": 0.7898, + "num_input_tokens_seen": 37516552, + "step": 64630 + }, + { + "epoch": 9.626899016979445, + "grad_norm": 0.022705078125, + "learning_rate": 0.018560837835982055, + "loss": 0.8058, + "num_input_tokens_seen": 37519528, + "step": 64635 + }, + { + "epoch": 9.627643729520406, + "grad_norm": 0.024658203125, + "learning_rate": 0.018558943884595173, + "loss": 0.7997, + "num_input_tokens_seen": 37522440, + "step": 64640 + }, + { + "epoch": 9.628388442061365, + "grad_norm": 0.0230712890625, + "learning_rate": 0.01855704987308355, + "loss": 0.7896, + "num_input_tokens_seen": 37525128, + "step": 64645 + }, + { + "epoch": 9.629133154602323, + "grad_norm": 0.023193359375, + "learning_rate": 0.018555155801479185, + "loss": 0.8105, + "num_input_tokens_seen": 37527848, + "step": 64650 + }, + { + "epoch": 9.629877867143282, + "grad_norm": 0.0260009765625, + "learning_rate": 0.018553261669814077, + "loss": 0.7924, + "num_input_tokens_seen": 37531112, + "step": 64655 + }, + { + "epoch": 9.630622579684243, + "grad_norm": 0.0179443359375, + "learning_rate": 0.018551367478120223, + "loss": 0.8044, + "num_input_tokens_seen": 37533768, + "step": 64660 + }, + { + "epoch": 9.631367292225201, + "grad_norm": 0.028076171875, + "learning_rate": 0.018549473226429632, + "loss": 0.8032, + "num_input_tokens_seen": 37536584, + "step": 64665 + }, + { + "epoch": 9.63211200476616, + "grad_norm": 0.034423828125, + "learning_rate": 0.018547578914774293, + "loss": 0.8175, + "num_input_tokens_seen": 37539592, + "step": 64670 + }, + { + "epoch": 9.632856717307119, + "grad_norm": 0.0181884765625, + "learning_rate": 0.018545684543186215, + "loss": 0.8304, + "num_input_tokens_seen": 37542440, + "step": 64675 + }, + { + "epoch": 9.63360142984808, + "grad_norm": 0.03271484375, + "learning_rate": 0.018543790111697402, + "loss": 0.8017, + "num_input_tokens_seen": 37545000, + "step": 64680 + }, + { + "epoch": 9.634346142389038, + "grad_norm": 0.026123046875, + "learning_rate": 0.01854189562033985, + "loss": 0.8147, + "num_input_tokens_seen": 37547752, + "step": 64685 + }, + { + "epoch": 9.635090854929997, + "grad_norm": 0.037841796875, + "learning_rate": 0.01854000106914558, + "loss": 0.8133, + "num_input_tokens_seen": 37550504, + "step": 64690 + }, + { + "epoch": 9.635835567470956, + "grad_norm": 0.02490234375, + "learning_rate": 0.018538106458146582, + "loss": 0.7991, + "num_input_tokens_seen": 37553576, + "step": 64695 + }, + { + "epoch": 9.636580280011916, + "grad_norm": 0.03759765625, + "learning_rate": 0.018536211787374877, + "loss": 0.8098, + "num_input_tokens_seen": 37556584, + "step": 64700 + }, + { + "epoch": 9.637324992552875, + "grad_norm": 0.029541015625, + "learning_rate": 0.018534317056862464, + "loss": 0.8054, + "num_input_tokens_seen": 37559496, + "step": 64705 + }, + { + "epoch": 9.638069705093834, + "grad_norm": 0.031494140625, + "learning_rate": 0.018532422266641358, + "loss": 0.8247, + "num_input_tokens_seen": 37562696, + "step": 64710 + }, + { + "epoch": 9.638814417634793, + "grad_norm": 0.0301513671875, + "learning_rate": 0.018530527416743568, + "loss": 0.8244, + "num_input_tokens_seen": 37565640, + "step": 64715 + }, + { + "epoch": 9.639559130175751, + "grad_norm": 0.031494140625, + "learning_rate": 0.018528632507201107, + "loss": 0.7845, + "num_input_tokens_seen": 37568552, + "step": 64720 + }, + { + "epoch": 9.640303842716712, + "grad_norm": 0.023193359375, + "learning_rate": 0.018526737538045984, + "loss": 0.795, + "num_input_tokens_seen": 37571688, + "step": 64725 + }, + { + "epoch": 9.64104855525767, + "grad_norm": 0.026123046875, + "learning_rate": 0.01852484250931022, + "loss": 0.7965, + "num_input_tokens_seen": 37574408, + "step": 64730 + }, + { + "epoch": 9.64179326779863, + "grad_norm": 0.0262451171875, + "learning_rate": 0.018522947421025816, + "loss": 0.7769, + "num_input_tokens_seen": 37577384, + "step": 64735 + }, + { + "epoch": 9.642537980339588, + "grad_norm": 0.021728515625, + "learning_rate": 0.0185210522732248, + "loss": 0.8083, + "num_input_tokens_seen": 37580360, + "step": 64740 + }, + { + "epoch": 9.643282692880549, + "grad_norm": 0.034912109375, + "learning_rate": 0.018519157065939183, + "loss": 0.8078, + "num_input_tokens_seen": 37583080, + "step": 64745 + }, + { + "epoch": 9.644027405421507, + "grad_norm": 0.022705078125, + "learning_rate": 0.018517261799200987, + "loss": 0.7793, + "num_input_tokens_seen": 37585896, + "step": 64750 + }, + { + "epoch": 9.644772117962466, + "grad_norm": 0.0546875, + "learning_rate": 0.018515366473042225, + "loss": 0.8101, + "num_input_tokens_seen": 37588712, + "step": 64755 + }, + { + "epoch": 9.645516830503425, + "grad_norm": 0.0517578125, + "learning_rate": 0.01851347108749492, + "loss": 0.8058, + "num_input_tokens_seen": 37591752, + "step": 64760 + }, + { + "epoch": 9.646261543044385, + "grad_norm": 0.02392578125, + "learning_rate": 0.018511575642591097, + "loss": 0.7899, + "num_input_tokens_seen": 37594440, + "step": 64765 + }, + { + "epoch": 9.647006255585344, + "grad_norm": 0.026611328125, + "learning_rate": 0.018509680138362766, + "loss": 0.7999, + "num_input_tokens_seen": 37597576, + "step": 64770 + }, + { + "epoch": 9.647750968126303, + "grad_norm": 0.02294921875, + "learning_rate": 0.018507784574841958, + "loss": 0.8126, + "num_input_tokens_seen": 37600392, + "step": 64775 + }, + { + "epoch": 9.648495680667262, + "grad_norm": 0.0234375, + "learning_rate": 0.018505888952060692, + "loss": 0.7931, + "num_input_tokens_seen": 37603528, + "step": 64780 + }, + { + "epoch": 9.649240393208222, + "grad_norm": 0.02734375, + "learning_rate": 0.018503993270051, + "loss": 0.8163, + "num_input_tokens_seen": 37606280, + "step": 64785 + }, + { + "epoch": 9.649985105749181, + "grad_norm": 0.02880859375, + "learning_rate": 0.018502097528844897, + "loss": 0.792, + "num_input_tokens_seen": 37608904, + "step": 64790 + }, + { + "epoch": 9.65072981829014, + "grad_norm": 0.0216064453125, + "learning_rate": 0.01850020172847442, + "loss": 0.7933, + "num_input_tokens_seen": 37611944, + "step": 64795 + }, + { + "epoch": 9.651474530831099, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0184983058689716, + "loss": 0.7879, + "num_input_tokens_seen": 37614792, + "step": 64800 + }, + { + "epoch": 9.652219243372059, + "grad_norm": 0.0263671875, + "learning_rate": 0.018496409950368446, + "loss": 0.7982, + "num_input_tokens_seen": 37617512, + "step": 64805 + }, + { + "epoch": 9.652963955913018, + "grad_norm": 0.0299072265625, + "learning_rate": 0.018494513972697003, + "loss": 0.8043, + "num_input_tokens_seen": 37620200, + "step": 64810 + }, + { + "epoch": 9.653708668453977, + "grad_norm": 0.023193359375, + "learning_rate": 0.0184926179359893, + "loss": 0.799, + "num_input_tokens_seen": 37622984, + "step": 64815 + }, + { + "epoch": 9.654453380994935, + "grad_norm": 0.0146484375, + "learning_rate": 0.018490721840277363, + "loss": 0.8004, + "num_input_tokens_seen": 37625608, + "step": 64820 + }, + { + "epoch": 9.655198093535896, + "grad_norm": 0.031982421875, + "learning_rate": 0.01848882568559323, + "loss": 0.8211, + "num_input_tokens_seen": 37628328, + "step": 64825 + }, + { + "epoch": 9.655942806076855, + "grad_norm": 0.03369140625, + "learning_rate": 0.018486929471968932, + "loss": 0.7966, + "num_input_tokens_seen": 37631176, + "step": 64830 + }, + { + "epoch": 9.656687518617813, + "grad_norm": 0.0255126953125, + "learning_rate": 0.018485033199436507, + "loss": 0.7943, + "num_input_tokens_seen": 37633864, + "step": 64835 + }, + { + "epoch": 9.657432231158772, + "grad_norm": 0.022705078125, + "learning_rate": 0.01848313686802799, + "loss": 0.8072, + "num_input_tokens_seen": 37636584, + "step": 64840 + }, + { + "epoch": 9.658176943699733, + "grad_norm": 0.046875, + "learning_rate": 0.018481240477775412, + "loss": 0.7795, + "num_input_tokens_seen": 37640840, + "step": 64845 + }, + { + "epoch": 9.658921656240691, + "grad_norm": 0.023681640625, + "learning_rate": 0.018479344028710813, + "loss": 0.8129, + "num_input_tokens_seen": 37643848, + "step": 64850 + }, + { + "epoch": 9.65966636878165, + "grad_norm": 0.0322265625, + "learning_rate": 0.018477447520866233, + "loss": 0.7758, + "num_input_tokens_seen": 37646600, + "step": 64855 + }, + { + "epoch": 9.660411081322609, + "grad_norm": 0.034423828125, + "learning_rate": 0.018475550954273712, + "loss": 0.7952, + "num_input_tokens_seen": 37649256, + "step": 64860 + }, + { + "epoch": 9.66115579386357, + "grad_norm": 0.0390625, + "learning_rate": 0.018473654328965295, + "loss": 0.8289, + "num_input_tokens_seen": 37652264, + "step": 64865 + }, + { + "epoch": 9.661900506404528, + "grad_norm": 0.033935546875, + "learning_rate": 0.018471757644973018, + "loss": 0.8143, + "num_input_tokens_seen": 37655400, + "step": 64870 + }, + { + "epoch": 9.662645218945487, + "grad_norm": 0.0247802734375, + "learning_rate": 0.018469860902328914, + "loss": 0.8, + "num_input_tokens_seen": 37658216, + "step": 64875 + }, + { + "epoch": 9.663389931486446, + "grad_norm": 0.0220947265625, + "learning_rate": 0.01846796410106505, + "loss": 0.8107, + "num_input_tokens_seen": 37661352, + "step": 64880 + }, + { + "epoch": 9.664134644027406, + "grad_norm": 0.03271484375, + "learning_rate": 0.018466067241213448, + "loss": 0.7968, + "num_input_tokens_seen": 37664200, + "step": 64885 + }, + { + "epoch": 9.664879356568365, + "grad_norm": 0.03515625, + "learning_rate": 0.018464170322806166, + "loss": 0.8084, + "num_input_tokens_seen": 37667176, + "step": 64890 + }, + { + "epoch": 9.665624069109324, + "grad_norm": 0.0296630859375, + "learning_rate": 0.01846227334587525, + "loss": 0.7883, + "num_input_tokens_seen": 37669800, + "step": 64895 + }, + { + "epoch": 9.666368781650283, + "grad_norm": 0.026611328125, + "learning_rate": 0.01846037631045274, + "loss": 0.8088, + "num_input_tokens_seen": 37673128, + "step": 64900 + }, + { + "epoch": 9.667113494191241, + "grad_norm": 0.04931640625, + "learning_rate": 0.018458479216570692, + "loss": 0.7959, + "num_input_tokens_seen": 37676136, + "step": 64905 + }, + { + "epoch": 9.667858206732202, + "grad_norm": 0.03857421875, + "learning_rate": 0.018456582064261148, + "loss": 0.8126, + "num_input_tokens_seen": 37679016, + "step": 64910 + }, + { + "epoch": 9.66860291927316, + "grad_norm": 0.0306396484375, + "learning_rate": 0.018454684853556166, + "loss": 0.806, + "num_input_tokens_seen": 37681992, + "step": 64915 + }, + { + "epoch": 9.66934763181412, + "grad_norm": 0.027587890625, + "learning_rate": 0.018452787584487793, + "loss": 0.7762, + "num_input_tokens_seen": 37685192, + "step": 64920 + }, + { + "epoch": 9.670092344355078, + "grad_norm": 0.033203125, + "learning_rate": 0.018450890257088088, + "loss": 0.8142, + "num_input_tokens_seen": 37688040, + "step": 64925 + }, + { + "epoch": 9.670837056896039, + "grad_norm": 0.03369140625, + "learning_rate": 0.0184489928713891, + "loss": 0.7708, + "num_input_tokens_seen": 37691016, + "step": 64930 + }, + { + "epoch": 9.671581769436997, + "grad_norm": 0.0299072265625, + "learning_rate": 0.018447095427422875, + "loss": 0.8145, + "num_input_tokens_seen": 37693736, + "step": 64935 + }, + { + "epoch": 9.672326481977956, + "grad_norm": 0.02197265625, + "learning_rate": 0.018445197925221483, + "loss": 0.8049, + "num_input_tokens_seen": 37696808, + "step": 64940 + }, + { + "epoch": 9.673071194518915, + "grad_norm": 0.024658203125, + "learning_rate": 0.01844330036481697, + "loss": 0.8069, + "num_input_tokens_seen": 37699944, + "step": 64945 + }, + { + "epoch": 9.673815907059875, + "grad_norm": 0.0272216796875, + "learning_rate": 0.018441402746241398, + "loss": 0.7988, + "num_input_tokens_seen": 37702760, + "step": 64950 + }, + { + "epoch": 9.674560619600834, + "grad_norm": 0.0361328125, + "learning_rate": 0.01843950506952682, + "loss": 0.8013, + "num_input_tokens_seen": 37705640, + "step": 64955 + }, + { + "epoch": 9.675305332141793, + "grad_norm": 0.036865234375, + "learning_rate": 0.01843760733470531, + "loss": 0.8033, + "num_input_tokens_seen": 37708648, + "step": 64960 + }, + { + "epoch": 9.676050044682752, + "grad_norm": 0.0322265625, + "learning_rate": 0.018435709541808915, + "loss": 0.8002, + "num_input_tokens_seen": 37711656, + "step": 64965 + }, + { + "epoch": 9.676794757223712, + "grad_norm": 0.03857421875, + "learning_rate": 0.018433811690869696, + "loss": 0.8138, + "num_input_tokens_seen": 37714408, + "step": 64970 + }, + { + "epoch": 9.677539469764671, + "grad_norm": 0.0205078125, + "learning_rate": 0.018431913781919714, + "loss": 0.7925, + "num_input_tokens_seen": 37717224, + "step": 64975 + }, + { + "epoch": 9.67828418230563, + "grad_norm": 0.027099609375, + "learning_rate": 0.018430015814991044, + "loss": 0.7875, + "num_input_tokens_seen": 37720040, + "step": 64980 + }, + { + "epoch": 9.679028894846589, + "grad_norm": 0.030029296875, + "learning_rate": 0.01842811779011574, + "loss": 0.8184, + "num_input_tokens_seen": 37723176, + "step": 64985 + }, + { + "epoch": 9.679773607387549, + "grad_norm": 0.039306640625, + "learning_rate": 0.01842621970732587, + "loss": 0.7892, + "num_input_tokens_seen": 37725928, + "step": 64990 + }, + { + "epoch": 9.680518319928508, + "grad_norm": 0.026611328125, + "learning_rate": 0.0184243215666535, + "loss": 0.7926, + "num_input_tokens_seen": 37728680, + "step": 64995 + }, + { + "epoch": 9.681263032469467, + "grad_norm": 0.048095703125, + "learning_rate": 0.0184224233681307, + "loss": 0.8033, + "num_input_tokens_seen": 37731912, + "step": 65000 + }, + { + "epoch": 9.682007745010425, + "grad_norm": 0.03125, + "learning_rate": 0.018420525111789533, + "loss": 0.8156, + "num_input_tokens_seen": 37734728, + "step": 65005 + }, + { + "epoch": 9.682752457551386, + "grad_norm": 0.04345703125, + "learning_rate": 0.018418626797662072, + "loss": 0.8195, + "num_input_tokens_seen": 37737576, + "step": 65010 + }, + { + "epoch": 9.683497170092345, + "grad_norm": 0.022705078125, + "learning_rate": 0.018416728425780383, + "loss": 0.7936, + "num_input_tokens_seen": 37740136, + "step": 65015 + }, + { + "epoch": 9.684241882633303, + "grad_norm": 0.0283203125, + "learning_rate": 0.018414829996176545, + "loss": 0.8048, + "num_input_tokens_seen": 37743048, + "step": 65020 + }, + { + "epoch": 9.684986595174262, + "grad_norm": 0.032958984375, + "learning_rate": 0.01841293150888262, + "loss": 0.8098, + "num_input_tokens_seen": 37745864, + "step": 65025 + }, + { + "epoch": 9.685731307715223, + "grad_norm": 0.0245361328125, + "learning_rate": 0.01841103296393069, + "loss": 0.7996, + "num_input_tokens_seen": 37748712, + "step": 65030 + }, + { + "epoch": 9.686476020256181, + "grad_norm": 0.030029296875, + "learning_rate": 0.01840913436135282, + "loss": 0.7742, + "num_input_tokens_seen": 37751720, + "step": 65035 + }, + { + "epoch": 9.68722073279714, + "grad_norm": 0.0238037109375, + "learning_rate": 0.018407235701181093, + "loss": 0.7874, + "num_input_tokens_seen": 37754472, + "step": 65040 + }, + { + "epoch": 9.687965445338099, + "grad_norm": 0.05078125, + "learning_rate": 0.018405336983447585, + "loss": 0.8207, + "num_input_tokens_seen": 37757288, + "step": 65045 + }, + { + "epoch": 9.688710157879058, + "grad_norm": 0.0284423828125, + "learning_rate": 0.018403438208184363, + "loss": 0.7962, + "num_input_tokens_seen": 37760456, + "step": 65050 + }, + { + "epoch": 9.689454870420018, + "grad_norm": 0.02099609375, + "learning_rate": 0.018401539375423517, + "loss": 0.8305, + "num_input_tokens_seen": 37763208, + "step": 65055 + }, + { + "epoch": 9.690199582960977, + "grad_norm": 0.036376953125, + "learning_rate": 0.01839964048519712, + "loss": 0.804, + "num_input_tokens_seen": 37765896, + "step": 65060 + }, + { + "epoch": 9.690944295501936, + "grad_norm": 0.037353515625, + "learning_rate": 0.01839774153753725, + "loss": 0.8139, + "num_input_tokens_seen": 37768840, + "step": 65065 + }, + { + "epoch": 9.691689008042896, + "grad_norm": 0.0252685546875, + "learning_rate": 0.01839584253247599, + "loss": 0.7828, + "num_input_tokens_seen": 37771912, + "step": 65070 + }, + { + "epoch": 9.692433720583855, + "grad_norm": 0.0286865234375, + "learning_rate": 0.01839394347004542, + "loss": 0.8174, + "num_input_tokens_seen": 37774696, + "step": 65075 + }, + { + "epoch": 9.693178433124814, + "grad_norm": 0.032958984375, + "learning_rate": 0.018392044350277626, + "loss": 0.8145, + "num_input_tokens_seen": 37778088, + "step": 65080 + }, + { + "epoch": 9.693923145665773, + "grad_norm": 0.02392578125, + "learning_rate": 0.018390145173204694, + "loss": 0.7971, + "num_input_tokens_seen": 37780872, + "step": 65085 + }, + { + "epoch": 9.694667858206731, + "grad_norm": 0.0235595703125, + "learning_rate": 0.018388245938858697, + "loss": 0.7806, + "num_input_tokens_seen": 37784008, + "step": 65090 + }, + { + "epoch": 9.695412570747692, + "grad_norm": 0.024169921875, + "learning_rate": 0.018386346647271738, + "loss": 0.7765, + "num_input_tokens_seen": 37787048, + "step": 65095 + }, + { + "epoch": 9.69615728328865, + "grad_norm": 0.041259765625, + "learning_rate": 0.018384447298475888, + "loss": 0.8146, + "num_input_tokens_seen": 37790024, + "step": 65100 + }, + { + "epoch": 9.69690199582961, + "grad_norm": 0.034912109375, + "learning_rate": 0.018382547892503244, + "loss": 0.7751, + "num_input_tokens_seen": 37793224, + "step": 65105 + }, + { + "epoch": 9.697646708370568, + "grad_norm": 0.021728515625, + "learning_rate": 0.018380648429385884, + "loss": 0.7869, + "num_input_tokens_seen": 37796168, + "step": 65110 + }, + { + "epoch": 9.698391420911529, + "grad_norm": 0.015869140625, + "learning_rate": 0.01837874890915591, + "loss": 0.8051, + "num_input_tokens_seen": 37799080, + "step": 65115 + }, + { + "epoch": 9.699136133452487, + "grad_norm": 0.029296875, + "learning_rate": 0.01837684933184541, + "loss": 0.8034, + "num_input_tokens_seen": 37801704, + "step": 65120 + }, + { + "epoch": 9.699880845993446, + "grad_norm": 0.0245361328125, + "learning_rate": 0.01837494969748647, + "loss": 0.8047, + "num_input_tokens_seen": 37804968, + "step": 65125 + }, + { + "epoch": 9.700625558534405, + "grad_norm": 0.0302734375, + "learning_rate": 0.018373050006111186, + "loss": 0.7949, + "num_input_tokens_seen": 37807880, + "step": 65130 + }, + { + "epoch": 9.701370271075366, + "grad_norm": 0.02587890625, + "learning_rate": 0.01837115025775165, + "loss": 0.8214, + "num_input_tokens_seen": 37811240, + "step": 65135 + }, + { + "epoch": 9.702114983616324, + "grad_norm": 0.0257568359375, + "learning_rate": 0.018369250452439954, + "loss": 0.8006, + "num_input_tokens_seen": 37814120, + "step": 65140 + }, + { + "epoch": 9.702859696157283, + "grad_norm": 0.026123046875, + "learning_rate": 0.0183673505902082, + "loss": 0.7939, + "num_input_tokens_seen": 37816968, + "step": 65145 + }, + { + "epoch": 9.703604408698242, + "grad_norm": 0.03662109375, + "learning_rate": 0.018365450671088475, + "loss": 0.8243, + "num_input_tokens_seen": 37819528, + "step": 65150 + }, + { + "epoch": 9.704349121239202, + "grad_norm": 0.02490234375, + "learning_rate": 0.018363550695112887, + "loss": 0.8226, + "num_input_tokens_seen": 37822056, + "step": 65155 + }, + { + "epoch": 9.705093833780161, + "grad_norm": 0.0264892578125, + "learning_rate": 0.018361650662313527, + "loss": 0.7958, + "num_input_tokens_seen": 37825128, + "step": 65160 + }, + { + "epoch": 9.70583854632112, + "grad_norm": 0.0218505859375, + "learning_rate": 0.018359750572722495, + "loss": 0.788, + "num_input_tokens_seen": 37828200, + "step": 65165 + }, + { + "epoch": 9.706583258862079, + "grad_norm": 0.0238037109375, + "learning_rate": 0.01835785042637189, + "loss": 0.7601, + "num_input_tokens_seen": 37830728, + "step": 65170 + }, + { + "epoch": 9.70732797140304, + "grad_norm": 0.034912109375, + "learning_rate": 0.01835595022329382, + "loss": 0.7975, + "num_input_tokens_seen": 37833864, + "step": 65175 + }, + { + "epoch": 9.708072683943998, + "grad_norm": 0.0174560546875, + "learning_rate": 0.018354049963520375, + "loss": 0.7979, + "num_input_tokens_seen": 37836776, + "step": 65180 + }, + { + "epoch": 9.708817396484957, + "grad_norm": 0.035400390625, + "learning_rate": 0.01835214964708367, + "loss": 0.792, + "num_input_tokens_seen": 37839560, + "step": 65185 + }, + { + "epoch": 9.709562109025915, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0183502492740158, + "loss": 0.8065, + "num_input_tokens_seen": 37842600, + "step": 65190 + }, + { + "epoch": 9.710306821566876, + "grad_norm": 0.034423828125, + "learning_rate": 0.018348348844348876, + "loss": 0.8009, + "num_input_tokens_seen": 37845288, + "step": 65195 + }, + { + "epoch": 9.711051534107835, + "grad_norm": 0.0306396484375, + "learning_rate": 0.018346448358114997, + "loss": 0.7703, + "num_input_tokens_seen": 37848232, + "step": 65200 + }, + { + "epoch": 9.711796246648793, + "grad_norm": 0.0517578125, + "learning_rate": 0.018344547815346283, + "loss": 0.8104, + "num_input_tokens_seen": 37851048, + "step": 65205 + }, + { + "epoch": 9.712540959189752, + "grad_norm": 0.025146484375, + "learning_rate": 0.018342647216074825, + "loss": 0.7992, + "num_input_tokens_seen": 37853928, + "step": 65210 + }, + { + "epoch": 9.713285671730713, + "grad_norm": 0.02685546875, + "learning_rate": 0.01834074656033274, + "loss": 0.7964, + "num_input_tokens_seen": 37856840, + "step": 65215 + }, + { + "epoch": 9.714030384271672, + "grad_norm": 0.041015625, + "learning_rate": 0.01833884584815214, + "loss": 0.797, + "num_input_tokens_seen": 37859752, + "step": 65220 + }, + { + "epoch": 9.71477509681263, + "grad_norm": 0.03125, + "learning_rate": 0.018336945079565136, + "loss": 0.8137, + "num_input_tokens_seen": 37862440, + "step": 65225 + }, + { + "epoch": 9.715519809353589, + "grad_norm": 0.0269775390625, + "learning_rate": 0.018335044254603833, + "loss": 0.7908, + "num_input_tokens_seen": 37865384, + "step": 65230 + }, + { + "epoch": 9.716264521894548, + "grad_norm": 0.0361328125, + "learning_rate": 0.018333143373300345, + "loss": 0.7903, + "num_input_tokens_seen": 37868168, + "step": 65235 + }, + { + "epoch": 9.717009234435508, + "grad_norm": 0.025634765625, + "learning_rate": 0.018331242435686786, + "loss": 0.786, + "num_input_tokens_seen": 37871112, + "step": 65240 + }, + { + "epoch": 9.717753946976467, + "grad_norm": 0.02490234375, + "learning_rate": 0.01832934144179528, + "loss": 0.7904, + "num_input_tokens_seen": 37874024, + "step": 65245 + }, + { + "epoch": 9.718498659517426, + "grad_norm": 0.03125, + "learning_rate": 0.018327440391657925, + "loss": 0.7889, + "num_input_tokens_seen": 37877096, + "step": 65250 + }, + { + "epoch": 9.719243372058386, + "grad_norm": 0.018310546875, + "learning_rate": 0.01832553928530685, + "loss": 0.8122, + "num_input_tokens_seen": 37880104, + "step": 65255 + }, + { + "epoch": 9.719988084599345, + "grad_norm": 0.0302734375, + "learning_rate": 0.018323638122774176, + "loss": 0.8051, + "num_input_tokens_seen": 37883144, + "step": 65260 + }, + { + "epoch": 9.720732797140304, + "grad_norm": 0.028564453125, + "learning_rate": 0.018321736904092004, + "loss": 0.8081, + "num_input_tokens_seen": 37886056, + "step": 65265 + }, + { + "epoch": 9.721477509681263, + "grad_norm": 0.0272216796875, + "learning_rate": 0.018319835629292465, + "loss": 0.8179, + "num_input_tokens_seen": 37889096, + "step": 65270 + }, + { + "epoch": 9.722222222222221, + "grad_norm": 0.0203857421875, + "learning_rate": 0.018317934298407678, + "loss": 0.8208, + "num_input_tokens_seen": 37891912, + "step": 65275 + }, + { + "epoch": 9.722966934763182, + "grad_norm": 0.0286865234375, + "learning_rate": 0.01831603291146976, + "loss": 0.7987, + "num_input_tokens_seen": 37894504, + "step": 65280 + }, + { + "epoch": 9.72371164730414, + "grad_norm": 0.037353515625, + "learning_rate": 0.018314131468510842, + "loss": 0.8151, + "num_input_tokens_seen": 37897416, + "step": 65285 + }, + { + "epoch": 9.7244563598451, + "grad_norm": 0.035888671875, + "learning_rate": 0.01831222996956304, + "loss": 0.8375, + "num_input_tokens_seen": 37900552, + "step": 65290 + }, + { + "epoch": 9.725201072386058, + "grad_norm": 0.034423828125, + "learning_rate": 0.01831032841465848, + "loss": 0.8091, + "num_input_tokens_seen": 37903624, + "step": 65295 + }, + { + "epoch": 9.725945784927019, + "grad_norm": 0.024658203125, + "learning_rate": 0.018308426803829284, + "loss": 0.7973, + "num_input_tokens_seen": 37906664, + "step": 65300 + }, + { + "epoch": 9.726690497467978, + "grad_norm": 0.037841796875, + "learning_rate": 0.018306525137107576, + "loss": 0.8199, + "num_input_tokens_seen": 37909384, + "step": 65305 + }, + { + "epoch": 9.727435210008936, + "grad_norm": 0.0260009765625, + "learning_rate": 0.01830462341452549, + "loss": 0.7987, + "num_input_tokens_seen": 37912616, + "step": 65310 + }, + { + "epoch": 9.728179922549895, + "grad_norm": 0.035400390625, + "learning_rate": 0.01830272163611515, + "loss": 0.8036, + "num_input_tokens_seen": 37915432, + "step": 65315 + }, + { + "epoch": 9.728924635090856, + "grad_norm": 0.026611328125, + "learning_rate": 0.018300819801908685, + "loss": 0.797, + "num_input_tokens_seen": 37918312, + "step": 65320 + }, + { + "epoch": 9.729669347631814, + "grad_norm": 0.027587890625, + "learning_rate": 0.018298917911938224, + "loss": 0.7982, + "num_input_tokens_seen": 37921224, + "step": 65325 + }, + { + "epoch": 9.730414060172773, + "grad_norm": 0.033935546875, + "learning_rate": 0.0182970159662359, + "loss": 0.8168, + "num_input_tokens_seen": 37923944, + "step": 65330 + }, + { + "epoch": 9.731158772713732, + "grad_norm": 0.031494140625, + "learning_rate": 0.01829511396483384, + "loss": 0.7814, + "num_input_tokens_seen": 37926824, + "step": 65335 + }, + { + "epoch": 9.731903485254692, + "grad_norm": 0.0439453125, + "learning_rate": 0.018293211907764177, + "loss": 0.8214, + "num_input_tokens_seen": 37929640, + "step": 65340 + }, + { + "epoch": 9.732648197795651, + "grad_norm": 0.0179443359375, + "learning_rate": 0.018291309795059045, + "loss": 0.7958, + "num_input_tokens_seen": 37932328, + "step": 65345 + }, + { + "epoch": 9.73339291033661, + "grad_norm": 0.031494140625, + "learning_rate": 0.01828940762675058, + "loss": 0.8088, + "num_input_tokens_seen": 37935432, + "step": 65350 + }, + { + "epoch": 9.734137622877569, + "grad_norm": 0.0277099609375, + "learning_rate": 0.018287505402870916, + "loss": 0.8032, + "num_input_tokens_seen": 37938312, + "step": 65355 + }, + { + "epoch": 9.73488233541853, + "grad_norm": 0.0162353515625, + "learning_rate": 0.018285603123452197, + "loss": 0.7959, + "num_input_tokens_seen": 37940904, + "step": 65360 + }, + { + "epoch": 9.735627047959488, + "grad_norm": 0.037109375, + "learning_rate": 0.01828370078852655, + "loss": 0.8068, + "num_input_tokens_seen": 37943560, + "step": 65365 + }, + { + "epoch": 9.736371760500447, + "grad_norm": 0.0286865234375, + "learning_rate": 0.01828179839812611, + "loss": 0.7885, + "num_input_tokens_seen": 37946184, + "step": 65370 + }, + { + "epoch": 9.737116473041405, + "grad_norm": 0.0257568359375, + "learning_rate": 0.01827989595228302, + "loss": 0.794, + "num_input_tokens_seen": 37949032, + "step": 65375 + }, + { + "epoch": 9.737861185582366, + "grad_norm": 0.0245361328125, + "learning_rate": 0.01827799345102943, + "loss": 0.8003, + "num_input_tokens_seen": 37952072, + "step": 65380 + }, + { + "epoch": 9.738605898123325, + "grad_norm": 0.040283203125, + "learning_rate": 0.018276090894397464, + "loss": 0.8041, + "num_input_tokens_seen": 37954920, + "step": 65385 + }, + { + "epoch": 9.739350610664284, + "grad_norm": 0.0184326171875, + "learning_rate": 0.01827418828241928, + "loss": 0.7889, + "num_input_tokens_seen": 37957768, + "step": 65390 + }, + { + "epoch": 9.740095323205242, + "grad_norm": 0.0289306640625, + "learning_rate": 0.018272285615127008, + "loss": 0.7989, + "num_input_tokens_seen": 37960360, + "step": 65395 + }, + { + "epoch": 9.740840035746203, + "grad_norm": 0.0252685546875, + "learning_rate": 0.018270382892552797, + "loss": 0.8106, + "num_input_tokens_seen": 37962984, + "step": 65400 + }, + { + "epoch": 9.741584748287162, + "grad_norm": 0.03955078125, + "learning_rate": 0.01826848011472879, + "loss": 0.7936, + "num_input_tokens_seen": 37965672, + "step": 65405 + }, + { + "epoch": 9.74232946082812, + "grad_norm": 0.0269775390625, + "learning_rate": 0.01826657728168713, + "loss": 0.8057, + "num_input_tokens_seen": 37968584, + "step": 65410 + }, + { + "epoch": 9.743074173369079, + "grad_norm": 0.037109375, + "learning_rate": 0.01826467439345997, + "loss": 0.7904, + "num_input_tokens_seen": 37971464, + "step": 65415 + }, + { + "epoch": 9.743818885910038, + "grad_norm": 0.025390625, + "learning_rate": 0.018262771450079457, + "loss": 0.7859, + "num_input_tokens_seen": 37974632, + "step": 65420 + }, + { + "epoch": 9.744563598450998, + "grad_norm": 0.030029296875, + "learning_rate": 0.018260868451577737, + "loss": 0.7993, + "num_input_tokens_seen": 37977512, + "step": 65425 + }, + { + "epoch": 9.745308310991957, + "grad_norm": 0.029052734375, + "learning_rate": 0.018258965397986954, + "loss": 0.8152, + "num_input_tokens_seen": 37980232, + "step": 65430 + }, + { + "epoch": 9.746053023532916, + "grad_norm": 0.0205078125, + "learning_rate": 0.018257062289339265, + "loss": 0.8003, + "num_input_tokens_seen": 37983208, + "step": 65435 + }, + { + "epoch": 9.746797736073875, + "grad_norm": 0.038818359375, + "learning_rate": 0.01825515912566682, + "loss": 0.8076, + "num_input_tokens_seen": 37986056, + "step": 65440 + }, + { + "epoch": 9.747542448614835, + "grad_norm": 0.0213623046875, + "learning_rate": 0.018253255907001772, + "loss": 0.796, + "num_input_tokens_seen": 37989256, + "step": 65445 + }, + { + "epoch": 9.748287161155794, + "grad_norm": 0.042724609375, + "learning_rate": 0.018251352633376267, + "loss": 0.7868, + "num_input_tokens_seen": 37992168, + "step": 65450 + }, + { + "epoch": 9.749031873696753, + "grad_norm": 0.027587890625, + "learning_rate": 0.018249449304822467, + "loss": 0.7812, + "num_input_tokens_seen": 37995272, + "step": 65455 + }, + { + "epoch": 9.749776586237711, + "grad_norm": 0.0308837890625, + "learning_rate": 0.018247545921372524, + "loss": 0.7998, + "num_input_tokens_seen": 37998184, + "step": 65460 + }, + { + "epoch": 9.750521298778672, + "grad_norm": 0.034423828125, + "learning_rate": 0.01824564248305859, + "loss": 0.8118, + "num_input_tokens_seen": 38001192, + "step": 65465 + }, + { + "epoch": 9.75126601131963, + "grad_norm": 0.035400390625, + "learning_rate": 0.018243738989912824, + "loss": 0.7881, + "num_input_tokens_seen": 38003912, + "step": 65470 + }, + { + "epoch": 9.75201072386059, + "grad_norm": 0.0291748046875, + "learning_rate": 0.018241835441967388, + "loss": 0.7857, + "num_input_tokens_seen": 38006920, + "step": 65475 + }, + { + "epoch": 9.752755436401548, + "grad_norm": 0.0277099609375, + "learning_rate": 0.018239931839254434, + "loss": 0.808, + "num_input_tokens_seen": 38009576, + "step": 65480 + }, + { + "epoch": 9.753500148942509, + "grad_norm": 0.025390625, + "learning_rate": 0.018238028181806133, + "loss": 0.7996, + "num_input_tokens_seen": 38012360, + "step": 65485 + }, + { + "epoch": 9.754244861483468, + "grad_norm": 0.03369140625, + "learning_rate": 0.018236124469654627, + "loss": 0.8204, + "num_input_tokens_seen": 38015304, + "step": 65490 + }, + { + "epoch": 9.754989574024426, + "grad_norm": 0.027587890625, + "learning_rate": 0.018234220702832094, + "loss": 0.8142, + "num_input_tokens_seen": 38018280, + "step": 65495 + }, + { + "epoch": 9.755734286565385, + "grad_norm": 0.02783203125, + "learning_rate": 0.018232316881370683, + "loss": 0.7893, + "num_input_tokens_seen": 38021096, + "step": 65500 + }, + { + "epoch": 9.756478999106346, + "grad_norm": 0.040771484375, + "learning_rate": 0.01823041300530257, + "loss": 0.8156, + "num_input_tokens_seen": 38023976, + "step": 65505 + }, + { + "epoch": 9.757223711647304, + "grad_norm": 0.02978515625, + "learning_rate": 0.018228509074659904, + "loss": 0.7957, + "num_input_tokens_seen": 38026952, + "step": 65510 + }, + { + "epoch": 9.757968424188263, + "grad_norm": 0.0296630859375, + "learning_rate": 0.01822660508947486, + "loss": 0.7753, + "num_input_tokens_seen": 38029832, + "step": 65515 + }, + { + "epoch": 9.758713136729222, + "grad_norm": 0.036376953125, + "learning_rate": 0.018224701049779607, + "loss": 0.8277, + "num_input_tokens_seen": 38032840, + "step": 65520 + }, + { + "epoch": 9.759457849270182, + "grad_norm": 0.031005859375, + "learning_rate": 0.01822279695560631, + "loss": 0.7975, + "num_input_tokens_seen": 38036168, + "step": 65525 + }, + { + "epoch": 9.760202561811141, + "grad_norm": 0.04345703125, + "learning_rate": 0.018220892806987132, + "loss": 0.8282, + "num_input_tokens_seen": 38038920, + "step": 65530 + }, + { + "epoch": 9.7609472743521, + "grad_norm": 0.029296875, + "learning_rate": 0.01821898860395424, + "loss": 0.7982, + "num_input_tokens_seen": 38042152, + "step": 65535 + }, + { + "epoch": 9.761691986893059, + "grad_norm": 0.0283203125, + "learning_rate": 0.01821708434653981, + "loss": 0.8066, + "num_input_tokens_seen": 38045320, + "step": 65540 + }, + { + "epoch": 9.76243669943402, + "grad_norm": 0.0303955078125, + "learning_rate": 0.018215180034776006, + "loss": 0.8011, + "num_input_tokens_seen": 38048072, + "step": 65545 + }, + { + "epoch": 9.763181411974978, + "grad_norm": 0.039306640625, + "learning_rate": 0.018213275668695007, + "loss": 0.8032, + "num_input_tokens_seen": 38050856, + "step": 65550 + }, + { + "epoch": 9.763926124515937, + "grad_norm": 0.03271484375, + "learning_rate": 0.01821137124832898, + "loss": 0.7944, + "num_input_tokens_seen": 38053608, + "step": 65555 + }, + { + "epoch": 9.764670837056896, + "grad_norm": 0.0244140625, + "learning_rate": 0.0182094667737101, + "loss": 0.8014, + "num_input_tokens_seen": 38056168, + "step": 65560 + }, + { + "epoch": 9.765415549597854, + "grad_norm": 0.0206298828125, + "learning_rate": 0.01820756224487054, + "loss": 0.8192, + "num_input_tokens_seen": 38059048, + "step": 65565 + }, + { + "epoch": 9.766160262138815, + "grad_norm": 0.0220947265625, + "learning_rate": 0.018205657661842477, + "loss": 0.7861, + "num_input_tokens_seen": 38061928, + "step": 65570 + }, + { + "epoch": 9.766904974679774, + "grad_norm": 0.0419921875, + "learning_rate": 0.018203753024658084, + "loss": 0.8002, + "num_input_tokens_seen": 38065032, + "step": 65575 + }, + { + "epoch": 9.767649687220732, + "grad_norm": 0.03271484375, + "learning_rate": 0.018201848333349542, + "loss": 0.8343, + "num_input_tokens_seen": 38067944, + "step": 65580 + }, + { + "epoch": 9.768394399761693, + "grad_norm": 0.0206298828125, + "learning_rate": 0.018199943587949022, + "loss": 0.8063, + "num_input_tokens_seen": 38070536, + "step": 65585 + }, + { + "epoch": 9.769139112302652, + "grad_norm": 0.0281982421875, + "learning_rate": 0.018198038788488715, + "loss": 0.804, + "num_input_tokens_seen": 38073480, + "step": 65590 + }, + { + "epoch": 9.76988382484361, + "grad_norm": 0.041015625, + "learning_rate": 0.01819613393500079, + "loss": 0.8076, + "num_input_tokens_seen": 38076648, + "step": 65595 + }, + { + "epoch": 9.77062853738457, + "grad_norm": 0.030029296875, + "learning_rate": 0.01819422902751743, + "loss": 0.7995, + "num_input_tokens_seen": 38080008, + "step": 65600 + }, + { + "epoch": 9.771373249925528, + "grad_norm": 0.0177001953125, + "learning_rate": 0.01819232406607081, + "loss": 0.8058, + "num_input_tokens_seen": 38082696, + "step": 65605 + }, + { + "epoch": 9.772117962466488, + "grad_norm": 0.0299072265625, + "learning_rate": 0.01819041905069313, + "loss": 0.8025, + "num_input_tokens_seen": 38085608, + "step": 65610 + }, + { + "epoch": 9.772862675007447, + "grad_norm": 0.04052734375, + "learning_rate": 0.01818851398141656, + "loss": 0.816, + "num_input_tokens_seen": 38088200, + "step": 65615 + }, + { + "epoch": 9.773607387548406, + "grad_norm": 0.02685546875, + "learning_rate": 0.018186608858273282, + "loss": 0.7932, + "num_input_tokens_seen": 38090760, + "step": 65620 + }, + { + "epoch": 9.774352100089365, + "grad_norm": 0.037841796875, + "learning_rate": 0.018184703681295496, + "loss": 0.8024, + "num_input_tokens_seen": 38093544, + "step": 65625 + }, + { + "epoch": 9.775096812630325, + "grad_norm": 0.032470703125, + "learning_rate": 0.01818279845051537, + "loss": 0.7969, + "num_input_tokens_seen": 38096392, + "step": 65630 + }, + { + "epoch": 9.775841525171284, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0181808931659651, + "loss": 0.8099, + "num_input_tokens_seen": 38099368, + "step": 65635 + }, + { + "epoch": 9.776586237712243, + "grad_norm": 0.052001953125, + "learning_rate": 0.018178987827676872, + "loss": 0.8193, + "num_input_tokens_seen": 38102792, + "step": 65640 + }, + { + "epoch": 9.777330950253202, + "grad_norm": 0.03857421875, + "learning_rate": 0.018177082435682876, + "loss": 0.8002, + "num_input_tokens_seen": 38105544, + "step": 65645 + }, + { + "epoch": 9.778075662794162, + "grad_norm": 0.0322265625, + "learning_rate": 0.018175176990015307, + "loss": 0.8009, + "num_input_tokens_seen": 38108680, + "step": 65650 + }, + { + "epoch": 9.77882037533512, + "grad_norm": 0.026123046875, + "learning_rate": 0.01817327149070634, + "loss": 0.7908, + "num_input_tokens_seen": 38111656, + "step": 65655 + }, + { + "epoch": 9.77956508787608, + "grad_norm": 0.03515625, + "learning_rate": 0.01817136593778819, + "loss": 0.806, + "num_input_tokens_seen": 38114696, + "step": 65660 + }, + { + "epoch": 9.780309800417038, + "grad_norm": 0.029541015625, + "learning_rate": 0.018169460331293023, + "loss": 0.8063, + "num_input_tokens_seen": 38117640, + "step": 65665 + }, + { + "epoch": 9.781054512957999, + "grad_norm": 0.04150390625, + "learning_rate": 0.018167554671253052, + "loss": 0.7932, + "num_input_tokens_seen": 38120424, + "step": 65670 + }, + { + "epoch": 9.781799225498958, + "grad_norm": 0.0308837890625, + "learning_rate": 0.018165648957700457, + "loss": 0.8084, + "num_input_tokens_seen": 38123176, + "step": 65675 + }, + { + "epoch": 9.782543938039916, + "grad_norm": 0.0247802734375, + "learning_rate": 0.018163743190667447, + "loss": 0.7966, + "num_input_tokens_seen": 38126056, + "step": 65680 + }, + { + "epoch": 9.783288650580875, + "grad_norm": 0.026611328125, + "learning_rate": 0.018161837370186208, + "loss": 0.8029, + "num_input_tokens_seen": 38129064, + "step": 65685 + }, + { + "epoch": 9.784033363121836, + "grad_norm": 0.0498046875, + "learning_rate": 0.018159931496288945, + "loss": 0.7925, + "num_input_tokens_seen": 38132008, + "step": 65690 + }, + { + "epoch": 9.784778075662794, + "grad_norm": 0.0267333984375, + "learning_rate": 0.01815802556900785, + "loss": 0.8186, + "num_input_tokens_seen": 38134760, + "step": 65695 + }, + { + "epoch": 9.785522788203753, + "grad_norm": 0.027099609375, + "learning_rate": 0.018156119588375118, + "loss": 0.8046, + "num_input_tokens_seen": 38137352, + "step": 65700 + }, + { + "epoch": 9.786267500744712, + "grad_norm": 0.0224609375, + "learning_rate": 0.018154213554422954, + "loss": 0.8087, + "num_input_tokens_seen": 38140200, + "step": 65705 + }, + { + "epoch": 9.787012213285673, + "grad_norm": 0.04345703125, + "learning_rate": 0.01815230746718356, + "loss": 0.8033, + "num_input_tokens_seen": 38143144, + "step": 65710 + }, + { + "epoch": 9.787756925826631, + "grad_norm": 0.0230712890625, + "learning_rate": 0.018150401326689134, + "loss": 0.7904, + "num_input_tokens_seen": 38146216, + "step": 65715 + }, + { + "epoch": 9.78850163836759, + "grad_norm": 0.051025390625, + "learning_rate": 0.01814849513297188, + "loss": 0.806, + "num_input_tokens_seen": 38149064, + "step": 65720 + }, + { + "epoch": 9.789246350908549, + "grad_norm": 0.022216796875, + "learning_rate": 0.018146588886064, + "loss": 0.8056, + "num_input_tokens_seen": 38151944, + "step": 65725 + }, + { + "epoch": 9.78999106344951, + "grad_norm": 0.02490234375, + "learning_rate": 0.018144682585997704, + "loss": 0.8055, + "num_input_tokens_seen": 38154568, + "step": 65730 + }, + { + "epoch": 9.790735775990468, + "grad_norm": 0.0341796875, + "learning_rate": 0.018142776232805188, + "loss": 0.7948, + "num_input_tokens_seen": 38157320, + "step": 65735 + }, + { + "epoch": 9.791480488531427, + "grad_norm": 0.033203125, + "learning_rate": 0.01814086982651866, + "loss": 0.7956, + "num_input_tokens_seen": 38161096, + "step": 65740 + }, + { + "epoch": 9.792225201072386, + "grad_norm": 0.031982421875, + "learning_rate": 0.018138963367170326, + "loss": 0.7975, + "num_input_tokens_seen": 38163848, + "step": 65745 + }, + { + "epoch": 9.792969913613344, + "grad_norm": 0.03369140625, + "learning_rate": 0.018137056854792402, + "loss": 0.7906, + "num_input_tokens_seen": 38166568, + "step": 65750 + }, + { + "epoch": 9.793714626154305, + "grad_norm": 0.0277099609375, + "learning_rate": 0.018135150289417087, + "loss": 0.7834, + "num_input_tokens_seen": 38169224, + "step": 65755 + }, + { + "epoch": 9.794459338695264, + "grad_norm": 0.042724609375, + "learning_rate": 0.018133243671076597, + "loss": 0.7757, + "num_input_tokens_seen": 38172072, + "step": 65760 + }, + { + "epoch": 9.795204051236222, + "grad_norm": 0.0341796875, + "learning_rate": 0.018131336999803137, + "loss": 0.7587, + "num_input_tokens_seen": 38174888, + "step": 65765 + }, + { + "epoch": 9.795948763777183, + "grad_norm": 0.041015625, + "learning_rate": 0.018129430275628924, + "loss": 0.8391, + "num_input_tokens_seen": 38177928, + "step": 65770 + }, + { + "epoch": 9.796693476318142, + "grad_norm": 0.037353515625, + "learning_rate": 0.018127523498586165, + "loss": 0.8133, + "num_input_tokens_seen": 38180904, + "step": 65775 + }, + { + "epoch": 9.7974381888591, + "grad_norm": 0.032958984375, + "learning_rate": 0.018125616668707076, + "loss": 0.8136, + "num_input_tokens_seen": 38183816, + "step": 65780 + }, + { + "epoch": 9.79818290140006, + "grad_norm": 0.0250244140625, + "learning_rate": 0.01812370978602387, + "loss": 0.8163, + "num_input_tokens_seen": 38186824, + "step": 65785 + }, + { + "epoch": 9.798927613941018, + "grad_norm": 0.0294189453125, + "learning_rate": 0.01812180285056876, + "loss": 0.8074, + "num_input_tokens_seen": 38189992, + "step": 65790 + }, + { + "epoch": 9.799672326481979, + "grad_norm": 0.02490234375, + "learning_rate": 0.018119895862373968, + "loss": 0.7978, + "num_input_tokens_seen": 38193064, + "step": 65795 + }, + { + "epoch": 9.800417039022937, + "grad_norm": 0.031494140625, + "learning_rate": 0.0181179888214717, + "loss": 0.8153, + "num_input_tokens_seen": 38196168, + "step": 65800 + }, + { + "epoch": 9.801161751563896, + "grad_norm": 0.054931640625, + "learning_rate": 0.018116081727894185, + "loss": 0.7985, + "num_input_tokens_seen": 38198952, + "step": 65805 + }, + { + "epoch": 9.801906464104855, + "grad_norm": 0.0255126953125, + "learning_rate": 0.018114174581673637, + "loss": 0.8117, + "num_input_tokens_seen": 38201768, + "step": 65810 + }, + { + "epoch": 9.802651176645815, + "grad_norm": 0.02490234375, + "learning_rate": 0.018112267382842273, + "loss": 0.7973, + "num_input_tokens_seen": 38204392, + "step": 65815 + }, + { + "epoch": 9.803395889186774, + "grad_norm": 0.021240234375, + "learning_rate": 0.018110360131432312, + "loss": 0.7988, + "num_input_tokens_seen": 38207208, + "step": 65820 + }, + { + "epoch": 9.804140601727733, + "grad_norm": 0.035400390625, + "learning_rate": 0.018108452827475984, + "loss": 0.8039, + "num_input_tokens_seen": 38210216, + "step": 65825 + }, + { + "epoch": 9.804885314268692, + "grad_norm": 0.046142578125, + "learning_rate": 0.0181065454710055, + "loss": 0.7969, + "num_input_tokens_seen": 38213000, + "step": 65830 + }, + { + "epoch": 9.805630026809652, + "grad_norm": 0.041015625, + "learning_rate": 0.018104638062053088, + "loss": 0.8156, + "num_input_tokens_seen": 38216072, + "step": 65835 + }, + { + "epoch": 9.80637473935061, + "grad_norm": 0.031982421875, + "learning_rate": 0.018102730600650967, + "loss": 0.802, + "num_input_tokens_seen": 38219080, + "step": 65840 + }, + { + "epoch": 9.80711945189157, + "grad_norm": 0.031982421875, + "learning_rate": 0.018100823086831374, + "loss": 0.8161, + "num_input_tokens_seen": 38222088, + "step": 65845 + }, + { + "epoch": 9.807864164432528, + "grad_norm": 0.03857421875, + "learning_rate": 0.018098915520626525, + "loss": 0.7979, + "num_input_tokens_seen": 38225544, + "step": 65850 + }, + { + "epoch": 9.808608876973489, + "grad_norm": 0.032470703125, + "learning_rate": 0.018097007902068644, + "loss": 0.7993, + "num_input_tokens_seen": 38228456, + "step": 65855 + }, + { + "epoch": 9.809353589514448, + "grad_norm": 0.0301513671875, + "learning_rate": 0.018095100231189966, + "loss": 0.8202, + "num_input_tokens_seen": 38231592, + "step": 65860 + }, + { + "epoch": 9.810098302055406, + "grad_norm": 0.03271484375, + "learning_rate": 0.018093192508022715, + "loss": 0.8162, + "num_input_tokens_seen": 38234280, + "step": 65865 + }, + { + "epoch": 9.810843014596365, + "grad_norm": 0.035888671875, + "learning_rate": 0.018091284732599116, + "loss": 0.7978, + "num_input_tokens_seen": 38237128, + "step": 65870 + }, + { + "epoch": 9.811587727137326, + "grad_norm": 0.0302734375, + "learning_rate": 0.018089376904951406, + "loss": 0.7643, + "num_input_tokens_seen": 38239912, + "step": 65875 + }, + { + "epoch": 9.812332439678285, + "grad_norm": 0.0306396484375, + "learning_rate": 0.018087469025111814, + "loss": 0.8, + "num_input_tokens_seen": 38242664, + "step": 65880 + }, + { + "epoch": 9.813077152219243, + "grad_norm": 0.0277099609375, + "learning_rate": 0.01808556109311257, + "loss": 0.8073, + "num_input_tokens_seen": 38245736, + "step": 65885 + }, + { + "epoch": 9.813821864760202, + "grad_norm": 0.02783203125, + "learning_rate": 0.01808365310898591, + "loss": 0.7934, + "num_input_tokens_seen": 38248584, + "step": 65890 + }, + { + "epoch": 9.814566577301163, + "grad_norm": 0.0257568359375, + "learning_rate": 0.018081745072764062, + "loss": 0.7969, + "num_input_tokens_seen": 38251496, + "step": 65895 + }, + { + "epoch": 9.815311289842121, + "grad_norm": 0.03271484375, + "learning_rate": 0.018079836984479264, + "loss": 0.8086, + "num_input_tokens_seen": 38254280, + "step": 65900 + }, + { + "epoch": 9.81605600238308, + "grad_norm": 0.030029296875, + "learning_rate": 0.01807792884416375, + "loss": 0.7917, + "num_input_tokens_seen": 38257128, + "step": 65905 + }, + { + "epoch": 9.816800714924039, + "grad_norm": 0.0296630859375, + "learning_rate": 0.018076020651849753, + "loss": 0.807, + "num_input_tokens_seen": 38260104, + "step": 65910 + }, + { + "epoch": 9.817545427465, + "grad_norm": 0.0380859375, + "learning_rate": 0.018074112407569516, + "loss": 0.8256, + "num_input_tokens_seen": 38263336, + "step": 65915 + }, + { + "epoch": 9.818290140005958, + "grad_norm": 0.0272216796875, + "learning_rate": 0.018072204111355277, + "loss": 0.7859, + "num_input_tokens_seen": 38266280, + "step": 65920 + }, + { + "epoch": 9.819034852546917, + "grad_norm": 0.044677734375, + "learning_rate": 0.01807029576323927, + "loss": 0.8144, + "num_input_tokens_seen": 38269448, + "step": 65925 + }, + { + "epoch": 9.819779565087876, + "grad_norm": 0.0341796875, + "learning_rate": 0.01806838736325374, + "loss": 0.8064, + "num_input_tokens_seen": 38272200, + "step": 65930 + }, + { + "epoch": 9.820524277628834, + "grad_norm": 0.03466796875, + "learning_rate": 0.01806647891143092, + "loss": 0.7905, + "num_input_tokens_seen": 38275208, + "step": 65935 + }, + { + "epoch": 9.821268990169795, + "grad_norm": 0.039306640625, + "learning_rate": 0.018064570407803054, + "loss": 0.8215, + "num_input_tokens_seen": 38278088, + "step": 65940 + }, + { + "epoch": 9.822013702710754, + "grad_norm": 0.02734375, + "learning_rate": 0.018062661852402385, + "loss": 0.8092, + "num_input_tokens_seen": 38280616, + "step": 65945 + }, + { + "epoch": 9.822758415251712, + "grad_norm": 0.01953125, + "learning_rate": 0.01806075324526116, + "loss": 0.787, + "num_input_tokens_seen": 38283496, + "step": 65950 + }, + { + "epoch": 9.823503127792671, + "grad_norm": 0.0242919921875, + "learning_rate": 0.018058844586411625, + "loss": 0.8029, + "num_input_tokens_seen": 38286408, + "step": 65955 + }, + { + "epoch": 9.824247840333632, + "grad_norm": 0.0263671875, + "learning_rate": 0.01805693587588601, + "loss": 0.8186, + "num_input_tokens_seen": 38289352, + "step": 65960 + }, + { + "epoch": 9.82499255287459, + "grad_norm": 0.0191650390625, + "learning_rate": 0.018055027113716576, + "loss": 0.7983, + "num_input_tokens_seen": 38292104, + "step": 65965 + }, + { + "epoch": 9.82573726541555, + "grad_norm": 0.02490234375, + "learning_rate": 0.01805311829993556, + "loss": 0.7934, + "num_input_tokens_seen": 38295016, + "step": 65970 + }, + { + "epoch": 9.826481977956508, + "grad_norm": 0.02734375, + "learning_rate": 0.018051209434575215, + "loss": 0.8151, + "num_input_tokens_seen": 38297896, + "step": 65975 + }, + { + "epoch": 9.827226690497469, + "grad_norm": 0.0230712890625, + "learning_rate": 0.018049300517667788, + "loss": 0.8066, + "num_input_tokens_seen": 38300808, + "step": 65980 + }, + { + "epoch": 9.827971403038427, + "grad_norm": 0.0234375, + "learning_rate": 0.018047391549245527, + "loss": 0.8058, + "num_input_tokens_seen": 38303496, + "step": 65985 + }, + { + "epoch": 9.828716115579386, + "grad_norm": 0.0189208984375, + "learning_rate": 0.018045482529340687, + "loss": 0.7946, + "num_input_tokens_seen": 38306216, + "step": 65990 + }, + { + "epoch": 9.829460828120345, + "grad_norm": 0.0245361328125, + "learning_rate": 0.01804357345798551, + "loss": 0.8064, + "num_input_tokens_seen": 38308904, + "step": 65995 + }, + { + "epoch": 9.830205540661305, + "grad_norm": 0.0272216796875, + "learning_rate": 0.018041664335212253, + "loss": 0.786, + "num_input_tokens_seen": 38311944, + "step": 66000 + }, + { + "epoch": 9.830950253202264, + "grad_norm": 0.0257568359375, + "learning_rate": 0.018039755161053172, + "loss": 0.7921, + "num_input_tokens_seen": 38314664, + "step": 66005 + }, + { + "epoch": 9.831694965743223, + "grad_norm": 0.03564453125, + "learning_rate": 0.018037845935540513, + "loss": 0.7996, + "num_input_tokens_seen": 38317352, + "step": 66010 + }, + { + "epoch": 9.832439678284182, + "grad_norm": 0.029541015625, + "learning_rate": 0.018035936658706536, + "loss": 0.7904, + "num_input_tokens_seen": 38320296, + "step": 66015 + }, + { + "epoch": 9.833184390825142, + "grad_norm": 0.0693359375, + "learning_rate": 0.0180340273305835, + "loss": 0.8058, + "num_input_tokens_seen": 38323368, + "step": 66020 + }, + { + "epoch": 9.833929103366101, + "grad_norm": 0.032470703125, + "learning_rate": 0.01803211795120365, + "loss": 0.7861, + "num_input_tokens_seen": 38326344, + "step": 66025 + }, + { + "epoch": 9.83467381590706, + "grad_norm": 0.0277099609375, + "learning_rate": 0.01803020852059925, + "loss": 0.7931, + "num_input_tokens_seen": 38329352, + "step": 66030 + }, + { + "epoch": 9.835418528448018, + "grad_norm": 0.0250244140625, + "learning_rate": 0.018028299038802556, + "loss": 0.7909, + "num_input_tokens_seen": 38332296, + "step": 66035 + }, + { + "epoch": 9.836163240988979, + "grad_norm": 0.034912109375, + "learning_rate": 0.01802638950584583, + "loss": 0.7866, + "num_input_tokens_seen": 38335112, + "step": 66040 + }, + { + "epoch": 9.836907953529938, + "grad_norm": 0.06298828125, + "learning_rate": 0.018024479921761323, + "loss": 0.7962, + "num_input_tokens_seen": 38338120, + "step": 66045 + }, + { + "epoch": 9.837652666070897, + "grad_norm": 0.0308837890625, + "learning_rate": 0.018022570286581312, + "loss": 0.7753, + "num_input_tokens_seen": 38341160, + "step": 66050 + }, + { + "epoch": 9.838397378611855, + "grad_norm": 0.06982421875, + "learning_rate": 0.01802066060033804, + "loss": 0.7703, + "num_input_tokens_seen": 38344168, + "step": 66055 + }, + { + "epoch": 9.839142091152816, + "grad_norm": 0.037353515625, + "learning_rate": 0.018018750863063784, + "loss": 0.8308, + "num_input_tokens_seen": 38347048, + "step": 66060 + }, + { + "epoch": 9.839886803693775, + "grad_norm": 0.0235595703125, + "learning_rate": 0.018016841074790794, + "loss": 0.8058, + "num_input_tokens_seen": 38350056, + "step": 66065 + }, + { + "epoch": 9.840631516234733, + "grad_norm": 0.038818359375, + "learning_rate": 0.01801493123555134, + "loss": 0.8281, + "num_input_tokens_seen": 38352776, + "step": 66070 + }, + { + "epoch": 9.841376228775692, + "grad_norm": 0.036865234375, + "learning_rate": 0.01801302134537769, + "loss": 0.8135, + "num_input_tokens_seen": 38355816, + "step": 66075 + }, + { + "epoch": 9.842120941316653, + "grad_norm": 0.0306396484375, + "learning_rate": 0.018011111404302105, + "loss": 0.8058, + "num_input_tokens_seen": 38358920, + "step": 66080 + }, + { + "epoch": 9.842865653857611, + "grad_norm": 0.048583984375, + "learning_rate": 0.018009201412356853, + "loss": 0.7862, + "num_input_tokens_seen": 38361768, + "step": 66085 + }, + { + "epoch": 9.84361036639857, + "grad_norm": 0.028076171875, + "learning_rate": 0.018007291369574206, + "loss": 0.8313, + "num_input_tokens_seen": 38364616, + "step": 66090 + }, + { + "epoch": 9.844355078939529, + "grad_norm": 0.0595703125, + "learning_rate": 0.018005381275986426, + "loss": 0.7795, + "num_input_tokens_seen": 38367528, + "step": 66095 + }, + { + "epoch": 9.84509979148049, + "grad_norm": 0.0223388671875, + "learning_rate": 0.01800347113162578, + "loss": 0.7993, + "num_input_tokens_seen": 38370504, + "step": 66100 + }, + { + "epoch": 9.845844504021448, + "grad_norm": 0.0301513671875, + "learning_rate": 0.01800156093652454, + "loss": 0.7928, + "num_input_tokens_seen": 38373320, + "step": 66105 + }, + { + "epoch": 9.846589216562407, + "grad_norm": 0.03076171875, + "learning_rate": 0.017999650690714984, + "loss": 0.8274, + "num_input_tokens_seen": 38376200, + "step": 66110 + }, + { + "epoch": 9.847333929103366, + "grad_norm": 0.032958984375, + "learning_rate": 0.017997740394229376, + "loss": 0.8211, + "num_input_tokens_seen": 38379176, + "step": 66115 + }, + { + "epoch": 9.848078641644324, + "grad_norm": 0.0341796875, + "learning_rate": 0.017995830047099995, + "loss": 0.7771, + "num_input_tokens_seen": 38382152, + "step": 66120 + }, + { + "epoch": 9.848823354185285, + "grad_norm": 0.03271484375, + "learning_rate": 0.017993919649359105, + "loss": 0.8343, + "num_input_tokens_seen": 38385192, + "step": 66125 + }, + { + "epoch": 9.849568066726244, + "grad_norm": 0.043212890625, + "learning_rate": 0.017992009201038987, + "loss": 0.8, + "num_input_tokens_seen": 38388328, + "step": 66130 + }, + { + "epoch": 9.850312779267203, + "grad_norm": 0.0289306640625, + "learning_rate": 0.017990098702171917, + "loss": 0.7901, + "num_input_tokens_seen": 38391048, + "step": 66135 + }, + { + "epoch": 9.851057491808161, + "grad_norm": 0.029052734375, + "learning_rate": 0.017988188152790165, + "loss": 0.7928, + "num_input_tokens_seen": 38393832, + "step": 66140 + }, + { + "epoch": 9.851802204349122, + "grad_norm": 0.032958984375, + "learning_rate": 0.017986277552926012, + "loss": 0.7958, + "num_input_tokens_seen": 38396840, + "step": 66145 + }, + { + "epoch": 9.85254691689008, + "grad_norm": 0.029296875, + "learning_rate": 0.017984366902611737, + "loss": 0.7862, + "num_input_tokens_seen": 38399784, + "step": 66150 + }, + { + "epoch": 9.85329162943104, + "grad_norm": 0.023193359375, + "learning_rate": 0.017982456201879618, + "loss": 0.8137, + "num_input_tokens_seen": 38403016, + "step": 66155 + }, + { + "epoch": 9.854036341971998, + "grad_norm": 0.020263671875, + "learning_rate": 0.017980545450761925, + "loss": 0.8127, + "num_input_tokens_seen": 38406024, + "step": 66160 + }, + { + "epoch": 9.854781054512959, + "grad_norm": 0.01806640625, + "learning_rate": 0.01797863464929095, + "loss": 0.7859, + "num_input_tokens_seen": 38409160, + "step": 66165 + }, + { + "epoch": 9.855525767053917, + "grad_norm": 0.023681640625, + "learning_rate": 0.01797672379749897, + "loss": 0.8023, + "num_input_tokens_seen": 38412136, + "step": 66170 + }, + { + "epoch": 9.856270479594876, + "grad_norm": 0.031494140625, + "learning_rate": 0.01797481289541827, + "loss": 0.8127, + "num_input_tokens_seen": 38414888, + "step": 66175 + }, + { + "epoch": 9.857015192135835, + "grad_norm": 0.037353515625, + "learning_rate": 0.017972901943081123, + "loss": 0.8074, + "num_input_tokens_seen": 38417640, + "step": 66180 + }, + { + "epoch": 9.857759904676795, + "grad_norm": 0.023193359375, + "learning_rate": 0.017970990940519824, + "loss": 0.8032, + "num_input_tokens_seen": 38420808, + "step": 66185 + }, + { + "epoch": 9.858504617217754, + "grad_norm": 0.02734375, + "learning_rate": 0.017969079887766654, + "loss": 0.7929, + "num_input_tokens_seen": 38423848, + "step": 66190 + }, + { + "epoch": 9.859249329758713, + "grad_norm": 0.0303955078125, + "learning_rate": 0.017967168784853892, + "loss": 0.7871, + "num_input_tokens_seen": 38426568, + "step": 66195 + }, + { + "epoch": 9.859994042299672, + "grad_norm": 0.033203125, + "learning_rate": 0.017965257631813832, + "loss": 0.8124, + "num_input_tokens_seen": 38429736, + "step": 66200 + }, + { + "epoch": 9.860738754840632, + "grad_norm": 0.032470703125, + "learning_rate": 0.017963346428678757, + "loss": 0.7992, + "num_input_tokens_seen": 38432680, + "step": 66205 + }, + { + "epoch": 9.861483467381591, + "grad_norm": 0.0262451171875, + "learning_rate": 0.017961435175480957, + "loss": 0.7913, + "num_input_tokens_seen": 38435400, + "step": 66210 + }, + { + "epoch": 9.86222817992255, + "grad_norm": 0.02783203125, + "learning_rate": 0.01795952387225272, + "loss": 0.8166, + "num_input_tokens_seen": 38438344, + "step": 66215 + }, + { + "epoch": 9.862972892463509, + "grad_norm": 0.0296630859375, + "learning_rate": 0.017957612519026334, + "loss": 0.7916, + "num_input_tokens_seen": 38441288, + "step": 66220 + }, + { + "epoch": 9.863717605004469, + "grad_norm": 0.0296630859375, + "learning_rate": 0.017955701115834093, + "loss": 0.786, + "num_input_tokens_seen": 38444264, + "step": 66225 + }, + { + "epoch": 9.864462317545428, + "grad_norm": 0.0296630859375, + "learning_rate": 0.017953789662708282, + "loss": 0.7997, + "num_input_tokens_seen": 38447080, + "step": 66230 + }, + { + "epoch": 9.865207030086387, + "grad_norm": 0.03564453125, + "learning_rate": 0.017951878159681196, + "loss": 0.771, + "num_input_tokens_seen": 38449896, + "step": 66235 + }, + { + "epoch": 9.865951742627345, + "grad_norm": 0.0213623046875, + "learning_rate": 0.01794996660678513, + "loss": 0.8399, + "num_input_tokens_seen": 38452712, + "step": 66240 + }, + { + "epoch": 9.866696455168306, + "grad_norm": 0.04052734375, + "learning_rate": 0.017948055004052377, + "loss": 0.8253, + "num_input_tokens_seen": 38455432, + "step": 66245 + }, + { + "epoch": 9.867441167709265, + "grad_norm": 0.0196533203125, + "learning_rate": 0.01794614335151523, + "loss": 0.7821, + "num_input_tokens_seen": 38458504, + "step": 66250 + }, + { + "epoch": 9.868185880250223, + "grad_norm": 0.036376953125, + "learning_rate": 0.017944231649205993, + "loss": 0.8081, + "num_input_tokens_seen": 38461416, + "step": 66255 + }, + { + "epoch": 9.868930592791182, + "grad_norm": 0.0286865234375, + "learning_rate": 0.017942319897156944, + "loss": 0.8104, + "num_input_tokens_seen": 38464296, + "step": 66260 + }, + { + "epoch": 9.86967530533214, + "grad_norm": 0.0263671875, + "learning_rate": 0.017940408095400397, + "loss": 0.8021, + "num_input_tokens_seen": 38466952, + "step": 66265 + }, + { + "epoch": 9.870420017873101, + "grad_norm": 0.03564453125, + "learning_rate": 0.01793849624396864, + "loss": 0.8264, + "num_input_tokens_seen": 38469960, + "step": 66270 + }, + { + "epoch": 9.87116473041406, + "grad_norm": 0.028564453125, + "learning_rate": 0.01793658434289398, + "loss": 0.7966, + "num_input_tokens_seen": 38472872, + "step": 66275 + }, + { + "epoch": 9.871909442955019, + "grad_norm": 0.0257568359375, + "learning_rate": 0.01793467239220871, + "loss": 0.7765, + "num_input_tokens_seen": 38475752, + "step": 66280 + }, + { + "epoch": 9.87265415549598, + "grad_norm": 0.025634765625, + "learning_rate": 0.017932760391945128, + "loss": 0.7962, + "num_input_tokens_seen": 38478472, + "step": 66285 + }, + { + "epoch": 9.873398868036938, + "grad_norm": 0.018310546875, + "learning_rate": 0.017930848342135545, + "loss": 0.8164, + "num_input_tokens_seen": 38481128, + "step": 66290 + }, + { + "epoch": 9.874143580577897, + "grad_norm": 0.019775390625, + "learning_rate": 0.01792893624281226, + "loss": 0.7758, + "num_input_tokens_seen": 38484104, + "step": 66295 + }, + { + "epoch": 9.874888293118856, + "grad_norm": 0.027099609375, + "learning_rate": 0.01792702409400757, + "loss": 0.8176, + "num_input_tokens_seen": 38487016, + "step": 66300 + }, + { + "epoch": 9.875633005659815, + "grad_norm": 0.034912109375, + "learning_rate": 0.017925111895753784, + "loss": 0.8141, + "num_input_tokens_seen": 38489832, + "step": 66305 + }, + { + "epoch": 9.876377718200775, + "grad_norm": 0.0291748046875, + "learning_rate": 0.017923199648083202, + "loss": 0.816, + "num_input_tokens_seen": 38492968, + "step": 66310 + }, + { + "epoch": 9.877122430741734, + "grad_norm": 0.0279541015625, + "learning_rate": 0.017921287351028135, + "loss": 0.8079, + "num_input_tokens_seen": 38495784, + "step": 66315 + }, + { + "epoch": 9.877867143282693, + "grad_norm": 0.0234375, + "learning_rate": 0.017919375004620892, + "loss": 0.7843, + "num_input_tokens_seen": 38498760, + "step": 66320 + }, + { + "epoch": 9.878611855823651, + "grad_norm": 0.0198974609375, + "learning_rate": 0.01791746260889377, + "loss": 0.8128, + "num_input_tokens_seen": 38501832, + "step": 66325 + }, + { + "epoch": 9.879356568364612, + "grad_norm": 0.032470703125, + "learning_rate": 0.017915550163879086, + "loss": 0.8196, + "num_input_tokens_seen": 38504648, + "step": 66330 + }, + { + "epoch": 9.88010128090557, + "grad_norm": 0.0242919921875, + "learning_rate": 0.017913637669609146, + "loss": 0.7936, + "num_input_tokens_seen": 38507592, + "step": 66335 + }, + { + "epoch": 9.88084599344653, + "grad_norm": 0.02392578125, + "learning_rate": 0.017911725126116253, + "loss": 0.7789, + "num_input_tokens_seen": 38510664, + "step": 66340 + }, + { + "epoch": 9.881590705987488, + "grad_norm": 0.0242919921875, + "learning_rate": 0.017909812533432726, + "loss": 0.788, + "num_input_tokens_seen": 38513384, + "step": 66345 + }, + { + "epoch": 9.882335418528449, + "grad_norm": 0.021484375, + "learning_rate": 0.017907899891590872, + "loss": 0.8196, + "num_input_tokens_seen": 38516520, + "step": 66350 + }, + { + "epoch": 9.883080131069407, + "grad_norm": 0.0281982421875, + "learning_rate": 0.01790598720062301, + "loss": 0.8032, + "num_input_tokens_seen": 38519592, + "step": 66355 + }, + { + "epoch": 9.883824843610366, + "grad_norm": 0.0245361328125, + "learning_rate": 0.017904074460561446, + "loss": 0.8036, + "num_input_tokens_seen": 38522408, + "step": 66360 + }, + { + "epoch": 9.884569556151325, + "grad_norm": 0.027587890625, + "learning_rate": 0.01790216167143849, + "loss": 0.7757, + "num_input_tokens_seen": 38525512, + "step": 66365 + }, + { + "epoch": 9.885314268692285, + "grad_norm": 0.051025390625, + "learning_rate": 0.017900248833286468, + "loss": 0.8095, + "num_input_tokens_seen": 38528520, + "step": 66370 + }, + { + "epoch": 9.886058981233244, + "grad_norm": 0.034912109375, + "learning_rate": 0.01789833594613769, + "loss": 0.7896, + "num_input_tokens_seen": 38531496, + "step": 66375 + }, + { + "epoch": 9.886803693774203, + "grad_norm": 0.0299072265625, + "learning_rate": 0.01789642301002447, + "loss": 0.7891, + "num_input_tokens_seen": 38534184, + "step": 66380 + }, + { + "epoch": 9.887548406315162, + "grad_norm": 0.0301513671875, + "learning_rate": 0.01789451002497913, + "loss": 0.7816, + "num_input_tokens_seen": 38537256, + "step": 66385 + }, + { + "epoch": 9.888293118856122, + "grad_norm": 0.02587890625, + "learning_rate": 0.01789259699103398, + "loss": 0.8374, + "num_input_tokens_seen": 38540136, + "step": 66390 + }, + { + "epoch": 9.889037831397081, + "grad_norm": 0.0281982421875, + "learning_rate": 0.017890683908221346, + "loss": 0.7903, + "num_input_tokens_seen": 38542920, + "step": 66395 + }, + { + "epoch": 9.88978254393804, + "grad_norm": 0.029296875, + "learning_rate": 0.017888770776573543, + "loss": 0.7971, + "num_input_tokens_seen": 38545768, + "step": 66400 + }, + { + "epoch": 9.890527256478999, + "grad_norm": 0.031494140625, + "learning_rate": 0.017886857596122897, + "loss": 0.8174, + "num_input_tokens_seen": 38548552, + "step": 66405 + }, + { + "epoch": 9.891271969019959, + "grad_norm": 0.0247802734375, + "learning_rate": 0.017884944366901724, + "loss": 0.8115, + "num_input_tokens_seen": 38551208, + "step": 66410 + }, + { + "epoch": 9.892016681560918, + "grad_norm": 0.02880859375, + "learning_rate": 0.01788303108894235, + "loss": 0.7948, + "num_input_tokens_seen": 38553896, + "step": 66415 + }, + { + "epoch": 9.892761394101877, + "grad_norm": 0.0225830078125, + "learning_rate": 0.017881117762277098, + "loss": 0.7943, + "num_input_tokens_seen": 38557000, + "step": 66420 + }, + { + "epoch": 9.893506106642835, + "grad_norm": 0.0218505859375, + "learning_rate": 0.017879204386938286, + "loss": 0.7986, + "num_input_tokens_seen": 38560008, + "step": 66425 + }, + { + "epoch": 9.894250819183796, + "grad_norm": 0.039306640625, + "learning_rate": 0.01787729096295824, + "loss": 0.7955, + "num_input_tokens_seen": 38562920, + "step": 66430 + }, + { + "epoch": 9.894995531724755, + "grad_norm": 0.03369140625, + "learning_rate": 0.01787537749036929, + "loss": 0.7797, + "num_input_tokens_seen": 38565864, + "step": 66435 + }, + { + "epoch": 9.895740244265713, + "grad_norm": 0.033935546875, + "learning_rate": 0.017873463969203758, + "loss": 0.8157, + "num_input_tokens_seen": 38568904, + "step": 66440 + }, + { + "epoch": 9.896484956806672, + "grad_norm": 0.024658203125, + "learning_rate": 0.017871550399493976, + "loss": 0.7927, + "num_input_tokens_seen": 38571784, + "step": 66445 + }, + { + "epoch": 9.897229669347631, + "grad_norm": 0.042724609375, + "learning_rate": 0.017869636781272267, + "loss": 0.7836, + "num_input_tokens_seen": 38574632, + "step": 66450 + }, + { + "epoch": 9.897974381888591, + "grad_norm": 0.0272216796875, + "learning_rate": 0.017867723114570966, + "loss": 0.7842, + "num_input_tokens_seen": 38577448, + "step": 66455 + }, + { + "epoch": 9.89871909442955, + "grad_norm": 0.033935546875, + "learning_rate": 0.01786580939942239, + "loss": 0.8152, + "num_input_tokens_seen": 38580520, + "step": 66460 + }, + { + "epoch": 9.899463806970509, + "grad_norm": 0.029052734375, + "learning_rate": 0.01786389563585888, + "loss": 0.7861, + "num_input_tokens_seen": 38583528, + "step": 66465 + }, + { + "epoch": 9.900208519511468, + "grad_norm": 0.0220947265625, + "learning_rate": 0.017861981823912767, + "loss": 0.8041, + "num_input_tokens_seen": 38586248, + "step": 66470 + }, + { + "epoch": 9.900953232052428, + "grad_norm": 0.039794921875, + "learning_rate": 0.01786006796361637, + "loss": 0.7935, + "num_input_tokens_seen": 38589000, + "step": 66475 + }, + { + "epoch": 9.901697944593387, + "grad_norm": 0.026611328125, + "learning_rate": 0.01785815405500204, + "loss": 0.8217, + "num_input_tokens_seen": 38592232, + "step": 66480 + }, + { + "epoch": 9.902442657134346, + "grad_norm": 0.0306396484375, + "learning_rate": 0.017856240098102105, + "loss": 0.7837, + "num_input_tokens_seen": 38595144, + "step": 66485 + }, + { + "epoch": 9.903187369675305, + "grad_norm": 0.0247802734375, + "learning_rate": 0.01785432609294889, + "loss": 0.8234, + "num_input_tokens_seen": 38597768, + "step": 66490 + }, + { + "epoch": 9.903932082216265, + "grad_norm": 0.0233154296875, + "learning_rate": 0.01785241203957474, + "loss": 0.8031, + "num_input_tokens_seen": 38600680, + "step": 66495 + }, + { + "epoch": 9.904676794757224, + "grad_norm": 0.020263671875, + "learning_rate": 0.017850497938011984, + "loss": 0.7847, + "num_input_tokens_seen": 38603432, + "step": 66500 + }, + { + "epoch": 9.905421507298183, + "grad_norm": 0.037109375, + "learning_rate": 0.017848583788292967, + "loss": 0.7989, + "num_input_tokens_seen": 38606312, + "step": 66505 + }, + { + "epoch": 9.906166219839141, + "grad_norm": 0.01708984375, + "learning_rate": 0.017846669590450018, + "loss": 0.8306, + "num_input_tokens_seen": 38609256, + "step": 66510 + }, + { + "epoch": 9.906910932380102, + "grad_norm": 0.0272216796875, + "learning_rate": 0.017844755344515482, + "loss": 0.7818, + "num_input_tokens_seen": 38612360, + "step": 66515 + }, + { + "epoch": 9.90765564492106, + "grad_norm": 0.044677734375, + "learning_rate": 0.017842841050521696, + "loss": 0.8242, + "num_input_tokens_seen": 38615048, + "step": 66520 + }, + { + "epoch": 9.90840035746202, + "grad_norm": 0.0233154296875, + "learning_rate": 0.017840926708500997, + "loss": 0.7797, + "num_input_tokens_seen": 38617960, + "step": 66525 + }, + { + "epoch": 9.909145070002978, + "grad_norm": 0.03759765625, + "learning_rate": 0.017839012318485732, + "loss": 0.8008, + "num_input_tokens_seen": 38620776, + "step": 66530 + }, + { + "epoch": 9.909889782543939, + "grad_norm": 0.037353515625, + "learning_rate": 0.01783709788050824, + "loss": 0.7737, + "num_input_tokens_seen": 38623720, + "step": 66535 + }, + { + "epoch": 9.910634495084897, + "grad_norm": 0.026123046875, + "learning_rate": 0.01783518339460086, + "loss": 0.7993, + "num_input_tokens_seen": 38626664, + "step": 66540 + }, + { + "epoch": 9.911379207625856, + "grad_norm": 0.033447265625, + "learning_rate": 0.017833268860795934, + "loss": 0.815, + "num_input_tokens_seen": 38629736, + "step": 66545 + }, + { + "epoch": 9.912123920166815, + "grad_norm": 0.018798828125, + "learning_rate": 0.017831354279125817, + "loss": 0.8186, + "num_input_tokens_seen": 38632616, + "step": 66550 + }, + { + "epoch": 9.912868632707776, + "grad_norm": 0.0250244140625, + "learning_rate": 0.017829439649622844, + "loss": 0.7935, + "num_input_tokens_seen": 38635176, + "step": 66555 + }, + { + "epoch": 9.913613345248734, + "grad_norm": 0.037353515625, + "learning_rate": 0.017827524972319368, + "loss": 0.7847, + "num_input_tokens_seen": 38637928, + "step": 66560 + }, + { + "epoch": 9.914358057789693, + "grad_norm": 0.021240234375, + "learning_rate": 0.017825610247247724, + "loss": 0.8228, + "num_input_tokens_seen": 38640776, + "step": 66565 + }, + { + "epoch": 9.915102770330652, + "grad_norm": 0.03564453125, + "learning_rate": 0.017823695474440272, + "loss": 0.8146, + "num_input_tokens_seen": 38643720, + "step": 66570 + }, + { + "epoch": 9.915847482871612, + "grad_norm": 0.02978515625, + "learning_rate": 0.017821780653929354, + "loss": 0.7879, + "num_input_tokens_seen": 38646600, + "step": 66575 + }, + { + "epoch": 9.916592195412571, + "grad_norm": 0.052978515625, + "learning_rate": 0.01781986578574732, + "loss": 0.789, + "num_input_tokens_seen": 38649512, + "step": 66580 + }, + { + "epoch": 9.91733690795353, + "grad_norm": 0.0230712890625, + "learning_rate": 0.017817950869926522, + "loss": 0.7737, + "num_input_tokens_seen": 38652200, + "step": 66585 + }, + { + "epoch": 9.918081620494489, + "grad_norm": 0.039794921875, + "learning_rate": 0.017816035906499304, + "loss": 0.8146, + "num_input_tokens_seen": 38654984, + "step": 66590 + }, + { + "epoch": 9.91882633303545, + "grad_norm": 0.0269775390625, + "learning_rate": 0.017814120895498022, + "loss": 0.7809, + "num_input_tokens_seen": 38658120, + "step": 66595 + }, + { + "epoch": 9.919571045576408, + "grad_norm": 0.017333984375, + "learning_rate": 0.017812205836955027, + "loss": 0.7767, + "num_input_tokens_seen": 38661096, + "step": 66600 + }, + { + "epoch": 9.920315758117367, + "grad_norm": 0.016357421875, + "learning_rate": 0.01781029073090267, + "loss": 0.797, + "num_input_tokens_seen": 38663880, + "step": 66605 + }, + { + "epoch": 9.921060470658325, + "grad_norm": 0.02880859375, + "learning_rate": 0.01780837557737331, + "loss": 0.8117, + "num_input_tokens_seen": 38667208, + "step": 66610 + }, + { + "epoch": 9.921805183199286, + "grad_norm": 0.0184326171875, + "learning_rate": 0.017806460376399307, + "loss": 0.7853, + "num_input_tokens_seen": 38669928, + "step": 66615 + }, + { + "epoch": 9.922549895740245, + "grad_norm": 0.0478515625, + "learning_rate": 0.017804545128013, + "loss": 0.7433, + "num_input_tokens_seen": 38672840, + "step": 66620 + }, + { + "epoch": 9.923294608281203, + "grad_norm": 0.02734375, + "learning_rate": 0.017802629832246755, + "loss": 0.8031, + "num_input_tokens_seen": 38675688, + "step": 66625 + }, + { + "epoch": 9.924039320822162, + "grad_norm": 0.044189453125, + "learning_rate": 0.017800714489132927, + "loss": 0.8046, + "num_input_tokens_seen": 38678568, + "step": 66630 + }, + { + "epoch": 9.924784033363121, + "grad_norm": 0.0673828125, + "learning_rate": 0.017798799098703872, + "loss": 0.8212, + "num_input_tokens_seen": 38681288, + "step": 66635 + }, + { + "epoch": 9.925528745904082, + "grad_norm": 0.02587890625, + "learning_rate": 0.017796883660991955, + "loss": 0.7763, + "num_input_tokens_seen": 38683880, + "step": 66640 + }, + { + "epoch": 9.92627345844504, + "grad_norm": 0.031494140625, + "learning_rate": 0.017794968176029526, + "loss": 0.802, + "num_input_tokens_seen": 38686792, + "step": 66645 + }, + { + "epoch": 9.927018170985999, + "grad_norm": 0.0296630859375, + "learning_rate": 0.017793052643848957, + "loss": 0.775, + "num_input_tokens_seen": 38689512, + "step": 66650 + }, + { + "epoch": 9.927762883526958, + "grad_norm": 0.024169921875, + "learning_rate": 0.017791137064482602, + "loss": 0.7952, + "num_input_tokens_seen": 38692744, + "step": 66655 + }, + { + "epoch": 9.928507596067918, + "grad_norm": 0.025634765625, + "learning_rate": 0.01778922143796282, + "loss": 0.794, + "num_input_tokens_seen": 38695816, + "step": 66660 + }, + { + "epoch": 9.929252308608877, + "grad_norm": 0.03369140625, + "learning_rate": 0.017787305764321974, + "loss": 0.8063, + "num_input_tokens_seen": 38698792, + "step": 66665 + }, + { + "epoch": 9.929997021149836, + "grad_norm": 0.03564453125, + "learning_rate": 0.017785390043592433, + "loss": 0.8169, + "num_input_tokens_seen": 38701608, + "step": 66670 + }, + { + "epoch": 9.930741733690795, + "grad_norm": 0.029052734375, + "learning_rate": 0.01778347427580656, + "loss": 0.7761, + "num_input_tokens_seen": 38704424, + "step": 66675 + }, + { + "epoch": 9.931486446231755, + "grad_norm": 0.026123046875, + "learning_rate": 0.017781558460996712, + "loss": 0.8044, + "num_input_tokens_seen": 38707400, + "step": 66680 + }, + { + "epoch": 9.932231158772714, + "grad_norm": 0.035400390625, + "learning_rate": 0.017779642599195268, + "loss": 0.796, + "num_input_tokens_seen": 38710248, + "step": 66685 + }, + { + "epoch": 9.932975871313673, + "grad_norm": 0.03173828125, + "learning_rate": 0.017777726690434585, + "loss": 0.8027, + "num_input_tokens_seen": 38713096, + "step": 66690 + }, + { + "epoch": 9.933720583854631, + "grad_norm": 0.032470703125, + "learning_rate": 0.01777581073474703, + "loss": 0.7894, + "num_input_tokens_seen": 38716328, + "step": 66695 + }, + { + "epoch": 9.934465296395592, + "grad_norm": 0.0240478515625, + "learning_rate": 0.017773894732164976, + "loss": 0.8359, + "num_input_tokens_seen": 38719400, + "step": 66700 + }, + { + "epoch": 9.93521000893655, + "grad_norm": 0.0283203125, + "learning_rate": 0.01777197868272079, + "loss": 0.8278, + "num_input_tokens_seen": 38722344, + "step": 66705 + }, + { + "epoch": 9.93595472147751, + "grad_norm": 0.042724609375, + "learning_rate": 0.017770062586446842, + "loss": 0.805, + "num_input_tokens_seen": 38725192, + "step": 66710 + }, + { + "epoch": 9.936699434018468, + "grad_norm": 0.04150390625, + "learning_rate": 0.0177681464433755, + "loss": 0.7745, + "num_input_tokens_seen": 38728072, + "step": 66715 + }, + { + "epoch": 9.937444146559429, + "grad_norm": 0.033447265625, + "learning_rate": 0.01776623025353914, + "loss": 0.7708, + "num_input_tokens_seen": 38730856, + "step": 66720 + }, + { + "epoch": 9.938188859100388, + "grad_norm": 0.043701171875, + "learning_rate": 0.017764314016970126, + "loss": 0.8187, + "num_input_tokens_seen": 38733640, + "step": 66725 + }, + { + "epoch": 9.938933571641346, + "grad_norm": 0.027587890625, + "learning_rate": 0.017762397733700842, + "loss": 0.7953, + "num_input_tokens_seen": 38736488, + "step": 66730 + }, + { + "epoch": 9.939678284182305, + "grad_norm": 0.03271484375, + "learning_rate": 0.017760481403763653, + "loss": 0.7808, + "num_input_tokens_seen": 38739816, + "step": 66735 + }, + { + "epoch": 9.940422996723266, + "grad_norm": 0.016845703125, + "learning_rate": 0.01775856502719094, + "loss": 0.8147, + "num_input_tokens_seen": 38742760, + "step": 66740 + }, + { + "epoch": 9.941167709264224, + "grad_norm": 0.033935546875, + "learning_rate": 0.01775664860401507, + "loss": 0.8016, + "num_input_tokens_seen": 38745288, + "step": 66745 + }, + { + "epoch": 9.941912421805183, + "grad_norm": 0.0250244140625, + "learning_rate": 0.017754732134268425, + "loss": 0.7964, + "num_input_tokens_seen": 38747944, + "step": 66750 + }, + { + "epoch": 9.942657134346142, + "grad_norm": 0.02783203125, + "learning_rate": 0.01775281561798338, + "loss": 0.815, + "num_input_tokens_seen": 38750792, + "step": 66755 + }, + { + "epoch": 9.943401846887102, + "grad_norm": 0.035888671875, + "learning_rate": 0.017750899055192317, + "loss": 0.7879, + "num_input_tokens_seen": 38753704, + "step": 66760 + }, + { + "epoch": 9.944146559428061, + "grad_norm": 0.02783203125, + "learning_rate": 0.017748982445927604, + "loss": 0.7999, + "num_input_tokens_seen": 38756744, + "step": 66765 + }, + { + "epoch": 9.94489127196902, + "grad_norm": 0.02392578125, + "learning_rate": 0.017747065790221627, + "loss": 0.7975, + "num_input_tokens_seen": 38759656, + "step": 66770 + }, + { + "epoch": 9.945635984509979, + "grad_norm": 0.0301513671875, + "learning_rate": 0.01774514908810677, + "loss": 0.7871, + "num_input_tokens_seen": 38763720, + "step": 66775 + }, + { + "epoch": 9.946380697050937, + "grad_norm": 0.041015625, + "learning_rate": 0.017743232339615406, + "loss": 0.7897, + "num_input_tokens_seen": 38766376, + "step": 66780 + }, + { + "epoch": 9.947125409591898, + "grad_norm": 0.033203125, + "learning_rate": 0.017741315544779927, + "loss": 0.7861, + "num_input_tokens_seen": 38769512, + "step": 66785 + }, + { + "epoch": 9.947870122132857, + "grad_norm": 0.0302734375, + "learning_rate": 0.0177393987036327, + "loss": 0.8112, + "num_input_tokens_seen": 38772648, + "step": 66790 + }, + { + "epoch": 9.948614834673815, + "grad_norm": 0.0250244140625, + "learning_rate": 0.017737481816206116, + "loss": 0.7815, + "num_input_tokens_seen": 38775528, + "step": 66795 + }, + { + "epoch": 9.949359547214776, + "grad_norm": 0.039306640625, + "learning_rate": 0.01773556488253256, + "loss": 0.8025, + "num_input_tokens_seen": 38778568, + "step": 66800 + }, + { + "epoch": 9.950104259755735, + "grad_norm": 0.04345703125, + "learning_rate": 0.017733647902644416, + "loss": 0.8255, + "num_input_tokens_seen": 38781384, + "step": 66805 + }, + { + "epoch": 9.950848972296694, + "grad_norm": 0.04833984375, + "learning_rate": 0.017731730876574068, + "loss": 0.8141, + "num_input_tokens_seen": 38784040, + "step": 66810 + }, + { + "epoch": 9.951593684837652, + "grad_norm": 0.035400390625, + "learning_rate": 0.017729813804353907, + "loss": 0.7863, + "num_input_tokens_seen": 38787176, + "step": 66815 + }, + { + "epoch": 9.952338397378611, + "grad_norm": 0.03662109375, + "learning_rate": 0.017727896686016317, + "loss": 0.8188, + "num_input_tokens_seen": 38789992, + "step": 66820 + }, + { + "epoch": 9.953083109919572, + "grad_norm": 0.0322265625, + "learning_rate": 0.01772597952159368, + "loss": 0.7965, + "num_input_tokens_seen": 38793320, + "step": 66825 + }, + { + "epoch": 9.95382782246053, + "grad_norm": 0.02197265625, + "learning_rate": 0.01772406231111839, + "loss": 0.7951, + "num_input_tokens_seen": 38796200, + "step": 66830 + }, + { + "epoch": 9.954572535001489, + "grad_norm": 0.03125, + "learning_rate": 0.01772214505462284, + "loss": 0.7871, + "num_input_tokens_seen": 38799272, + "step": 66835 + }, + { + "epoch": 9.955317247542448, + "grad_norm": 0.0478515625, + "learning_rate": 0.01772022775213941, + "loss": 0.7959, + "num_input_tokens_seen": 38802024, + "step": 66840 + }, + { + "epoch": 9.956061960083408, + "grad_norm": 0.0380859375, + "learning_rate": 0.0177183104037005, + "loss": 0.8193, + "num_input_tokens_seen": 38804776, + "step": 66845 + }, + { + "epoch": 9.956806672624367, + "grad_norm": 0.02783203125, + "learning_rate": 0.017716393009338503, + "loss": 0.7931, + "num_input_tokens_seen": 38807688, + "step": 66850 + }, + { + "epoch": 9.957551385165326, + "grad_norm": 0.0380859375, + "learning_rate": 0.017714475569085803, + "loss": 0.8158, + "num_input_tokens_seen": 38810280, + "step": 66855 + }, + { + "epoch": 9.958296097706285, + "grad_norm": 0.043212890625, + "learning_rate": 0.017712558082974797, + "loss": 0.7972, + "num_input_tokens_seen": 38813128, + "step": 66860 + }, + { + "epoch": 9.959040810247245, + "grad_norm": 0.02880859375, + "learning_rate": 0.017710640551037878, + "loss": 0.7876, + "num_input_tokens_seen": 38816072, + "step": 66865 + }, + { + "epoch": 9.959785522788204, + "grad_norm": 0.032958984375, + "learning_rate": 0.01770872297330744, + "loss": 0.7937, + "num_input_tokens_seen": 38819240, + "step": 66870 + }, + { + "epoch": 9.960530235329163, + "grad_norm": 0.035400390625, + "learning_rate": 0.017706805349815884, + "loss": 0.799, + "num_input_tokens_seen": 38821928, + "step": 66875 + }, + { + "epoch": 9.961274947870121, + "grad_norm": 0.03857421875, + "learning_rate": 0.0177048876805956, + "loss": 0.8081, + "num_input_tokens_seen": 38825032, + "step": 66880 + }, + { + "epoch": 9.962019660411082, + "grad_norm": 0.03955078125, + "learning_rate": 0.01770296996567899, + "loss": 0.8102, + "num_input_tokens_seen": 38827656, + "step": 66885 + }, + { + "epoch": 9.96276437295204, + "grad_norm": 0.0311279296875, + "learning_rate": 0.017701052205098454, + "loss": 0.7997, + "num_input_tokens_seen": 38830568, + "step": 66890 + }, + { + "epoch": 9.963509085493, + "grad_norm": 0.032958984375, + "learning_rate": 0.017699134398886377, + "loss": 0.7763, + "num_input_tokens_seen": 38833640, + "step": 66895 + }, + { + "epoch": 9.964253798033958, + "grad_norm": 0.037353515625, + "learning_rate": 0.017697216547075175, + "loss": 0.789, + "num_input_tokens_seen": 38836456, + "step": 66900 + }, + { + "epoch": 9.964998510574919, + "grad_norm": 0.03662109375, + "learning_rate": 0.01769529864969724, + "loss": 0.8209, + "num_input_tokens_seen": 38839208, + "step": 66905 + }, + { + "epoch": 9.965743223115878, + "grad_norm": 0.03662109375, + "learning_rate": 0.017693380706784965, + "loss": 0.7958, + "num_input_tokens_seen": 38841992, + "step": 66910 + }, + { + "epoch": 9.966487935656836, + "grad_norm": 0.0284423828125, + "learning_rate": 0.01769146271837077, + "loss": 0.8077, + "num_input_tokens_seen": 38844808, + "step": 66915 + }, + { + "epoch": 9.967232648197795, + "grad_norm": 0.025146484375, + "learning_rate": 0.01768954468448705, + "loss": 0.7901, + "num_input_tokens_seen": 38847656, + "step": 66920 + }, + { + "epoch": 9.967977360738756, + "grad_norm": 0.0291748046875, + "learning_rate": 0.017687626605166196, + "loss": 0.8273, + "num_input_tokens_seen": 38850696, + "step": 66925 + }, + { + "epoch": 9.968722073279714, + "grad_norm": 0.0286865234375, + "learning_rate": 0.017685708480440623, + "loss": 0.8062, + "num_input_tokens_seen": 38854056, + "step": 66930 + }, + { + "epoch": 9.969466785820673, + "grad_norm": 0.0301513671875, + "learning_rate": 0.01768379031034274, + "loss": 0.8223, + "num_input_tokens_seen": 38857032, + "step": 66935 + }, + { + "epoch": 9.970211498361632, + "grad_norm": 0.046875, + "learning_rate": 0.01768187209490495, + "loss": 0.7963, + "num_input_tokens_seen": 38860136, + "step": 66940 + }, + { + "epoch": 9.970956210902592, + "grad_norm": 0.0296630859375, + "learning_rate": 0.01767995383415965, + "loss": 0.8037, + "num_input_tokens_seen": 38863048, + "step": 66945 + }, + { + "epoch": 9.971700923443551, + "grad_norm": 0.048583984375, + "learning_rate": 0.01767803552813926, + "loss": 0.8052, + "num_input_tokens_seen": 38865992, + "step": 66950 + }, + { + "epoch": 9.97244563598451, + "grad_norm": 0.046875, + "learning_rate": 0.017676117176876175, + "loss": 0.8069, + "num_input_tokens_seen": 38869032, + "step": 66955 + }, + { + "epoch": 9.973190348525469, + "grad_norm": 0.044921875, + "learning_rate": 0.017674198780402817, + "loss": 0.8188, + "num_input_tokens_seen": 38871944, + "step": 66960 + }, + { + "epoch": 9.973935061066427, + "grad_norm": 0.041015625, + "learning_rate": 0.01767228033875158, + "loss": 0.8017, + "num_input_tokens_seen": 38874408, + "step": 66965 + }, + { + "epoch": 9.974679773607388, + "grad_norm": 0.027587890625, + "learning_rate": 0.017670361851954888, + "loss": 0.7963, + "num_input_tokens_seen": 38877256, + "step": 66970 + }, + { + "epoch": 9.975424486148347, + "grad_norm": 0.03662109375, + "learning_rate": 0.017668443320045146, + "loss": 0.8008, + "num_input_tokens_seen": 38879912, + "step": 66975 + }, + { + "epoch": 9.976169198689306, + "grad_norm": 0.0341796875, + "learning_rate": 0.017666524743054766, + "loss": 0.8066, + "num_input_tokens_seen": 38882600, + "step": 66980 + }, + { + "epoch": 9.976913911230266, + "grad_norm": 0.02978515625, + "learning_rate": 0.017664606121016165, + "loss": 0.7824, + "num_input_tokens_seen": 38885672, + "step": 66985 + }, + { + "epoch": 9.977658623771225, + "grad_norm": 0.033203125, + "learning_rate": 0.01766268745396175, + "loss": 0.7892, + "num_input_tokens_seen": 38888808, + "step": 66990 + }, + { + "epoch": 9.978403336312184, + "grad_norm": 0.03662109375, + "learning_rate": 0.01766076874192393, + "loss": 0.7833, + "num_input_tokens_seen": 38891816, + "step": 66995 + }, + { + "epoch": 9.979148048853142, + "grad_norm": 0.042236328125, + "learning_rate": 0.017658849984935134, + "loss": 0.7801, + "num_input_tokens_seen": 38894728, + "step": 67000 + }, + { + "epoch": 9.979892761394101, + "grad_norm": 0.03369140625, + "learning_rate": 0.017656931183027764, + "loss": 0.8046, + "num_input_tokens_seen": 38897864, + "step": 67005 + }, + { + "epoch": 9.980637473935062, + "grad_norm": 0.0286865234375, + "learning_rate": 0.017655012336234243, + "loss": 0.792, + "num_input_tokens_seen": 38901096, + "step": 67010 + }, + { + "epoch": 9.98138218647602, + "grad_norm": 0.09228515625, + "learning_rate": 0.01765309344458699, + "loss": 0.8106, + "num_input_tokens_seen": 38904040, + "step": 67015 + }, + { + "epoch": 9.98212689901698, + "grad_norm": 0.0289306640625, + "learning_rate": 0.017651174508118424, + "loss": 0.7845, + "num_input_tokens_seen": 38906888, + "step": 67020 + }, + { + "epoch": 9.982871611557938, + "grad_norm": 0.030517578125, + "learning_rate": 0.01764925552686095, + "loss": 0.7883, + "num_input_tokens_seen": 38909864, + "step": 67025 + }, + { + "epoch": 9.983616324098898, + "grad_norm": 0.033447265625, + "learning_rate": 0.017647336500847002, + "loss": 0.824, + "num_input_tokens_seen": 38912616, + "step": 67030 + }, + { + "epoch": 9.984361036639857, + "grad_norm": 0.02587890625, + "learning_rate": 0.01764541743010899, + "loss": 0.8025, + "num_input_tokens_seen": 38915368, + "step": 67035 + }, + { + "epoch": 9.985105749180816, + "grad_norm": 0.031982421875, + "learning_rate": 0.017643498314679342, + "loss": 0.8157, + "num_input_tokens_seen": 38918312, + "step": 67040 + }, + { + "epoch": 9.985850461721775, + "grad_norm": 0.0299072265625, + "learning_rate": 0.017641579154590473, + "loss": 0.799, + "num_input_tokens_seen": 38921320, + "step": 67045 + }, + { + "epoch": 9.986595174262735, + "grad_norm": 0.033447265625, + "learning_rate": 0.017639659949874816, + "loss": 0.7884, + "num_input_tokens_seen": 38924360, + "step": 67050 + }, + { + "epoch": 9.987339886803694, + "grad_norm": 0.0498046875, + "learning_rate": 0.017637740700564782, + "loss": 0.7906, + "num_input_tokens_seen": 38927336, + "step": 67055 + }, + { + "epoch": 9.988084599344653, + "grad_norm": 0.0299072265625, + "learning_rate": 0.017635821406692804, + "loss": 0.7883, + "num_input_tokens_seen": 38930056, + "step": 67060 + }, + { + "epoch": 9.988829311885612, + "grad_norm": 0.03857421875, + "learning_rate": 0.0176339020682913, + "loss": 0.7948, + "num_input_tokens_seen": 38932904, + "step": 67065 + }, + { + "epoch": 9.989574024426572, + "grad_norm": 0.033203125, + "learning_rate": 0.017631982685392695, + "loss": 0.8057, + "num_input_tokens_seen": 38935912, + "step": 67070 + }, + { + "epoch": 9.99031873696753, + "grad_norm": 0.04248046875, + "learning_rate": 0.01763006325802942, + "loss": 0.8048, + "num_input_tokens_seen": 38938920, + "step": 67075 + }, + { + "epoch": 9.99106344950849, + "grad_norm": 0.043212890625, + "learning_rate": 0.0176281437862339, + "loss": 0.7768, + "num_input_tokens_seen": 38941672, + "step": 67080 + }, + { + "epoch": 9.991808162049448, + "grad_norm": 0.044677734375, + "learning_rate": 0.01762622427003856, + "loss": 0.8081, + "num_input_tokens_seen": 38945064, + "step": 67085 + }, + { + "epoch": 9.992552874590409, + "grad_norm": 0.054443359375, + "learning_rate": 0.01762430470947583, + "loss": 0.8227, + "num_input_tokens_seen": 38947816, + "step": 67090 + }, + { + "epoch": 9.993297587131368, + "grad_norm": 0.0537109375, + "learning_rate": 0.01762238510457814, + "loss": 0.8048, + "num_input_tokens_seen": 38951048, + "step": 67095 + }, + { + "epoch": 9.994042299672326, + "grad_norm": 0.05712890625, + "learning_rate": 0.017620465455377918, + "loss": 0.7956, + "num_input_tokens_seen": 38953864, + "step": 67100 + }, + { + "epoch": 9.994787012213285, + "grad_norm": 0.03662109375, + "learning_rate": 0.0176185457619076, + "loss": 0.7837, + "num_input_tokens_seen": 38956616, + "step": 67105 + }, + { + "epoch": 9.995531724754246, + "grad_norm": 0.03369140625, + "learning_rate": 0.017616626024199604, + "loss": 0.8213, + "num_input_tokens_seen": 38959272, + "step": 67110 + }, + { + "epoch": 9.996276437295204, + "grad_norm": 0.0255126953125, + "learning_rate": 0.01761470624228638, + "loss": 0.8003, + "num_input_tokens_seen": 38961960, + "step": 67115 + }, + { + "epoch": 9.997021149836163, + "grad_norm": 0.035400390625, + "learning_rate": 0.017612786416200343, + "loss": 0.8147, + "num_input_tokens_seen": 38965192, + "step": 67120 + }, + { + "epoch": 9.997765862377122, + "grad_norm": 0.03662109375, + "learning_rate": 0.017610866545973942, + "loss": 0.8092, + "num_input_tokens_seen": 38968424, + "step": 67125 + }, + { + "epoch": 9.998510574918082, + "grad_norm": 0.04296875, + "learning_rate": 0.017608946631639602, + "loss": 0.7926, + "num_input_tokens_seen": 38971208, + "step": 67130 + }, + { + "epoch": 9.999255287459041, + "grad_norm": 0.035888671875, + "learning_rate": 0.017607026673229758, + "loss": 0.7866, + "num_input_tokens_seen": 38973960, + "step": 67135 + }, + { + "epoch": 10.0, + "grad_norm": 0.0849609375, + "learning_rate": 0.01760510667077685, + "loss": 0.8102, + "num_input_tokens_seen": 38976336, + "step": 67140 + }, + { + "epoch": 10.0, + "eval_loss": 0.8017409443855286, + "eval_runtime": 70.6577, + "eval_samples_per_second": 42.232, + "eval_steps_per_second": 10.558, + "num_input_tokens_seen": 38976336, + "step": 67140 + }, + { + "epoch": 10.000744712540959, + "grad_norm": 0.0279541015625, + "learning_rate": 0.017603186624313313, + "loss": 0.8091, + "num_input_tokens_seen": 38979152, + "step": 67145 + }, + { + "epoch": 10.001489425081918, + "grad_norm": 0.02880859375, + "learning_rate": 0.017601266533871587, + "loss": 0.7918, + "num_input_tokens_seen": 38982416, + "step": 67150 + }, + { + "epoch": 10.002234137622878, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0175993463994841, + "loss": 0.7996, + "num_input_tokens_seen": 38985136, + "step": 67155 + }, + { + "epoch": 10.002978850163837, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0175974262211833, + "loss": 0.8032, + "num_input_tokens_seen": 38988048, + "step": 67160 + }, + { + "epoch": 10.003723562704796, + "grad_norm": 0.04345703125, + "learning_rate": 0.017595505999001622, + "loss": 0.8192, + "num_input_tokens_seen": 38991152, + "step": 67165 + }, + { + "epoch": 10.004468275245754, + "grad_norm": 0.0517578125, + "learning_rate": 0.017593585732971512, + "loss": 0.8028, + "num_input_tokens_seen": 38994256, + "step": 67170 + }, + { + "epoch": 10.005212987786715, + "grad_norm": 0.0263671875, + "learning_rate": 0.017591665423125402, + "loss": 0.7831, + "num_input_tokens_seen": 38996976, + "step": 67175 + }, + { + "epoch": 10.005957700327674, + "grad_norm": 0.0311279296875, + "learning_rate": 0.017589745069495743, + "loss": 0.7978, + "num_input_tokens_seen": 38999792, + "step": 67180 + }, + { + "epoch": 10.006702412868632, + "grad_norm": 0.0380859375, + "learning_rate": 0.017587824672114977, + "loss": 0.7972, + "num_input_tokens_seen": 39003152, + "step": 67185 + }, + { + "epoch": 10.007447125409591, + "grad_norm": 0.0279541015625, + "learning_rate": 0.01758590423101554, + "loss": 0.7821, + "num_input_tokens_seen": 39005840, + "step": 67190 + }, + { + "epoch": 10.008191837950552, + "grad_norm": 0.0272216796875, + "learning_rate": 0.017583983746229874, + "loss": 0.7998, + "num_input_tokens_seen": 39008496, + "step": 67195 + }, + { + "epoch": 10.00893655049151, + "grad_norm": 0.0296630859375, + "learning_rate": 0.01758206321779043, + "loss": 0.7931, + "num_input_tokens_seen": 39011504, + "step": 67200 + }, + { + "epoch": 10.00968126303247, + "grad_norm": 0.054443359375, + "learning_rate": 0.017580142645729657, + "loss": 0.8125, + "num_input_tokens_seen": 39014608, + "step": 67205 + }, + { + "epoch": 10.010425975573428, + "grad_norm": 0.0186767578125, + "learning_rate": 0.017578222030079995, + "loss": 0.7984, + "num_input_tokens_seen": 39017648, + "step": 67210 + }, + { + "epoch": 10.011170688114388, + "grad_norm": 0.0380859375, + "learning_rate": 0.017576301370873893, + "loss": 0.8132, + "num_input_tokens_seen": 39020528, + "step": 67215 + }, + { + "epoch": 10.011915400655347, + "grad_norm": 0.06494140625, + "learning_rate": 0.017574380668143794, + "loss": 0.8047, + "num_input_tokens_seen": 39023568, + "step": 67220 + }, + { + "epoch": 10.012660113196306, + "grad_norm": 0.0302734375, + "learning_rate": 0.017572459921922155, + "loss": 0.7994, + "num_input_tokens_seen": 39026640, + "step": 67225 + }, + { + "epoch": 10.013404825737265, + "grad_norm": 0.024169921875, + "learning_rate": 0.017570539132241415, + "loss": 0.7761, + "num_input_tokens_seen": 39029360, + "step": 67230 + }, + { + "epoch": 10.014149538278225, + "grad_norm": 0.03759765625, + "learning_rate": 0.01756861829913403, + "loss": 0.7927, + "num_input_tokens_seen": 39032528, + "step": 67235 + }, + { + "epoch": 10.014894250819184, + "grad_norm": 0.0281982421875, + "learning_rate": 0.017566697422632453, + "loss": 0.7885, + "num_input_tokens_seen": 39035600, + "step": 67240 + }, + { + "epoch": 10.015638963360143, + "grad_norm": 0.068359375, + "learning_rate": 0.017564776502769133, + "loss": 0.8346, + "num_input_tokens_seen": 39038416, + "step": 67245 + }, + { + "epoch": 10.016383675901102, + "grad_norm": 0.033935546875, + "learning_rate": 0.017562855539576514, + "loss": 0.772, + "num_input_tokens_seen": 39041136, + "step": 67250 + }, + { + "epoch": 10.017128388442062, + "grad_norm": 0.052001953125, + "learning_rate": 0.017560934533087057, + "loss": 0.8128, + "num_input_tokens_seen": 39044240, + "step": 67255 + }, + { + "epoch": 10.01787310098302, + "grad_norm": 0.051513671875, + "learning_rate": 0.017559013483333216, + "loss": 0.8176, + "num_input_tokens_seen": 39047216, + "step": 67260 + }, + { + "epoch": 10.01861781352398, + "grad_norm": 0.0306396484375, + "learning_rate": 0.01755709239034744, + "loss": 0.7866, + "num_input_tokens_seen": 39050032, + "step": 67265 + }, + { + "epoch": 10.019362526064938, + "grad_norm": 0.032470703125, + "learning_rate": 0.017555171254162193, + "loss": 0.7886, + "num_input_tokens_seen": 39053104, + "step": 67270 + }, + { + "epoch": 10.020107238605899, + "grad_norm": 0.0322265625, + "learning_rate": 0.017553250074809918, + "loss": 0.7956, + "num_input_tokens_seen": 39056080, + "step": 67275 + }, + { + "epoch": 10.020851951146858, + "grad_norm": 0.04638671875, + "learning_rate": 0.017551328852323085, + "loss": 0.7957, + "num_input_tokens_seen": 39058800, + "step": 67280 + }, + { + "epoch": 10.021596663687816, + "grad_norm": 0.046630859375, + "learning_rate": 0.01754940758673414, + "loss": 0.7857, + "num_input_tokens_seen": 39061520, + "step": 67285 + }, + { + "epoch": 10.022341376228775, + "grad_norm": 0.041015625, + "learning_rate": 0.017547486278075546, + "loss": 0.7751, + "num_input_tokens_seen": 39064400, + "step": 67290 + }, + { + "epoch": 10.023086088769736, + "grad_norm": 0.0311279296875, + "learning_rate": 0.017545564926379757, + "loss": 0.8055, + "num_input_tokens_seen": 39067120, + "step": 67295 + }, + { + "epoch": 10.023830801310694, + "grad_norm": 0.037109375, + "learning_rate": 0.017543643531679236, + "loss": 0.8062, + "num_input_tokens_seen": 39070160, + "step": 67300 + }, + { + "epoch": 10.024575513851653, + "grad_norm": 0.060302734375, + "learning_rate": 0.017541722094006448, + "loss": 0.835, + "num_input_tokens_seen": 39073200, + "step": 67305 + }, + { + "epoch": 10.025320226392612, + "grad_norm": 0.032958984375, + "learning_rate": 0.017539800613393847, + "loss": 0.7742, + "num_input_tokens_seen": 39075824, + "step": 67310 + }, + { + "epoch": 10.02606493893357, + "grad_norm": 0.0301513671875, + "learning_rate": 0.017537879089873896, + "loss": 0.7763, + "num_input_tokens_seen": 39078544, + "step": 67315 + }, + { + "epoch": 10.026809651474531, + "grad_norm": 0.035400390625, + "learning_rate": 0.017535957523479057, + "loss": 0.8021, + "num_input_tokens_seen": 39081328, + "step": 67320 + }, + { + "epoch": 10.02755436401549, + "grad_norm": 0.06103515625, + "learning_rate": 0.017534035914241786, + "loss": 0.8028, + "num_input_tokens_seen": 39084272, + "step": 67325 + }, + { + "epoch": 10.028299076556449, + "grad_norm": 0.03369140625, + "learning_rate": 0.017532114262194563, + "loss": 0.7909, + "num_input_tokens_seen": 39087344, + "step": 67330 + }, + { + "epoch": 10.029043789097408, + "grad_norm": 0.0576171875, + "learning_rate": 0.01753019256736984, + "loss": 0.8139, + "num_input_tokens_seen": 39090128, + "step": 67335 + }, + { + "epoch": 10.029788501638368, + "grad_norm": 0.03857421875, + "learning_rate": 0.01752827082980008, + "loss": 0.7954, + "num_input_tokens_seen": 39093200, + "step": 67340 + }, + { + "epoch": 10.030533214179327, + "grad_norm": 0.0419921875, + "learning_rate": 0.017526349049517765, + "loss": 0.796, + "num_input_tokens_seen": 39096464, + "step": 67345 + }, + { + "epoch": 10.031277926720286, + "grad_norm": 0.056396484375, + "learning_rate": 0.017524427226555343, + "loss": 0.7922, + "num_input_tokens_seen": 39099664, + "step": 67350 + }, + { + "epoch": 10.032022639261244, + "grad_norm": 0.0458984375, + "learning_rate": 0.017522505360945292, + "loss": 0.8077, + "num_input_tokens_seen": 39102736, + "step": 67355 + }, + { + "epoch": 10.032767351802205, + "grad_norm": 0.04248046875, + "learning_rate": 0.017520583452720073, + "loss": 0.8031, + "num_input_tokens_seen": 39105872, + "step": 67360 + }, + { + "epoch": 10.033512064343164, + "grad_norm": 0.037841796875, + "learning_rate": 0.01751866150191216, + "loss": 0.7976, + "num_input_tokens_seen": 39108720, + "step": 67365 + }, + { + "epoch": 10.034256776884122, + "grad_norm": 0.037841796875, + "learning_rate": 0.017516739508554018, + "loss": 0.795, + "num_input_tokens_seen": 39111504, + "step": 67370 + }, + { + "epoch": 10.035001489425081, + "grad_norm": 0.0322265625, + "learning_rate": 0.01751481747267812, + "loss": 0.806, + "num_input_tokens_seen": 39114416, + "step": 67375 + }, + { + "epoch": 10.035746201966042, + "grad_norm": 0.0419921875, + "learning_rate": 0.017512895394316944, + "loss": 0.7974, + "num_input_tokens_seen": 39117360, + "step": 67380 + }, + { + "epoch": 10.036490914507, + "grad_norm": 0.0341796875, + "learning_rate": 0.017510973273502952, + "loss": 0.8072, + "num_input_tokens_seen": 39120464, + "step": 67385 + }, + { + "epoch": 10.03723562704796, + "grad_norm": 0.04296875, + "learning_rate": 0.01750905111026862, + "loss": 0.8144, + "num_input_tokens_seen": 39123376, + "step": 67390 + }, + { + "epoch": 10.037980339588918, + "grad_norm": 0.037109375, + "learning_rate": 0.01750712890464641, + "loss": 0.8105, + "num_input_tokens_seen": 39126224, + "step": 67395 + }, + { + "epoch": 10.038725052129879, + "grad_norm": 0.0303955078125, + "learning_rate": 0.017505206656668812, + "loss": 0.7942, + "num_input_tokens_seen": 39129072, + "step": 67400 + }, + { + "epoch": 10.039469764670837, + "grad_norm": 0.026123046875, + "learning_rate": 0.017503284366368294, + "loss": 0.8111, + "num_input_tokens_seen": 39131952, + "step": 67405 + }, + { + "epoch": 10.040214477211796, + "grad_norm": 0.04736328125, + "learning_rate": 0.01750136203377733, + "loss": 0.8005, + "num_input_tokens_seen": 39134864, + "step": 67410 + }, + { + "epoch": 10.040959189752755, + "grad_norm": 0.03369140625, + "learning_rate": 0.017499439658928393, + "loss": 0.7868, + "num_input_tokens_seen": 39137552, + "step": 67415 + }, + { + "epoch": 10.041703902293715, + "grad_norm": 0.043701171875, + "learning_rate": 0.017497517241853968, + "loss": 0.8199, + "num_input_tokens_seen": 39140720, + "step": 67420 + }, + { + "epoch": 10.042448614834674, + "grad_norm": 0.026611328125, + "learning_rate": 0.01749559478258653, + "loss": 0.7979, + "num_input_tokens_seen": 39143856, + "step": 67425 + }, + { + "epoch": 10.043193327375633, + "grad_norm": 0.0220947265625, + "learning_rate": 0.01749367228115855, + "loss": 0.8465, + "num_input_tokens_seen": 39146544, + "step": 67430 + }, + { + "epoch": 10.043938039916592, + "grad_norm": 0.0244140625, + "learning_rate": 0.01749174973760251, + "loss": 0.7872, + "num_input_tokens_seen": 39149296, + "step": 67435 + }, + { + "epoch": 10.044682752457552, + "grad_norm": 0.025146484375, + "learning_rate": 0.017489827151950892, + "loss": 0.7868, + "num_input_tokens_seen": 39152336, + "step": 67440 + }, + { + "epoch": 10.045427464998511, + "grad_norm": 0.0311279296875, + "learning_rate": 0.01748790452423618, + "loss": 0.7874, + "num_input_tokens_seen": 39155440, + "step": 67445 + }, + { + "epoch": 10.04617217753947, + "grad_norm": 0.041748046875, + "learning_rate": 0.017485981854490844, + "loss": 0.8106, + "num_input_tokens_seen": 39158480, + "step": 67450 + }, + { + "epoch": 10.046916890080428, + "grad_norm": 0.031982421875, + "learning_rate": 0.017484059142747373, + "loss": 0.8143, + "num_input_tokens_seen": 39161712, + "step": 67455 + }, + { + "epoch": 10.047661602621389, + "grad_norm": 0.056396484375, + "learning_rate": 0.01748213638903824, + "loss": 0.8106, + "num_input_tokens_seen": 39164624, + "step": 67460 + }, + { + "epoch": 10.048406315162348, + "grad_norm": 0.038330078125, + "learning_rate": 0.01748021359339595, + "loss": 0.8057, + "num_input_tokens_seen": 39167696, + "step": 67465 + }, + { + "epoch": 10.049151027703306, + "grad_norm": 0.0223388671875, + "learning_rate": 0.017478290755852964, + "loss": 0.8043, + "num_input_tokens_seen": 39170512, + "step": 67470 + }, + { + "epoch": 10.049895740244265, + "grad_norm": 0.026123046875, + "learning_rate": 0.017476367876441773, + "loss": 0.8185, + "num_input_tokens_seen": 39173648, + "step": 67475 + }, + { + "epoch": 10.050640452785226, + "grad_norm": 0.04345703125, + "learning_rate": 0.01747444495519487, + "loss": 0.7919, + "num_input_tokens_seen": 39176496, + "step": 67480 + }, + { + "epoch": 10.051385165326185, + "grad_norm": 0.041259765625, + "learning_rate": 0.01747252199214473, + "loss": 0.8102, + "num_input_tokens_seen": 39179408, + "step": 67485 + }, + { + "epoch": 10.052129877867143, + "grad_norm": 0.05078125, + "learning_rate": 0.01747059898732384, + "loss": 0.7994, + "num_input_tokens_seen": 39182480, + "step": 67490 + }, + { + "epoch": 10.052874590408102, + "grad_norm": 0.035888671875, + "learning_rate": 0.017468675940764695, + "loss": 0.8034, + "num_input_tokens_seen": 39185296, + "step": 67495 + }, + { + "epoch": 10.05361930294906, + "grad_norm": 0.0260009765625, + "learning_rate": 0.017466752852499778, + "loss": 0.8041, + "num_input_tokens_seen": 39188080, + "step": 67500 + }, + { + "epoch": 10.054364015490021, + "grad_norm": 0.0269775390625, + "learning_rate": 0.01746482972256158, + "loss": 0.8064, + "num_input_tokens_seen": 39191024, + "step": 67505 + }, + { + "epoch": 10.05510872803098, + "grad_norm": 0.0208740234375, + "learning_rate": 0.01746290655098259, + "loss": 0.8109, + "num_input_tokens_seen": 39194000, + "step": 67510 + }, + { + "epoch": 10.055853440571939, + "grad_norm": 0.0205078125, + "learning_rate": 0.017460983337795298, + "loss": 0.7948, + "num_input_tokens_seen": 39196784, + "step": 67515 + }, + { + "epoch": 10.056598153112898, + "grad_norm": 0.037109375, + "learning_rate": 0.01745906008303219, + "loss": 0.8027, + "num_input_tokens_seen": 39199632, + "step": 67520 + }, + { + "epoch": 10.057342865653858, + "grad_norm": 0.0264892578125, + "learning_rate": 0.017457136786725762, + "loss": 0.7965, + "num_input_tokens_seen": 39202288, + "step": 67525 + }, + { + "epoch": 10.058087578194817, + "grad_norm": 0.031005859375, + "learning_rate": 0.017455213448908508, + "loss": 0.8124, + "num_input_tokens_seen": 39205200, + "step": 67530 + }, + { + "epoch": 10.058832290735776, + "grad_norm": 0.0341796875, + "learning_rate": 0.017453290069612912, + "loss": 0.7908, + "num_input_tokens_seen": 39208624, + "step": 67535 + }, + { + "epoch": 10.059577003276734, + "grad_norm": 0.0262451171875, + "learning_rate": 0.01745136664887148, + "loss": 0.787, + "num_input_tokens_seen": 39211312, + "step": 67540 + }, + { + "epoch": 10.060321715817695, + "grad_norm": 0.0250244140625, + "learning_rate": 0.0174494431867167, + "loss": 0.8061, + "num_input_tokens_seen": 39214480, + "step": 67545 + }, + { + "epoch": 10.061066428358654, + "grad_norm": 0.0306396484375, + "learning_rate": 0.017447519683181065, + "loss": 0.788, + "num_input_tokens_seen": 39217424, + "step": 67550 + }, + { + "epoch": 10.061811140899612, + "grad_norm": 0.0244140625, + "learning_rate": 0.01744559613829707, + "loss": 0.8017, + "num_input_tokens_seen": 39220016, + "step": 67555 + }, + { + "epoch": 10.062555853440571, + "grad_norm": 0.045166015625, + "learning_rate": 0.01744367255209722, + "loss": 0.7965, + "num_input_tokens_seen": 39222832, + "step": 67560 + }, + { + "epoch": 10.063300565981532, + "grad_norm": 0.028076171875, + "learning_rate": 0.017441748924614, + "loss": 0.7918, + "num_input_tokens_seen": 39225808, + "step": 67565 + }, + { + "epoch": 10.06404527852249, + "grad_norm": 0.04248046875, + "learning_rate": 0.01743982525587991, + "loss": 0.8039, + "num_input_tokens_seen": 39228944, + "step": 67570 + }, + { + "epoch": 10.06478999106345, + "grad_norm": 0.041748046875, + "learning_rate": 0.01743790154592746, + "loss": 0.8089, + "num_input_tokens_seen": 39231920, + "step": 67575 + }, + { + "epoch": 10.065534703604408, + "grad_norm": 0.0419921875, + "learning_rate": 0.01743597779478914, + "loss": 0.8126, + "num_input_tokens_seen": 39235088, + "step": 67580 + }, + { + "epoch": 10.066279416145369, + "grad_norm": 0.03759765625, + "learning_rate": 0.01743405400249745, + "loss": 0.8378, + "num_input_tokens_seen": 39237840, + "step": 67585 + }, + { + "epoch": 10.067024128686327, + "grad_norm": 0.035400390625, + "learning_rate": 0.017432130169084892, + "loss": 0.8201, + "num_input_tokens_seen": 39240400, + "step": 67590 + }, + { + "epoch": 10.067768841227286, + "grad_norm": 0.0274658203125, + "learning_rate": 0.017430206294583965, + "loss": 0.8, + "num_input_tokens_seen": 39243184, + "step": 67595 + }, + { + "epoch": 10.068513553768245, + "grad_norm": 0.035888671875, + "learning_rate": 0.017428282379027174, + "loss": 0.7877, + "num_input_tokens_seen": 39246160, + "step": 67600 + }, + { + "epoch": 10.069258266309205, + "grad_norm": 0.0247802734375, + "learning_rate": 0.017426358422447018, + "loss": 0.7819, + "num_input_tokens_seen": 39249008, + "step": 67605 + }, + { + "epoch": 10.070002978850164, + "grad_norm": 0.026611328125, + "learning_rate": 0.017424434424876005, + "loss": 0.7858, + "num_input_tokens_seen": 39251888, + "step": 67610 + }, + { + "epoch": 10.070747691391123, + "grad_norm": 0.043701171875, + "learning_rate": 0.017422510386346634, + "loss": 0.7716, + "num_input_tokens_seen": 39254896, + "step": 67615 + }, + { + "epoch": 10.071492403932082, + "grad_norm": 0.0274658203125, + "learning_rate": 0.017420586306891413, + "loss": 0.7867, + "num_input_tokens_seen": 39257520, + "step": 67620 + }, + { + "epoch": 10.072237116473042, + "grad_norm": 0.033935546875, + "learning_rate": 0.017418662186542852, + "loss": 0.8233, + "num_input_tokens_seen": 39260432, + "step": 67625 + }, + { + "epoch": 10.072981829014001, + "grad_norm": 0.038818359375, + "learning_rate": 0.01741673802533344, + "loss": 0.7868, + "num_input_tokens_seen": 39263312, + "step": 67630 + }, + { + "epoch": 10.07372654155496, + "grad_norm": 0.032470703125, + "learning_rate": 0.017414813823295707, + "loss": 0.7765, + "num_input_tokens_seen": 39266000, + "step": 67635 + }, + { + "epoch": 10.074471254095918, + "grad_norm": 0.032470703125, + "learning_rate": 0.01741288958046214, + "loss": 0.7908, + "num_input_tokens_seen": 39268976, + "step": 67640 + }, + { + "epoch": 10.075215966636879, + "grad_norm": 0.044921875, + "learning_rate": 0.017410965296865265, + "loss": 0.8055, + "num_input_tokens_seen": 39271792, + "step": 67645 + }, + { + "epoch": 10.075960679177838, + "grad_norm": 0.0703125, + "learning_rate": 0.017409040972537572, + "loss": 0.7997, + "num_input_tokens_seen": 39274704, + "step": 67650 + }, + { + "epoch": 10.076705391718797, + "grad_norm": 0.034423828125, + "learning_rate": 0.017407116607511586, + "loss": 0.8012, + "num_input_tokens_seen": 39277680, + "step": 67655 + }, + { + "epoch": 10.077450104259755, + "grad_norm": 0.031494140625, + "learning_rate": 0.017405192201819807, + "loss": 0.7711, + "num_input_tokens_seen": 39280624, + "step": 67660 + }, + { + "epoch": 10.078194816800714, + "grad_norm": 0.0322265625, + "learning_rate": 0.017403267755494757, + "loss": 0.7882, + "num_input_tokens_seen": 39283600, + "step": 67665 + }, + { + "epoch": 10.078939529341675, + "grad_norm": 0.03271484375, + "learning_rate": 0.017401343268568938, + "loss": 0.8428, + "num_input_tokens_seen": 39286800, + "step": 67670 + }, + { + "epoch": 10.079684241882633, + "grad_norm": 0.02880859375, + "learning_rate": 0.017399418741074864, + "loss": 0.8042, + "num_input_tokens_seen": 39289488, + "step": 67675 + }, + { + "epoch": 10.080428954423592, + "grad_norm": 0.03271484375, + "learning_rate": 0.01739749417304505, + "loss": 0.7808, + "num_input_tokens_seen": 39292720, + "step": 67680 + }, + { + "epoch": 10.08117366696455, + "grad_norm": 0.052490234375, + "learning_rate": 0.017395569564512006, + "loss": 0.784, + "num_input_tokens_seen": 39295472, + "step": 67685 + }, + { + "epoch": 10.081918379505511, + "grad_norm": 0.0208740234375, + "learning_rate": 0.017393644915508252, + "loss": 0.7767, + "num_input_tokens_seen": 39298288, + "step": 67690 + }, + { + "epoch": 10.08266309204647, + "grad_norm": 0.033935546875, + "learning_rate": 0.0173917202260663, + "loss": 0.7957, + "num_input_tokens_seen": 39300944, + "step": 67695 + }, + { + "epoch": 10.083407804587429, + "grad_norm": 0.05078125, + "learning_rate": 0.01738979549621866, + "loss": 0.782, + "num_input_tokens_seen": 39303952, + "step": 67700 + }, + { + "epoch": 10.084152517128388, + "grad_norm": 0.0242919921875, + "learning_rate": 0.017387870725997862, + "loss": 0.8403, + "num_input_tokens_seen": 39306832, + "step": 67705 + }, + { + "epoch": 10.084897229669348, + "grad_norm": 0.0634765625, + "learning_rate": 0.017385945915436407, + "loss": 0.789, + "num_input_tokens_seen": 39310224, + "step": 67710 + }, + { + "epoch": 10.085641942210307, + "grad_norm": 0.06884765625, + "learning_rate": 0.01738402106456683, + "loss": 0.8228, + "num_input_tokens_seen": 39313232, + "step": 67715 + }, + { + "epoch": 10.086386654751266, + "grad_norm": 0.032470703125, + "learning_rate": 0.017382096173421638, + "loss": 0.8427, + "num_input_tokens_seen": 39315760, + "step": 67720 + }, + { + "epoch": 10.087131367292224, + "grad_norm": 0.03857421875, + "learning_rate": 0.017380171242033348, + "loss": 0.7956, + "num_input_tokens_seen": 39318544, + "step": 67725 + }, + { + "epoch": 10.087876079833185, + "grad_norm": 0.033447265625, + "learning_rate": 0.017378246270434484, + "loss": 0.8193, + "num_input_tokens_seen": 39321328, + "step": 67730 + }, + { + "epoch": 10.088620792374144, + "grad_norm": 0.037353515625, + "learning_rate": 0.017376321258657568, + "loss": 0.817, + "num_input_tokens_seen": 39324240, + "step": 67735 + }, + { + "epoch": 10.089365504915103, + "grad_norm": 0.036865234375, + "learning_rate": 0.01737439620673512, + "loss": 0.7781, + "num_input_tokens_seen": 39327184, + "step": 67740 + }, + { + "epoch": 10.090110217456061, + "grad_norm": 0.0238037109375, + "learning_rate": 0.01737247111469966, + "loss": 0.7765, + "num_input_tokens_seen": 39329968, + "step": 67745 + }, + { + "epoch": 10.090854929997022, + "grad_norm": 0.0242919921875, + "learning_rate": 0.017370545982583717, + "loss": 0.8033, + "num_input_tokens_seen": 39332944, + "step": 67750 + }, + { + "epoch": 10.09159964253798, + "grad_norm": 0.0224609375, + "learning_rate": 0.0173686208104198, + "loss": 0.7845, + "num_input_tokens_seen": 39335696, + "step": 67755 + }, + { + "epoch": 10.09234435507894, + "grad_norm": 0.032470703125, + "learning_rate": 0.017366695598240445, + "loss": 0.7765, + "num_input_tokens_seen": 39338736, + "step": 67760 + }, + { + "epoch": 10.093089067619898, + "grad_norm": 0.0302734375, + "learning_rate": 0.017364770346078175, + "loss": 0.816, + "num_input_tokens_seen": 39341744, + "step": 67765 + }, + { + "epoch": 10.093833780160859, + "grad_norm": 0.032470703125, + "learning_rate": 0.01736284505396551, + "loss": 0.7681, + "num_input_tokens_seen": 39344720, + "step": 67770 + }, + { + "epoch": 10.094578492701817, + "grad_norm": 0.0208740234375, + "learning_rate": 0.017360919721934986, + "loss": 0.7934, + "num_input_tokens_seen": 39347792, + "step": 67775 + }, + { + "epoch": 10.095323205242776, + "grad_norm": 0.0245361328125, + "learning_rate": 0.01735899435001912, + "loss": 0.7965, + "num_input_tokens_seen": 39351120, + "step": 67780 + }, + { + "epoch": 10.096067917783735, + "grad_norm": 0.0191650390625, + "learning_rate": 0.017357068938250442, + "loss": 0.7851, + "num_input_tokens_seen": 39354128, + "step": 67785 + }, + { + "epoch": 10.096812630324695, + "grad_norm": 0.01904296875, + "learning_rate": 0.01735514348666148, + "loss": 0.7766, + "num_input_tokens_seen": 39358192, + "step": 67790 + }, + { + "epoch": 10.097557342865654, + "grad_norm": 0.0250244140625, + "learning_rate": 0.01735321799528476, + "loss": 0.8076, + "num_input_tokens_seen": 39361008, + "step": 67795 + }, + { + "epoch": 10.098302055406613, + "grad_norm": 0.03369140625, + "learning_rate": 0.017351292464152814, + "loss": 0.784, + "num_input_tokens_seen": 39363952, + "step": 67800 + }, + { + "epoch": 10.099046767947572, + "grad_norm": 0.033203125, + "learning_rate": 0.01734936689329817, + "loss": 0.8319, + "num_input_tokens_seen": 39366896, + "step": 67805 + }, + { + "epoch": 10.099791480488532, + "grad_norm": 0.032958984375, + "learning_rate": 0.017347441282753364, + "loss": 0.7895, + "num_input_tokens_seen": 39369808, + "step": 67810 + }, + { + "epoch": 10.100536193029491, + "grad_norm": 0.035888671875, + "learning_rate": 0.017345515632550924, + "loss": 0.7758, + "num_input_tokens_seen": 39372688, + "step": 67815 + }, + { + "epoch": 10.10128090557045, + "grad_norm": 0.01708984375, + "learning_rate": 0.01734358994272338, + "loss": 0.7894, + "num_input_tokens_seen": 39375728, + "step": 67820 + }, + { + "epoch": 10.102025618111409, + "grad_norm": 0.029296875, + "learning_rate": 0.017341664213303267, + "loss": 0.7978, + "num_input_tokens_seen": 39378768, + "step": 67825 + }, + { + "epoch": 10.102770330652369, + "grad_norm": 0.0185546875, + "learning_rate": 0.017339738444323113, + "loss": 0.8304, + "num_input_tokens_seen": 39381808, + "step": 67830 + }, + { + "epoch": 10.103515043193328, + "grad_norm": 0.025390625, + "learning_rate": 0.017337812635815457, + "loss": 0.8092, + "num_input_tokens_seen": 39384720, + "step": 67835 + }, + { + "epoch": 10.104259755734287, + "grad_norm": 0.028564453125, + "learning_rate": 0.017335886787812834, + "loss": 0.8006, + "num_input_tokens_seen": 39387824, + "step": 67840 + }, + { + "epoch": 10.105004468275245, + "grad_norm": 0.044677734375, + "learning_rate": 0.017333960900347778, + "loss": 0.7943, + "num_input_tokens_seen": 39390704, + "step": 67845 + }, + { + "epoch": 10.105749180816204, + "grad_norm": 0.0216064453125, + "learning_rate": 0.017332034973452825, + "loss": 0.8058, + "num_input_tokens_seen": 39393648, + "step": 67850 + }, + { + "epoch": 10.106493893357165, + "grad_norm": 0.039306640625, + "learning_rate": 0.01733010900716051, + "loss": 0.8699, + "num_input_tokens_seen": 39396432, + "step": 67855 + }, + { + "epoch": 10.107238605898123, + "grad_norm": 0.02099609375, + "learning_rate": 0.017328183001503374, + "loss": 0.8049, + "num_input_tokens_seen": 39399184, + "step": 67860 + }, + { + "epoch": 10.107983318439082, + "grad_norm": 0.019775390625, + "learning_rate": 0.01732625695651395, + "loss": 0.7977, + "num_input_tokens_seen": 39402160, + "step": 67865 + }, + { + "epoch": 10.108728030980041, + "grad_norm": 0.033203125, + "learning_rate": 0.01732433087222478, + "loss": 0.7693, + "num_input_tokens_seen": 39405104, + "step": 67870 + }, + { + "epoch": 10.109472743521001, + "grad_norm": 0.0439453125, + "learning_rate": 0.0173224047486684, + "loss": 0.8397, + "num_input_tokens_seen": 39407984, + "step": 67875 + }, + { + "epoch": 10.11021745606196, + "grad_norm": 0.0224609375, + "learning_rate": 0.017320478585877357, + "loss": 0.8008, + "num_input_tokens_seen": 39410960, + "step": 67880 + }, + { + "epoch": 10.110962168602919, + "grad_norm": 0.031494140625, + "learning_rate": 0.017318552383884186, + "loss": 0.8033, + "num_input_tokens_seen": 39413776, + "step": 67885 + }, + { + "epoch": 10.111706881143878, + "grad_norm": 0.015869140625, + "learning_rate": 0.017316626142721423, + "loss": 0.7951, + "num_input_tokens_seen": 39416912, + "step": 67890 + }, + { + "epoch": 10.112451593684838, + "grad_norm": 0.04052734375, + "learning_rate": 0.01731469986242162, + "loss": 0.8133, + "num_input_tokens_seen": 39419888, + "step": 67895 + }, + { + "epoch": 10.113196306225797, + "grad_norm": 0.0252685546875, + "learning_rate": 0.017312773543017315, + "loss": 0.8204, + "num_input_tokens_seen": 39422576, + "step": 67900 + }, + { + "epoch": 10.113941018766756, + "grad_norm": 0.0189208984375, + "learning_rate": 0.01731084718454105, + "loss": 0.7963, + "num_input_tokens_seen": 39426000, + "step": 67905 + }, + { + "epoch": 10.114685731307715, + "grad_norm": 0.033203125, + "learning_rate": 0.017308920787025374, + "loss": 0.7777, + "num_input_tokens_seen": 39428784, + "step": 67910 + }, + { + "epoch": 10.115430443848675, + "grad_norm": 0.0196533203125, + "learning_rate": 0.017306994350502827, + "loss": 0.7988, + "num_input_tokens_seen": 39431920, + "step": 67915 + }, + { + "epoch": 10.116175156389634, + "grad_norm": 0.02880859375, + "learning_rate": 0.017305067875005953, + "loss": 0.8097, + "num_input_tokens_seen": 39434608, + "step": 67920 + }, + { + "epoch": 10.116919868930593, + "grad_norm": 0.025634765625, + "learning_rate": 0.017303141360567303, + "loss": 0.8181, + "num_input_tokens_seen": 39437616, + "step": 67925 + }, + { + "epoch": 10.117664581471551, + "grad_norm": 0.035888671875, + "learning_rate": 0.01730121480721942, + "loss": 0.8051, + "num_input_tokens_seen": 39440528, + "step": 67930 + }, + { + "epoch": 10.118409294012512, + "grad_norm": 0.0400390625, + "learning_rate": 0.017299288214994848, + "loss": 0.8283, + "num_input_tokens_seen": 39443376, + "step": 67935 + }, + { + "epoch": 10.11915400655347, + "grad_norm": 0.030517578125, + "learning_rate": 0.017297361583926144, + "loss": 0.7936, + "num_input_tokens_seen": 39446032, + "step": 67940 + }, + { + "epoch": 10.11989871909443, + "grad_norm": 0.042724609375, + "learning_rate": 0.017295434914045844, + "loss": 0.8056, + "num_input_tokens_seen": 39448848, + "step": 67945 + }, + { + "epoch": 10.120643431635388, + "grad_norm": 0.03955078125, + "learning_rate": 0.017293508205386512, + "loss": 0.8167, + "num_input_tokens_seen": 39451792, + "step": 67950 + }, + { + "epoch": 10.121388144176349, + "grad_norm": 0.026611328125, + "learning_rate": 0.017291581457980685, + "loss": 0.8078, + "num_input_tokens_seen": 39454704, + "step": 67955 + }, + { + "epoch": 10.122132856717307, + "grad_norm": 0.03759765625, + "learning_rate": 0.01728965467186092, + "loss": 0.7899, + "num_input_tokens_seen": 39457872, + "step": 67960 + }, + { + "epoch": 10.122877569258266, + "grad_norm": 0.02734375, + "learning_rate": 0.017287727847059762, + "loss": 0.805, + "num_input_tokens_seen": 39461968, + "step": 67965 + }, + { + "epoch": 10.123622281799225, + "grad_norm": 0.026611328125, + "learning_rate": 0.01728580098360977, + "loss": 0.8011, + "num_input_tokens_seen": 39464784, + "step": 67970 + }, + { + "epoch": 10.124366994340185, + "grad_norm": 0.0186767578125, + "learning_rate": 0.0172838740815435, + "loss": 0.8223, + "num_input_tokens_seen": 39467504, + "step": 67975 + }, + { + "epoch": 10.125111706881144, + "grad_norm": 0.022705078125, + "learning_rate": 0.01728194714089349, + "loss": 0.8199, + "num_input_tokens_seen": 39470608, + "step": 67980 + }, + { + "epoch": 10.125856419422103, + "grad_norm": 0.031005859375, + "learning_rate": 0.017280020161692302, + "loss": 0.7904, + "num_input_tokens_seen": 39473520, + "step": 67985 + }, + { + "epoch": 10.126601131963062, + "grad_norm": 0.037109375, + "learning_rate": 0.0172780931439725, + "loss": 0.7872, + "num_input_tokens_seen": 39476304, + "step": 67990 + }, + { + "epoch": 10.127345844504022, + "grad_norm": 0.0234375, + "learning_rate": 0.017276166087766623, + "loss": 0.8004, + "num_input_tokens_seen": 39479344, + "step": 67995 + }, + { + "epoch": 10.128090557044981, + "grad_norm": 0.0291748046875, + "learning_rate": 0.017274238993107233, + "loss": 0.797, + "num_input_tokens_seen": 39481968, + "step": 68000 + }, + { + "epoch": 10.12883526958594, + "grad_norm": 0.036865234375, + "learning_rate": 0.017272311860026885, + "loss": 0.8018, + "num_input_tokens_seen": 39484688, + "step": 68005 + }, + { + "epoch": 10.129579982126899, + "grad_norm": 0.0284423828125, + "learning_rate": 0.017270384688558144, + "loss": 0.7913, + "num_input_tokens_seen": 39488208, + "step": 68010 + }, + { + "epoch": 10.130324694667857, + "grad_norm": 0.04150390625, + "learning_rate": 0.017268457478733554, + "loss": 0.8132, + "num_input_tokens_seen": 39491024, + "step": 68015 + }, + { + "epoch": 10.131069407208818, + "grad_norm": 0.0279541015625, + "learning_rate": 0.017266530230585685, + "loss": 0.8033, + "num_input_tokens_seen": 39493968, + "step": 68020 + }, + { + "epoch": 10.131814119749777, + "grad_norm": 0.03076171875, + "learning_rate": 0.01726460294414709, + "loss": 0.8065, + "num_input_tokens_seen": 39496720, + "step": 68025 + }, + { + "epoch": 10.132558832290735, + "grad_norm": 0.02880859375, + "learning_rate": 0.017262675619450333, + "loss": 0.8112, + "num_input_tokens_seen": 39499728, + "step": 68030 + }, + { + "epoch": 10.133303544831694, + "grad_norm": 0.021240234375, + "learning_rate": 0.01726074825652797, + "loss": 0.7976, + "num_input_tokens_seen": 39502672, + "step": 68035 + }, + { + "epoch": 10.134048257372655, + "grad_norm": 0.033935546875, + "learning_rate": 0.017258820855412554, + "loss": 0.7963, + "num_input_tokens_seen": 39505776, + "step": 68040 + }, + { + "epoch": 10.134792969913613, + "grad_norm": 0.0235595703125, + "learning_rate": 0.01725689341613667, + "loss": 0.7734, + "num_input_tokens_seen": 39508560, + "step": 68045 + }, + { + "epoch": 10.135537682454572, + "grad_norm": 0.039306640625, + "learning_rate": 0.017254965938732852, + "loss": 0.8155, + "num_input_tokens_seen": 39511600, + "step": 68050 + }, + { + "epoch": 10.136282394995531, + "grad_norm": 0.031494140625, + "learning_rate": 0.017253038423233686, + "loss": 0.8018, + "num_input_tokens_seen": 39514480, + "step": 68055 + }, + { + "epoch": 10.137027107536491, + "grad_norm": 0.02734375, + "learning_rate": 0.01725111086967172, + "loss": 0.8039, + "num_input_tokens_seen": 39517168, + "step": 68060 + }, + { + "epoch": 10.13777182007745, + "grad_norm": 0.0198974609375, + "learning_rate": 0.017249183278079526, + "loss": 0.8163, + "num_input_tokens_seen": 39519984, + "step": 68065 + }, + { + "epoch": 10.138516532618409, + "grad_norm": 0.0400390625, + "learning_rate": 0.017247255648489668, + "loss": 0.7925, + "num_input_tokens_seen": 39522768, + "step": 68070 + }, + { + "epoch": 10.139261245159368, + "grad_norm": 0.0291748046875, + "learning_rate": 0.017245327980934706, + "loss": 0.7968, + "num_input_tokens_seen": 39526000, + "step": 68075 + }, + { + "epoch": 10.140005957700328, + "grad_norm": 0.0230712890625, + "learning_rate": 0.017243400275447216, + "loss": 0.8025, + "num_input_tokens_seen": 39529040, + "step": 68080 + }, + { + "epoch": 10.140750670241287, + "grad_norm": 0.03564453125, + "learning_rate": 0.017241472532059755, + "loss": 0.8066, + "num_input_tokens_seen": 39532016, + "step": 68085 + }, + { + "epoch": 10.141495382782246, + "grad_norm": 0.0189208984375, + "learning_rate": 0.01723954475080489, + "loss": 0.7875, + "num_input_tokens_seen": 39534736, + "step": 68090 + }, + { + "epoch": 10.142240095323205, + "grad_norm": 0.0302734375, + "learning_rate": 0.017237616931715195, + "loss": 0.7879, + "num_input_tokens_seen": 39537840, + "step": 68095 + }, + { + "epoch": 10.142984807864165, + "grad_norm": 0.0390625, + "learning_rate": 0.017235689074823236, + "loss": 0.8068, + "num_input_tokens_seen": 39540528, + "step": 68100 + }, + { + "epoch": 10.143729520405124, + "grad_norm": 0.034912109375, + "learning_rate": 0.017233761180161587, + "loss": 0.797, + "num_input_tokens_seen": 39543568, + "step": 68105 + }, + { + "epoch": 10.144474232946083, + "grad_norm": 0.0274658203125, + "learning_rate": 0.01723183324776281, + "loss": 0.8188, + "num_input_tokens_seen": 39546384, + "step": 68110 + }, + { + "epoch": 10.145218945487041, + "grad_norm": 0.0301513671875, + "learning_rate": 0.01722990527765948, + "loss": 0.7834, + "num_input_tokens_seen": 39549296, + "step": 68115 + }, + { + "epoch": 10.145963658028002, + "grad_norm": 0.035400390625, + "learning_rate": 0.017227977269884166, + "loss": 0.7985, + "num_input_tokens_seen": 39552048, + "step": 68120 + }, + { + "epoch": 10.14670837056896, + "grad_norm": 0.032470703125, + "learning_rate": 0.017226049224469433, + "loss": 0.8184, + "num_input_tokens_seen": 39555088, + "step": 68125 + }, + { + "epoch": 10.14745308310992, + "grad_norm": 0.03271484375, + "learning_rate": 0.017224121141447866, + "loss": 0.7893, + "num_input_tokens_seen": 39558064, + "step": 68130 + }, + { + "epoch": 10.148197795650878, + "grad_norm": 0.03955078125, + "learning_rate": 0.017222193020852036, + "loss": 0.7908, + "num_input_tokens_seen": 39560752, + "step": 68135 + }, + { + "epoch": 10.148942508191839, + "grad_norm": 0.032958984375, + "learning_rate": 0.017220264862714517, + "loss": 0.8037, + "num_input_tokens_seen": 39564016, + "step": 68140 + }, + { + "epoch": 10.149687220732797, + "grad_norm": 0.05224609375, + "learning_rate": 0.01721833666706787, + "loss": 0.8136, + "num_input_tokens_seen": 39567184, + "step": 68145 + }, + { + "epoch": 10.150431933273756, + "grad_norm": 0.0301513671875, + "learning_rate": 0.017216408433944688, + "loss": 0.7941, + "num_input_tokens_seen": 39570448, + "step": 68150 + }, + { + "epoch": 10.151176645814715, + "grad_norm": 0.03271484375, + "learning_rate": 0.01721448016337753, + "loss": 0.792, + "num_input_tokens_seen": 39573168, + "step": 68155 + }, + { + "epoch": 10.151921358355676, + "grad_norm": 0.037353515625, + "learning_rate": 0.017212551855398984, + "loss": 0.8131, + "num_input_tokens_seen": 39576080, + "step": 68160 + }, + { + "epoch": 10.152666070896634, + "grad_norm": 0.0281982421875, + "learning_rate": 0.017210623510041625, + "loss": 0.8067, + "num_input_tokens_seen": 39578928, + "step": 68165 + }, + { + "epoch": 10.153410783437593, + "grad_norm": 0.02734375, + "learning_rate": 0.01720869512733802, + "loss": 0.7903, + "num_input_tokens_seen": 39581840, + "step": 68170 + }, + { + "epoch": 10.154155495978552, + "grad_norm": 0.0196533203125, + "learning_rate": 0.017206766707320766, + "loss": 0.807, + "num_input_tokens_seen": 39584880, + "step": 68175 + }, + { + "epoch": 10.15490020851951, + "grad_norm": 0.041015625, + "learning_rate": 0.01720483825002243, + "loss": 0.7948, + "num_input_tokens_seen": 39588144, + "step": 68180 + }, + { + "epoch": 10.155644921060471, + "grad_norm": 0.0302734375, + "learning_rate": 0.01720290975547559, + "loss": 0.8249, + "num_input_tokens_seen": 39591056, + "step": 68185 + }, + { + "epoch": 10.15638963360143, + "grad_norm": 0.037841796875, + "learning_rate": 0.01720098122371283, + "loss": 0.8091, + "num_input_tokens_seen": 39594160, + "step": 68190 + }, + { + "epoch": 10.157134346142389, + "grad_norm": 0.02490234375, + "learning_rate": 0.01719905265476673, + "loss": 0.8062, + "num_input_tokens_seen": 39597136, + "step": 68195 + }, + { + "epoch": 10.157879058683347, + "grad_norm": 0.0223388671875, + "learning_rate": 0.017197124048669868, + "loss": 0.7877, + "num_input_tokens_seen": 39600144, + "step": 68200 + }, + { + "epoch": 10.158623771224308, + "grad_norm": 0.038330078125, + "learning_rate": 0.01719519540545483, + "loss": 0.81, + "num_input_tokens_seen": 39603248, + "step": 68205 + }, + { + "epoch": 10.159368483765267, + "grad_norm": 0.04541015625, + "learning_rate": 0.017193266725154196, + "loss": 0.792, + "num_input_tokens_seen": 39606096, + "step": 68210 + }, + { + "epoch": 10.160113196306225, + "grad_norm": 0.05712890625, + "learning_rate": 0.017191338007800545, + "loss": 0.8131, + "num_input_tokens_seen": 39609072, + "step": 68215 + }, + { + "epoch": 10.160857908847184, + "grad_norm": 0.0322265625, + "learning_rate": 0.017189409253426473, + "loss": 0.835, + "num_input_tokens_seen": 39611952, + "step": 68220 + }, + { + "epoch": 10.161602621388145, + "grad_norm": 0.030517578125, + "learning_rate": 0.01718748046206455, + "loss": 0.8037, + "num_input_tokens_seen": 39614736, + "step": 68225 + }, + { + "epoch": 10.162347333929103, + "grad_norm": 0.03759765625, + "learning_rate": 0.017185551633747378, + "loss": 0.7934, + "num_input_tokens_seen": 39617424, + "step": 68230 + }, + { + "epoch": 10.163092046470062, + "grad_norm": 0.0189208984375, + "learning_rate": 0.01718362276850752, + "loss": 0.8073, + "num_input_tokens_seen": 39620592, + "step": 68235 + }, + { + "epoch": 10.163836759011021, + "grad_norm": 0.0223388671875, + "learning_rate": 0.017181693866377582, + "loss": 0.802, + "num_input_tokens_seen": 39623632, + "step": 68240 + }, + { + "epoch": 10.164581471551982, + "grad_norm": 0.033447265625, + "learning_rate": 0.017179764927390147, + "loss": 0.8133, + "num_input_tokens_seen": 39626704, + "step": 68245 + }, + { + "epoch": 10.16532618409294, + "grad_norm": 0.038330078125, + "learning_rate": 0.01717783595157779, + "loss": 0.8114, + "num_input_tokens_seen": 39629488, + "step": 68250 + }, + { + "epoch": 10.166070896633899, + "grad_norm": 0.0291748046875, + "learning_rate": 0.017175906938973116, + "loss": 0.7951, + "num_input_tokens_seen": 39632400, + "step": 68255 + }, + { + "epoch": 10.166815609174858, + "grad_norm": 0.02197265625, + "learning_rate": 0.0171739778896087, + "loss": 0.7904, + "num_input_tokens_seen": 39635536, + "step": 68260 + }, + { + "epoch": 10.167560321715818, + "grad_norm": 0.04296875, + "learning_rate": 0.017172048803517137, + "loss": 0.8139, + "num_input_tokens_seen": 39638480, + "step": 68265 + }, + { + "epoch": 10.168305034256777, + "grad_norm": 0.033447265625, + "learning_rate": 0.01717011968073102, + "loss": 0.8093, + "num_input_tokens_seen": 39641616, + "step": 68270 + }, + { + "epoch": 10.169049746797736, + "grad_norm": 0.0234375, + "learning_rate": 0.017168190521282937, + "loss": 0.7986, + "num_input_tokens_seen": 39644528, + "step": 68275 + }, + { + "epoch": 10.169794459338695, + "grad_norm": 0.02783203125, + "learning_rate": 0.017166261325205483, + "loss": 0.8125, + "num_input_tokens_seen": 39647504, + "step": 68280 + }, + { + "epoch": 10.170539171879655, + "grad_norm": 0.06103515625, + "learning_rate": 0.017164332092531237, + "loss": 0.7981, + "num_input_tokens_seen": 39650544, + "step": 68285 + }, + { + "epoch": 10.171283884420614, + "grad_norm": 0.0250244140625, + "learning_rate": 0.017162402823292802, + "loss": 0.7956, + "num_input_tokens_seen": 39653296, + "step": 68290 + }, + { + "epoch": 10.172028596961573, + "grad_norm": 0.0284423828125, + "learning_rate": 0.01716047351752277, + "loss": 0.7924, + "num_input_tokens_seen": 39656176, + "step": 68295 + }, + { + "epoch": 10.172773309502531, + "grad_norm": 0.0218505859375, + "learning_rate": 0.017158544175253733, + "loss": 0.8152, + "num_input_tokens_seen": 39659568, + "step": 68300 + }, + { + "epoch": 10.173518022043492, + "grad_norm": 0.0458984375, + "learning_rate": 0.01715661479651829, + "loss": 0.8082, + "num_input_tokens_seen": 39662352, + "step": 68305 + }, + { + "epoch": 10.17426273458445, + "grad_norm": 0.033447265625, + "learning_rate": 0.017154685381349035, + "loss": 0.8126, + "num_input_tokens_seen": 39665360, + "step": 68310 + }, + { + "epoch": 10.17500744712541, + "grad_norm": 0.0306396484375, + "learning_rate": 0.01715275592977855, + "loss": 0.8051, + "num_input_tokens_seen": 39668144, + "step": 68315 + }, + { + "epoch": 10.175752159666368, + "grad_norm": 0.02880859375, + "learning_rate": 0.017150826441839444, + "loss": 0.8076, + "num_input_tokens_seen": 39670960, + "step": 68320 + }, + { + "epoch": 10.176496872207329, + "grad_norm": 0.038330078125, + "learning_rate": 0.017148896917564316, + "loss": 0.8054, + "num_input_tokens_seen": 39674096, + "step": 68325 + }, + { + "epoch": 10.177241584748288, + "grad_norm": 0.03515625, + "learning_rate": 0.017146967356985756, + "loss": 0.8134, + "num_input_tokens_seen": 39676624, + "step": 68330 + }, + { + "epoch": 10.177986297289246, + "grad_norm": 0.0218505859375, + "learning_rate": 0.017145037760136362, + "loss": 0.8032, + "num_input_tokens_seen": 39679376, + "step": 68335 + }, + { + "epoch": 10.178731009830205, + "grad_norm": 0.045166015625, + "learning_rate": 0.017143108127048737, + "loss": 0.7912, + "num_input_tokens_seen": 39682256, + "step": 68340 + }, + { + "epoch": 10.179475722371166, + "grad_norm": 0.031982421875, + "learning_rate": 0.01714117845775548, + "loss": 0.788, + "num_input_tokens_seen": 39685360, + "step": 68345 + }, + { + "epoch": 10.180220434912124, + "grad_norm": 0.021728515625, + "learning_rate": 0.017139248752289192, + "loss": 0.8096, + "num_input_tokens_seen": 39688272, + "step": 68350 + }, + { + "epoch": 10.180965147453083, + "grad_norm": 0.032470703125, + "learning_rate": 0.017137319010682463, + "loss": 0.7801, + "num_input_tokens_seen": 39691536, + "step": 68355 + }, + { + "epoch": 10.181709859994042, + "grad_norm": 0.04345703125, + "learning_rate": 0.017135389232967906, + "loss": 0.808, + "num_input_tokens_seen": 39694384, + "step": 68360 + }, + { + "epoch": 10.182454572535, + "grad_norm": 0.035888671875, + "learning_rate": 0.01713345941917812, + "loss": 0.7854, + "num_input_tokens_seen": 39697136, + "step": 68365 + }, + { + "epoch": 10.183199285075961, + "grad_norm": 0.038330078125, + "learning_rate": 0.0171315295693457, + "loss": 0.7913, + "num_input_tokens_seen": 39700176, + "step": 68370 + }, + { + "epoch": 10.18394399761692, + "grad_norm": 0.031982421875, + "learning_rate": 0.01712959968350326, + "loss": 0.7957, + "num_input_tokens_seen": 39703152, + "step": 68375 + }, + { + "epoch": 10.184688710157879, + "grad_norm": 0.054931640625, + "learning_rate": 0.01712766976168339, + "loss": 0.8165, + "num_input_tokens_seen": 39706160, + "step": 68380 + }, + { + "epoch": 10.185433422698837, + "grad_norm": 0.0294189453125, + "learning_rate": 0.017125739803918707, + "loss": 0.8006, + "num_input_tokens_seen": 39709264, + "step": 68385 + }, + { + "epoch": 10.186178135239798, + "grad_norm": 0.04345703125, + "learning_rate": 0.017123809810241815, + "loss": 0.8175, + "num_input_tokens_seen": 39712112, + "step": 68390 + }, + { + "epoch": 10.186922847780757, + "grad_norm": 0.03955078125, + "learning_rate": 0.017121879780685308, + "loss": 0.7815, + "num_input_tokens_seen": 39714960, + "step": 68395 + }, + { + "epoch": 10.187667560321715, + "grad_norm": 0.039794921875, + "learning_rate": 0.017119949715281798, + "loss": 0.7735, + "num_input_tokens_seen": 39718032, + "step": 68400 + }, + { + "epoch": 10.188412272862674, + "grad_norm": 0.048095703125, + "learning_rate": 0.017118019614063895, + "loss": 0.8433, + "num_input_tokens_seen": 39721072, + "step": 68405 + }, + { + "epoch": 10.189156985403635, + "grad_norm": 0.02978515625, + "learning_rate": 0.01711608947706421, + "loss": 0.7886, + "num_input_tokens_seen": 39724176, + "step": 68410 + }, + { + "epoch": 10.189901697944594, + "grad_norm": 0.022216796875, + "learning_rate": 0.017114159304315337, + "loss": 0.8141, + "num_input_tokens_seen": 39726992, + "step": 68415 + }, + { + "epoch": 10.190646410485552, + "grad_norm": 0.0595703125, + "learning_rate": 0.017112229095849887, + "loss": 0.7986, + "num_input_tokens_seen": 39730064, + "step": 68420 + }, + { + "epoch": 10.191391123026511, + "grad_norm": 0.041748046875, + "learning_rate": 0.01711029885170048, + "loss": 0.79, + "num_input_tokens_seen": 39732976, + "step": 68425 + }, + { + "epoch": 10.192135835567472, + "grad_norm": 0.0299072265625, + "learning_rate": 0.017108368571899717, + "loss": 0.7797, + "num_input_tokens_seen": 39736144, + "step": 68430 + }, + { + "epoch": 10.19288054810843, + "grad_norm": 0.048828125, + "learning_rate": 0.017106438256480207, + "loss": 0.7994, + "num_input_tokens_seen": 39738992, + "step": 68435 + }, + { + "epoch": 10.19362526064939, + "grad_norm": 0.02880859375, + "learning_rate": 0.017104507905474565, + "loss": 0.7811, + "num_input_tokens_seen": 39741808, + "step": 68440 + }, + { + "epoch": 10.194369973190348, + "grad_norm": 0.051025390625, + "learning_rate": 0.017102577518915404, + "loss": 0.8239, + "num_input_tokens_seen": 39744720, + "step": 68445 + }, + { + "epoch": 10.195114685731308, + "grad_norm": 0.0220947265625, + "learning_rate": 0.01710064709683533, + "loss": 0.7959, + "num_input_tokens_seen": 39747408, + "step": 68450 + }, + { + "epoch": 10.195859398272267, + "grad_norm": 0.02392578125, + "learning_rate": 0.01709871663926696, + "loss": 0.7915, + "num_input_tokens_seen": 39750128, + "step": 68455 + }, + { + "epoch": 10.196604110813226, + "grad_norm": 0.0439453125, + "learning_rate": 0.017096786146242902, + "loss": 0.8023, + "num_input_tokens_seen": 39753296, + "step": 68460 + }, + { + "epoch": 10.197348823354185, + "grad_norm": 0.02783203125, + "learning_rate": 0.017094855617795775, + "loss": 0.8327, + "num_input_tokens_seen": 39756016, + "step": 68465 + }, + { + "epoch": 10.198093535895145, + "grad_norm": 0.03857421875, + "learning_rate": 0.017092925053958197, + "loss": 0.7825, + "num_input_tokens_seen": 39759312, + "step": 68470 + }, + { + "epoch": 10.198838248436104, + "grad_norm": 0.02978515625, + "learning_rate": 0.017090994454762774, + "loss": 0.811, + "num_input_tokens_seen": 39762192, + "step": 68475 + }, + { + "epoch": 10.199582960977063, + "grad_norm": 0.04052734375, + "learning_rate": 0.01708906382024212, + "loss": 0.826, + "num_input_tokens_seen": 39765232, + "step": 68480 + }, + { + "epoch": 10.200327673518021, + "grad_norm": 0.0267333984375, + "learning_rate": 0.017087133150428858, + "loss": 0.8115, + "num_input_tokens_seen": 39768208, + "step": 68485 + }, + { + "epoch": 10.201072386058982, + "grad_norm": 0.040283203125, + "learning_rate": 0.017085202445355604, + "loss": 0.7896, + "num_input_tokens_seen": 39771312, + "step": 68490 + }, + { + "epoch": 10.20181709859994, + "grad_norm": 0.031494140625, + "learning_rate": 0.01708327170505497, + "loss": 0.8181, + "num_input_tokens_seen": 39774192, + "step": 68495 + }, + { + "epoch": 10.2025618111409, + "grad_norm": 0.0439453125, + "learning_rate": 0.017081340929559582, + "loss": 0.8041, + "num_input_tokens_seen": 39777424, + "step": 68500 + }, + { + "epoch": 10.203306523681858, + "grad_norm": 0.041748046875, + "learning_rate": 0.017079410118902058, + "loss": 0.7893, + "num_input_tokens_seen": 39780208, + "step": 68505 + }, + { + "epoch": 10.204051236222819, + "grad_norm": 0.034912109375, + "learning_rate": 0.01707747927311501, + "loss": 0.8063, + "num_input_tokens_seen": 39782928, + "step": 68510 + }, + { + "epoch": 10.204795948763778, + "grad_norm": 0.02197265625, + "learning_rate": 0.017075548392231063, + "loss": 0.8049, + "num_input_tokens_seen": 39785488, + "step": 68515 + }, + { + "epoch": 10.205540661304736, + "grad_norm": 0.04052734375, + "learning_rate": 0.017073617476282832, + "loss": 0.799, + "num_input_tokens_seen": 39788176, + "step": 68520 + }, + { + "epoch": 10.206285373845695, + "grad_norm": 0.041015625, + "learning_rate": 0.01707168652530294, + "loss": 0.7984, + "num_input_tokens_seen": 39790832, + "step": 68525 + }, + { + "epoch": 10.207030086386654, + "grad_norm": 0.036376953125, + "learning_rate": 0.01706975553932401, + "loss": 0.8041, + "num_input_tokens_seen": 39793968, + "step": 68530 + }, + { + "epoch": 10.207774798927614, + "grad_norm": 0.038330078125, + "learning_rate": 0.01706782451837867, + "loss": 0.8196, + "num_input_tokens_seen": 39796816, + "step": 68535 + }, + { + "epoch": 10.208519511468573, + "grad_norm": 0.0400390625, + "learning_rate": 0.01706589346249953, + "loss": 0.7919, + "num_input_tokens_seen": 39799824, + "step": 68540 + }, + { + "epoch": 10.209264224009532, + "grad_norm": 0.027587890625, + "learning_rate": 0.01706396237171922, + "loss": 0.7981, + "num_input_tokens_seen": 39802896, + "step": 68545 + }, + { + "epoch": 10.21000893655049, + "grad_norm": 0.0233154296875, + "learning_rate": 0.017062031246070367, + "loss": 0.781, + "num_input_tokens_seen": 39806128, + "step": 68550 + }, + { + "epoch": 10.210753649091451, + "grad_norm": 0.0380859375, + "learning_rate": 0.017060100085585592, + "loss": 0.7974, + "num_input_tokens_seen": 39809040, + "step": 68555 + }, + { + "epoch": 10.21149836163241, + "grad_norm": 0.03466796875, + "learning_rate": 0.017058168890297515, + "loss": 0.7837, + "num_input_tokens_seen": 39811920, + "step": 68560 + }, + { + "epoch": 10.212243074173369, + "grad_norm": 0.02392578125, + "learning_rate": 0.017056237660238772, + "loss": 0.8015, + "num_input_tokens_seen": 39814672, + "step": 68565 + }, + { + "epoch": 10.212987786714327, + "grad_norm": 0.037841796875, + "learning_rate": 0.017054306395441976, + "loss": 0.7854, + "num_input_tokens_seen": 39817456, + "step": 68570 + }, + { + "epoch": 10.213732499255288, + "grad_norm": 0.04296875, + "learning_rate": 0.01705237509593977, + "loss": 0.8019, + "num_input_tokens_seen": 39820496, + "step": 68575 + }, + { + "epoch": 10.214477211796247, + "grad_norm": 0.040771484375, + "learning_rate": 0.01705044376176477, + "loss": 0.7951, + "num_input_tokens_seen": 39823120, + "step": 68580 + }, + { + "epoch": 10.215221924337206, + "grad_norm": 0.049560546875, + "learning_rate": 0.0170485123929496, + "loss": 0.7989, + "num_input_tokens_seen": 39826000, + "step": 68585 + }, + { + "epoch": 10.215966636878164, + "grad_norm": 0.051513671875, + "learning_rate": 0.017046580989526907, + "loss": 0.7964, + "num_input_tokens_seen": 39828720, + "step": 68590 + }, + { + "epoch": 10.216711349419125, + "grad_norm": 0.044921875, + "learning_rate": 0.017044649551529303, + "loss": 0.7892, + "num_input_tokens_seen": 39831984, + "step": 68595 + }, + { + "epoch": 10.217456061960084, + "grad_norm": 0.0341796875, + "learning_rate": 0.01704271807898942, + "loss": 0.8013, + "num_input_tokens_seen": 39835056, + "step": 68600 + }, + { + "epoch": 10.218200774501042, + "grad_norm": 0.0732421875, + "learning_rate": 0.017040786571939893, + "loss": 0.7971, + "num_input_tokens_seen": 39838032, + "step": 68605 + }, + { + "epoch": 10.218945487042001, + "grad_norm": 0.032958984375, + "learning_rate": 0.017038855030413354, + "loss": 0.8479, + "num_input_tokens_seen": 39840912, + "step": 68610 + }, + { + "epoch": 10.219690199582962, + "grad_norm": 0.034912109375, + "learning_rate": 0.01703692345444243, + "loss": 0.8159, + "num_input_tokens_seen": 39843600, + "step": 68615 + }, + { + "epoch": 10.22043491212392, + "grad_norm": 0.047119140625, + "learning_rate": 0.01703499184405975, + "loss": 0.8045, + "num_input_tokens_seen": 39846416, + "step": 68620 + }, + { + "epoch": 10.22117962466488, + "grad_norm": 0.038330078125, + "learning_rate": 0.017033060199297952, + "loss": 0.8075, + "num_input_tokens_seen": 39849264, + "step": 68625 + }, + { + "epoch": 10.221924337205838, + "grad_norm": 0.041015625, + "learning_rate": 0.01703112852018967, + "loss": 0.8083, + "num_input_tokens_seen": 39852016, + "step": 68630 + }, + { + "epoch": 10.222669049746798, + "grad_norm": 0.1611328125, + "learning_rate": 0.017029196806767538, + "loss": 0.8244, + "num_input_tokens_seen": 39854896, + "step": 68635 + }, + { + "epoch": 10.223413762287757, + "grad_norm": 0.0859375, + "learning_rate": 0.01702726505906419, + "loss": 0.8162, + "num_input_tokens_seen": 39857776, + "step": 68640 + }, + { + "epoch": 10.224158474828716, + "grad_norm": 0.04150390625, + "learning_rate": 0.017025333277112257, + "loss": 0.7991, + "num_input_tokens_seen": 39860688, + "step": 68645 + }, + { + "epoch": 10.224903187369675, + "grad_norm": 0.037353515625, + "learning_rate": 0.017023401460944373, + "loss": 0.7917, + "num_input_tokens_seen": 39863504, + "step": 68650 + }, + { + "epoch": 10.225647899910635, + "grad_norm": 0.02294921875, + "learning_rate": 0.01702146961059318, + "loss": 0.8002, + "num_input_tokens_seen": 39866544, + "step": 68655 + }, + { + "epoch": 10.226392612451594, + "grad_norm": 0.0286865234375, + "learning_rate": 0.01701953772609131, + "loss": 0.7923, + "num_input_tokens_seen": 39869392, + "step": 68660 + }, + { + "epoch": 10.227137324992553, + "grad_norm": 0.03271484375, + "learning_rate": 0.017017605807471407, + "loss": 0.7973, + "num_input_tokens_seen": 39872816, + "step": 68665 + }, + { + "epoch": 10.227882037533512, + "grad_norm": 0.0308837890625, + "learning_rate": 0.017015673854766107, + "loss": 0.7937, + "num_input_tokens_seen": 39875696, + "step": 68670 + }, + { + "epoch": 10.228626750074472, + "grad_norm": 0.0194091796875, + "learning_rate": 0.017013741868008043, + "loss": 0.8052, + "num_input_tokens_seen": 39878224, + "step": 68675 + }, + { + "epoch": 10.22937146261543, + "grad_norm": 0.052001953125, + "learning_rate": 0.017011809847229855, + "loss": 0.8077, + "num_input_tokens_seen": 39880880, + "step": 68680 + }, + { + "epoch": 10.23011617515639, + "grad_norm": 0.034423828125, + "learning_rate": 0.017009877792464184, + "loss": 0.7982, + "num_input_tokens_seen": 39883728, + "step": 68685 + }, + { + "epoch": 10.230860887697348, + "grad_norm": 0.035888671875, + "learning_rate": 0.01700794570374367, + "loss": 0.7886, + "num_input_tokens_seen": 39886768, + "step": 68690 + }, + { + "epoch": 10.231605600238307, + "grad_norm": 0.040771484375, + "learning_rate": 0.017006013581100956, + "loss": 0.7964, + "num_input_tokens_seen": 39889808, + "step": 68695 + }, + { + "epoch": 10.232350312779268, + "grad_norm": 0.049560546875, + "learning_rate": 0.01700408142456868, + "loss": 0.7819, + "num_input_tokens_seen": 39892784, + "step": 68700 + }, + { + "epoch": 10.233095025320226, + "grad_norm": 0.032470703125, + "learning_rate": 0.01700214923417949, + "loss": 0.7777, + "num_input_tokens_seen": 39895440, + "step": 68705 + }, + { + "epoch": 10.233839737861185, + "grad_norm": 0.0294189453125, + "learning_rate": 0.01700021700996602, + "loss": 0.7841, + "num_input_tokens_seen": 39898096, + "step": 68710 + }, + { + "epoch": 10.234584450402144, + "grad_norm": 0.033447265625, + "learning_rate": 0.016998284751960916, + "loss": 0.7881, + "num_input_tokens_seen": 39901424, + "step": 68715 + }, + { + "epoch": 10.235329162943104, + "grad_norm": 0.0537109375, + "learning_rate": 0.016996352460196824, + "loss": 0.8599, + "num_input_tokens_seen": 39904400, + "step": 68720 + }, + { + "epoch": 10.236073875484063, + "grad_norm": 0.0203857421875, + "learning_rate": 0.016994420134706385, + "loss": 0.7825, + "num_input_tokens_seen": 39906928, + "step": 68725 + }, + { + "epoch": 10.236818588025022, + "grad_norm": 0.045166015625, + "learning_rate": 0.016992487775522244, + "loss": 0.7965, + "num_input_tokens_seen": 39909744, + "step": 68730 + }, + { + "epoch": 10.23756330056598, + "grad_norm": 0.03271484375, + "learning_rate": 0.016990555382677044, + "loss": 0.7941, + "num_input_tokens_seen": 39912560, + "step": 68735 + }, + { + "epoch": 10.238308013106941, + "grad_norm": 0.0213623046875, + "learning_rate": 0.01698862295620344, + "loss": 0.7771, + "num_input_tokens_seen": 39915152, + "step": 68740 + }, + { + "epoch": 10.2390527256479, + "grad_norm": 0.0294189453125, + "learning_rate": 0.01698669049613407, + "loss": 0.7878, + "num_input_tokens_seen": 39918000, + "step": 68745 + }, + { + "epoch": 10.239797438188859, + "grad_norm": 0.03173828125, + "learning_rate": 0.016984758002501585, + "loss": 0.7801, + "num_input_tokens_seen": 39920624, + "step": 68750 + }, + { + "epoch": 10.240542150729818, + "grad_norm": 0.033447265625, + "learning_rate": 0.01698282547533863, + "loss": 0.7826, + "num_input_tokens_seen": 39923792, + "step": 68755 + }, + { + "epoch": 10.241286863270778, + "grad_norm": 0.042724609375, + "learning_rate": 0.01698089291467785, + "loss": 0.7735, + "num_input_tokens_seen": 39926608, + "step": 68760 + }, + { + "epoch": 10.242031575811737, + "grad_norm": 0.07421875, + "learning_rate": 0.016978960320551902, + "loss": 0.7747, + "num_input_tokens_seen": 39929296, + "step": 68765 + }, + { + "epoch": 10.242776288352696, + "grad_norm": 0.06201171875, + "learning_rate": 0.016977027692993425, + "loss": 0.8115, + "num_input_tokens_seen": 39932144, + "step": 68770 + }, + { + "epoch": 10.243521000893654, + "grad_norm": 0.043701171875, + "learning_rate": 0.016975095032035082, + "loss": 0.7851, + "num_input_tokens_seen": 39934992, + "step": 68775 + }, + { + "epoch": 10.244265713434615, + "grad_norm": 0.040771484375, + "learning_rate": 0.01697316233770951, + "loss": 0.7775, + "num_input_tokens_seen": 39938000, + "step": 68780 + }, + { + "epoch": 10.245010425975574, + "grad_norm": 0.042236328125, + "learning_rate": 0.01697122961004937, + "loss": 0.7671, + "num_input_tokens_seen": 39940880, + "step": 68785 + }, + { + "epoch": 10.245755138516532, + "grad_norm": 0.044677734375, + "learning_rate": 0.016969296849087306, + "loss": 0.8363, + "num_input_tokens_seen": 39943536, + "step": 68790 + }, + { + "epoch": 10.246499851057491, + "grad_norm": 0.041259765625, + "learning_rate": 0.016967364054855976, + "loss": 0.7993, + "num_input_tokens_seen": 39946352, + "step": 68795 + }, + { + "epoch": 10.247244563598452, + "grad_norm": 0.059326171875, + "learning_rate": 0.016965431227388028, + "loss": 0.8043, + "num_input_tokens_seen": 39949200, + "step": 68800 + }, + { + "epoch": 10.24798927613941, + "grad_norm": 0.03759765625, + "learning_rate": 0.01696349836671612, + "loss": 0.7954, + "num_input_tokens_seen": 39952144, + "step": 68805 + }, + { + "epoch": 10.24873398868037, + "grad_norm": 0.044677734375, + "learning_rate": 0.0169615654728729, + "loss": 0.7709, + "num_input_tokens_seen": 39955376, + "step": 68810 + }, + { + "epoch": 10.249478701221328, + "grad_norm": 0.0247802734375, + "learning_rate": 0.016959632545891022, + "loss": 0.7987, + "num_input_tokens_seen": 39958096, + "step": 68815 + }, + { + "epoch": 10.250223413762289, + "grad_norm": 0.044921875, + "learning_rate": 0.016957699585803145, + "loss": 0.7583, + "num_input_tokens_seen": 39960816, + "step": 68820 + }, + { + "epoch": 10.250968126303247, + "grad_norm": 0.037109375, + "learning_rate": 0.016955766592641926, + "loss": 0.8028, + "num_input_tokens_seen": 39963984, + "step": 68825 + }, + { + "epoch": 10.251712838844206, + "grad_norm": 0.0380859375, + "learning_rate": 0.016953833566440018, + "loss": 0.7738, + "num_input_tokens_seen": 39966928, + "step": 68830 + }, + { + "epoch": 10.252457551385165, + "grad_norm": 0.036376953125, + "learning_rate": 0.01695190050723008, + "loss": 0.7712, + "num_input_tokens_seen": 39969840, + "step": 68835 + }, + { + "epoch": 10.253202263926125, + "grad_norm": 0.04248046875, + "learning_rate": 0.016949967415044762, + "loss": 0.8295, + "num_input_tokens_seen": 39972784, + "step": 68840 + }, + { + "epoch": 10.253946976467084, + "grad_norm": 0.037353515625, + "learning_rate": 0.016948034289916727, + "loss": 0.7968, + "num_input_tokens_seen": 39975632, + "step": 68845 + }, + { + "epoch": 10.254691689008043, + "grad_norm": 0.03564453125, + "learning_rate": 0.016946101131878635, + "loss": 0.8, + "num_input_tokens_seen": 39978480, + "step": 68850 + }, + { + "epoch": 10.255436401549002, + "grad_norm": 0.0458984375, + "learning_rate": 0.01694416794096314, + "loss": 0.8193, + "num_input_tokens_seen": 39981328, + "step": 68855 + }, + { + "epoch": 10.256181114089962, + "grad_norm": 0.0322265625, + "learning_rate": 0.0169422347172029, + "loss": 0.7924, + "num_input_tokens_seen": 39984048, + "step": 68860 + }, + { + "epoch": 10.256925826630921, + "grad_norm": 0.0250244140625, + "learning_rate": 0.016940301460630584, + "loss": 0.8132, + "num_input_tokens_seen": 39986928, + "step": 68865 + }, + { + "epoch": 10.25767053917188, + "grad_norm": 0.042236328125, + "learning_rate": 0.01693836817127885, + "loss": 0.7956, + "num_input_tokens_seen": 39990096, + "step": 68870 + }, + { + "epoch": 10.258415251712838, + "grad_norm": 0.056640625, + "learning_rate": 0.016936434849180355, + "loss": 0.8036, + "num_input_tokens_seen": 39992912, + "step": 68875 + }, + { + "epoch": 10.259159964253797, + "grad_norm": 0.03564453125, + "learning_rate": 0.016934501494367756, + "loss": 0.7808, + "num_input_tokens_seen": 39995920, + "step": 68880 + }, + { + "epoch": 10.259904676794758, + "grad_norm": 0.0198974609375, + "learning_rate": 0.01693256810687372, + "loss": 0.8096, + "num_input_tokens_seen": 39999088, + "step": 68885 + }, + { + "epoch": 10.260649389335716, + "grad_norm": 0.036865234375, + "learning_rate": 0.01693063468673091, + "loss": 0.7999, + "num_input_tokens_seen": 40002192, + "step": 68890 + }, + { + "epoch": 10.261394101876675, + "grad_norm": 0.0830078125, + "learning_rate": 0.016928701233971992, + "loss": 0.7696, + "num_input_tokens_seen": 40005040, + "step": 68895 + }, + { + "epoch": 10.262138814417634, + "grad_norm": 0.0294189453125, + "learning_rate": 0.01692676774862963, + "loss": 0.7732, + "num_input_tokens_seen": 40007760, + "step": 68900 + }, + { + "epoch": 10.262883526958595, + "grad_norm": 0.042724609375, + "learning_rate": 0.01692483423073648, + "loss": 0.7759, + "num_input_tokens_seen": 40010640, + "step": 68905 + }, + { + "epoch": 10.263628239499553, + "grad_norm": 0.046630859375, + "learning_rate": 0.016922900680325214, + "loss": 0.7858, + "num_input_tokens_seen": 40013488, + "step": 68910 + }, + { + "epoch": 10.264372952040512, + "grad_norm": 0.03662109375, + "learning_rate": 0.016920967097428498, + "loss": 0.8278, + "num_input_tokens_seen": 40016368, + "step": 68915 + }, + { + "epoch": 10.26511766458147, + "grad_norm": 0.036865234375, + "learning_rate": 0.01691903348207899, + "loss": 0.7872, + "num_input_tokens_seen": 40019376, + "step": 68920 + }, + { + "epoch": 10.265862377122431, + "grad_norm": 0.037109375, + "learning_rate": 0.016917099834309363, + "loss": 0.7823, + "num_input_tokens_seen": 40022768, + "step": 68925 + }, + { + "epoch": 10.26660708966339, + "grad_norm": 0.035400390625, + "learning_rate": 0.01691516615415228, + "loss": 0.7876, + "num_input_tokens_seen": 40025616, + "step": 68930 + }, + { + "epoch": 10.267351802204349, + "grad_norm": 0.049072265625, + "learning_rate": 0.016913232441640415, + "loss": 0.8443, + "num_input_tokens_seen": 40028560, + "step": 68935 + }, + { + "epoch": 10.268096514745308, + "grad_norm": 0.03564453125, + "learning_rate": 0.016911298696806437, + "loss": 0.7819, + "num_input_tokens_seen": 40031600, + "step": 68940 + }, + { + "epoch": 10.268841227286268, + "grad_norm": 0.040771484375, + "learning_rate": 0.016909364919683003, + "loss": 0.8111, + "num_input_tokens_seen": 40034576, + "step": 68945 + }, + { + "epoch": 10.269585939827227, + "grad_norm": 0.06103515625, + "learning_rate": 0.016907431110302792, + "loss": 0.806, + "num_input_tokens_seen": 40037584, + "step": 68950 + }, + { + "epoch": 10.270330652368186, + "grad_norm": 0.0247802734375, + "learning_rate": 0.016905497268698475, + "loss": 0.8112, + "num_input_tokens_seen": 40040496, + "step": 68955 + }, + { + "epoch": 10.271075364909144, + "grad_norm": 0.038818359375, + "learning_rate": 0.016903563394902715, + "loss": 0.8009, + "num_input_tokens_seen": 40043696, + "step": 68960 + }, + { + "epoch": 10.271820077450105, + "grad_norm": 0.037109375, + "learning_rate": 0.016901629488948182, + "loss": 0.7814, + "num_input_tokens_seen": 40046480, + "step": 68965 + }, + { + "epoch": 10.272564789991064, + "grad_norm": 0.041015625, + "learning_rate": 0.016899695550867556, + "loss": 0.8004, + "num_input_tokens_seen": 40049488, + "step": 68970 + }, + { + "epoch": 10.273309502532022, + "grad_norm": 0.0556640625, + "learning_rate": 0.0168977615806935, + "loss": 0.8304, + "num_input_tokens_seen": 40052336, + "step": 68975 + }, + { + "epoch": 10.274054215072981, + "grad_norm": 0.040283203125, + "learning_rate": 0.016895827578458696, + "loss": 0.7952, + "num_input_tokens_seen": 40055120, + "step": 68980 + }, + { + "epoch": 10.274798927613942, + "grad_norm": 0.037353515625, + "learning_rate": 0.01689389354419581, + "loss": 0.7568, + "num_input_tokens_seen": 40057808, + "step": 68985 + }, + { + "epoch": 10.2755436401549, + "grad_norm": 0.0301513671875, + "learning_rate": 0.016891959477937514, + "loss": 0.8376, + "num_input_tokens_seen": 40060528, + "step": 68990 + }, + { + "epoch": 10.27628835269586, + "grad_norm": 0.0220947265625, + "learning_rate": 0.016890025379716494, + "loss": 0.7837, + "num_input_tokens_seen": 40063280, + "step": 68995 + }, + { + "epoch": 10.277033065236818, + "grad_norm": 0.0303955078125, + "learning_rate": 0.016888091249565407, + "loss": 0.8033, + "num_input_tokens_seen": 40066256, + "step": 69000 + }, + { + "epoch": 10.277777777777779, + "grad_norm": 0.041748046875, + "learning_rate": 0.016886157087516947, + "loss": 0.8035, + "num_input_tokens_seen": 40069232, + "step": 69005 + }, + { + "epoch": 10.278522490318737, + "grad_norm": 0.0380859375, + "learning_rate": 0.01688422289360377, + "loss": 0.8301, + "num_input_tokens_seen": 40071952, + "step": 69010 + }, + { + "epoch": 10.279267202859696, + "grad_norm": 0.041259765625, + "learning_rate": 0.01688228866785857, + "loss": 0.8221, + "num_input_tokens_seen": 40074768, + "step": 69015 + }, + { + "epoch": 10.280011915400655, + "grad_norm": 0.0322265625, + "learning_rate": 0.016880354410314012, + "loss": 0.8088, + "num_input_tokens_seen": 40077616, + "step": 69020 + }, + { + "epoch": 10.280756627941615, + "grad_norm": 0.0341796875, + "learning_rate": 0.016878420121002775, + "loss": 0.8041, + "num_input_tokens_seen": 40080560, + "step": 69025 + }, + { + "epoch": 10.281501340482574, + "grad_norm": 0.033935546875, + "learning_rate": 0.01687648579995754, + "loss": 0.7874, + "num_input_tokens_seen": 40083408, + "step": 69030 + }, + { + "epoch": 10.282246053023533, + "grad_norm": 0.04052734375, + "learning_rate": 0.016874551447210992, + "loss": 0.8077, + "num_input_tokens_seen": 40086352, + "step": 69035 + }, + { + "epoch": 10.282990765564492, + "grad_norm": 0.0390625, + "learning_rate": 0.0168726170627958, + "loss": 0.8116, + "num_input_tokens_seen": 40089264, + "step": 69040 + }, + { + "epoch": 10.283735478105452, + "grad_norm": 0.0218505859375, + "learning_rate": 0.016870682646744642, + "loss": 0.7984, + "num_input_tokens_seen": 40092272, + "step": 69045 + }, + { + "epoch": 10.284480190646411, + "grad_norm": 0.043212890625, + "learning_rate": 0.016868748199090202, + "loss": 0.7954, + "num_input_tokens_seen": 40095056, + "step": 69050 + }, + { + "epoch": 10.28522490318737, + "grad_norm": 0.03515625, + "learning_rate": 0.01686681371986516, + "loss": 0.7896, + "num_input_tokens_seen": 40098288, + "step": 69055 + }, + { + "epoch": 10.285969615728328, + "grad_norm": 0.04150390625, + "learning_rate": 0.0168648792091022, + "loss": 0.7924, + "num_input_tokens_seen": 40101232, + "step": 69060 + }, + { + "epoch": 10.286714328269287, + "grad_norm": 0.0203857421875, + "learning_rate": 0.016862944666834, + "loss": 0.8187, + "num_input_tokens_seen": 40104048, + "step": 69065 + }, + { + "epoch": 10.287459040810248, + "grad_norm": 0.038330078125, + "learning_rate": 0.016861010093093244, + "loss": 0.7829, + "num_input_tokens_seen": 40107024, + "step": 69070 + }, + { + "epoch": 10.288203753351207, + "grad_norm": 0.037841796875, + "learning_rate": 0.016859075487912614, + "loss": 0.8043, + "num_input_tokens_seen": 40109904, + "step": 69075 + }, + { + "epoch": 10.288948465892165, + "grad_norm": 0.0390625, + "learning_rate": 0.016857140851324795, + "loss": 0.7974, + "num_input_tokens_seen": 40112816, + "step": 69080 + }, + { + "epoch": 10.289693178433124, + "grad_norm": 0.03662109375, + "learning_rate": 0.016855206183362465, + "loss": 0.8022, + "num_input_tokens_seen": 40115664, + "step": 69085 + }, + { + "epoch": 10.290437890974085, + "grad_norm": 0.034912109375, + "learning_rate": 0.016853271484058314, + "loss": 0.802, + "num_input_tokens_seen": 40118128, + "step": 69090 + }, + { + "epoch": 10.291182603515043, + "grad_norm": 0.034912109375, + "learning_rate": 0.016851336753445023, + "loss": 0.7991, + "num_input_tokens_seen": 40121232, + "step": 69095 + }, + { + "epoch": 10.291927316056002, + "grad_norm": 0.027587890625, + "learning_rate": 0.01684940199155528, + "loss": 0.8127, + "num_input_tokens_seen": 40124304, + "step": 69100 + }, + { + "epoch": 10.29267202859696, + "grad_norm": 0.035400390625, + "learning_rate": 0.016847467198421773, + "loss": 0.7807, + "num_input_tokens_seen": 40127056, + "step": 69105 + }, + { + "epoch": 10.293416741137921, + "grad_norm": 0.034912109375, + "learning_rate": 0.016845532374077182, + "loss": 0.851, + "num_input_tokens_seen": 40129872, + "step": 69110 + }, + { + "epoch": 10.29416145367888, + "grad_norm": 0.031005859375, + "learning_rate": 0.016843597518554198, + "loss": 0.7902, + "num_input_tokens_seen": 40132912, + "step": 69115 + }, + { + "epoch": 10.294906166219839, + "grad_norm": 0.0255126953125, + "learning_rate": 0.01684166263188551, + "loss": 0.8041, + "num_input_tokens_seen": 40135568, + "step": 69120 + }, + { + "epoch": 10.295650878760798, + "grad_norm": 0.045654296875, + "learning_rate": 0.0168397277141038, + "loss": 0.8035, + "num_input_tokens_seen": 40138192, + "step": 69125 + }, + { + "epoch": 10.296395591301758, + "grad_norm": 0.03955078125, + "learning_rate": 0.01683779276524176, + "loss": 0.8076, + "num_input_tokens_seen": 40141264, + "step": 69130 + }, + { + "epoch": 10.297140303842717, + "grad_norm": 0.03369140625, + "learning_rate": 0.016835857785332087, + "loss": 0.776, + "num_input_tokens_seen": 40144112, + "step": 69135 + }, + { + "epoch": 10.297885016383676, + "grad_norm": 0.0230712890625, + "learning_rate": 0.016833922774407453, + "loss": 0.7984, + "num_input_tokens_seen": 40147120, + "step": 69140 + }, + { + "epoch": 10.298629728924634, + "grad_norm": 0.0281982421875, + "learning_rate": 0.01683198773250056, + "loss": 0.7814, + "num_input_tokens_seen": 40150064, + "step": 69145 + }, + { + "epoch": 10.299374441465595, + "grad_norm": 0.0439453125, + "learning_rate": 0.016830052659644098, + "loss": 0.8339, + "num_input_tokens_seen": 40153040, + "step": 69150 + }, + { + "epoch": 10.300119154006554, + "grad_norm": 0.01806640625, + "learning_rate": 0.016828117555870752, + "loss": 0.8105, + "num_input_tokens_seen": 40156144, + "step": 69155 + }, + { + "epoch": 10.300863866547513, + "grad_norm": 0.0255126953125, + "learning_rate": 0.016826182421213223, + "loss": 0.7789, + "num_input_tokens_seen": 40159056, + "step": 69160 + }, + { + "epoch": 10.301608579088471, + "grad_norm": 0.03515625, + "learning_rate": 0.016824247255704194, + "loss": 0.8041, + "num_input_tokens_seen": 40161968, + "step": 69165 + }, + { + "epoch": 10.302353291629432, + "grad_norm": 0.0291748046875, + "learning_rate": 0.01682231205937637, + "loss": 0.7938, + "num_input_tokens_seen": 40165008, + "step": 69170 + }, + { + "epoch": 10.30309800417039, + "grad_norm": 0.029541015625, + "learning_rate": 0.01682037683226243, + "loss": 0.8059, + "num_input_tokens_seen": 40168048, + "step": 69175 + }, + { + "epoch": 10.30384271671135, + "grad_norm": 0.03759765625, + "learning_rate": 0.016818441574395073, + "loss": 0.766, + "num_input_tokens_seen": 40170832, + "step": 69180 + }, + { + "epoch": 10.304587429252308, + "grad_norm": 0.02783203125, + "learning_rate": 0.01681650628580699, + "loss": 0.7922, + "num_input_tokens_seen": 40173840, + "step": 69185 + }, + { + "epoch": 10.305332141793269, + "grad_norm": 0.045166015625, + "learning_rate": 0.016814570966530884, + "loss": 0.7846, + "num_input_tokens_seen": 40176688, + "step": 69190 + }, + { + "epoch": 10.306076854334227, + "grad_norm": 0.037353515625, + "learning_rate": 0.016812635616599448, + "loss": 0.8087, + "num_input_tokens_seen": 40179184, + "step": 69195 + }, + { + "epoch": 10.306821566875186, + "grad_norm": 0.038330078125, + "learning_rate": 0.016810700236045376, + "loss": 0.7832, + "num_input_tokens_seen": 40182128, + "step": 69200 + }, + { + "epoch": 10.307566279416145, + "grad_norm": 0.037841796875, + "learning_rate": 0.016808764824901363, + "loss": 0.78, + "num_input_tokens_seen": 40185008, + "step": 69205 + }, + { + "epoch": 10.308310991957104, + "grad_norm": 0.03955078125, + "learning_rate": 0.016806829383200102, + "loss": 0.7728, + "num_input_tokens_seen": 40187792, + "step": 69210 + }, + { + "epoch": 10.309055704498064, + "grad_norm": 0.0732421875, + "learning_rate": 0.016804893910974297, + "loss": 0.8032, + "num_input_tokens_seen": 40190896, + "step": 69215 + }, + { + "epoch": 10.309800417039023, + "grad_norm": 0.041259765625, + "learning_rate": 0.016802958408256644, + "loss": 0.7853, + "num_input_tokens_seen": 40193936, + "step": 69220 + }, + { + "epoch": 10.310545129579982, + "grad_norm": 0.10595703125, + "learning_rate": 0.016801022875079837, + "loss": 0.8333, + "num_input_tokens_seen": 40196880, + "step": 69225 + }, + { + "epoch": 10.31128984212094, + "grad_norm": 0.038330078125, + "learning_rate": 0.016799087311476585, + "loss": 0.7748, + "num_input_tokens_seen": 40199696, + "step": 69230 + }, + { + "epoch": 10.312034554661901, + "grad_norm": 0.040283203125, + "learning_rate": 0.016797151717479582, + "loss": 0.7997, + "num_input_tokens_seen": 40202928, + "step": 69235 + }, + { + "epoch": 10.31277926720286, + "grad_norm": 0.03466796875, + "learning_rate": 0.01679521609312153, + "loss": 0.8185, + "num_input_tokens_seen": 40205872, + "step": 69240 + }, + { + "epoch": 10.313523979743819, + "grad_norm": 0.039794921875, + "learning_rate": 0.01679328043843512, + "loss": 0.8179, + "num_input_tokens_seen": 40208656, + "step": 69245 + }, + { + "epoch": 10.314268692284777, + "grad_norm": 0.0311279296875, + "learning_rate": 0.01679134475345306, + "loss": 0.8053, + "num_input_tokens_seen": 40212016, + "step": 69250 + }, + { + "epoch": 10.315013404825738, + "grad_norm": 0.0380859375, + "learning_rate": 0.01678940903820805, + "loss": 0.797, + "num_input_tokens_seen": 40214928, + "step": 69255 + }, + { + "epoch": 10.315758117366697, + "grad_norm": 0.0595703125, + "learning_rate": 0.016787473292732794, + "loss": 0.8192, + "num_input_tokens_seen": 40218000, + "step": 69260 + }, + { + "epoch": 10.316502829907655, + "grad_norm": 0.0390625, + "learning_rate": 0.016785537517059995, + "loss": 0.779, + "num_input_tokens_seen": 40221072, + "step": 69265 + }, + { + "epoch": 10.317247542448614, + "grad_norm": 0.03173828125, + "learning_rate": 0.016783601711222357, + "loss": 0.8136, + "num_input_tokens_seen": 40223888, + "step": 69270 + }, + { + "epoch": 10.317992254989575, + "grad_norm": 0.03955078125, + "learning_rate": 0.016781665875252576, + "loss": 0.7996, + "num_input_tokens_seen": 40226480, + "step": 69275 + }, + { + "epoch": 10.318736967530533, + "grad_norm": 0.0361328125, + "learning_rate": 0.016779730009183366, + "loss": 0.8024, + "num_input_tokens_seen": 40229584, + "step": 69280 + }, + { + "epoch": 10.319481680071492, + "grad_norm": 0.030517578125, + "learning_rate": 0.016777794113047424, + "loss": 0.7857, + "num_input_tokens_seen": 40232528, + "step": 69285 + }, + { + "epoch": 10.320226392612451, + "grad_norm": 0.041015625, + "learning_rate": 0.016775858186877456, + "loss": 0.8125, + "num_input_tokens_seen": 40235248, + "step": 69290 + }, + { + "epoch": 10.320971105153411, + "grad_norm": 0.040283203125, + "learning_rate": 0.01677392223070617, + "loss": 0.7935, + "num_input_tokens_seen": 40238096, + "step": 69295 + }, + { + "epoch": 10.32171581769437, + "grad_norm": 0.0269775390625, + "learning_rate": 0.016771986244566273, + "loss": 0.818, + "num_input_tokens_seen": 40240720, + "step": 69300 + }, + { + "epoch": 10.322460530235329, + "grad_norm": 0.0255126953125, + "learning_rate": 0.016770050228490464, + "loss": 0.7964, + "num_input_tokens_seen": 40243728, + "step": 69305 + }, + { + "epoch": 10.323205242776288, + "grad_norm": 0.038818359375, + "learning_rate": 0.016768114182511458, + "loss": 0.8012, + "num_input_tokens_seen": 40246480, + "step": 69310 + }, + { + "epoch": 10.323949955317248, + "grad_norm": 0.0283203125, + "learning_rate": 0.016766178106661957, + "loss": 0.806, + "num_input_tokens_seen": 40249552, + "step": 69315 + }, + { + "epoch": 10.324694667858207, + "grad_norm": 0.0228271484375, + "learning_rate": 0.016764242000974677, + "loss": 0.8029, + "num_input_tokens_seen": 40252624, + "step": 69320 + }, + { + "epoch": 10.325439380399166, + "grad_norm": 0.037841796875, + "learning_rate": 0.01676230586548232, + "loss": 0.8199, + "num_input_tokens_seen": 40255664, + "step": 69325 + }, + { + "epoch": 10.326184092940125, + "grad_norm": 0.047119140625, + "learning_rate": 0.016760369700217596, + "loss": 0.8064, + "num_input_tokens_seen": 40258608, + "step": 69330 + }, + { + "epoch": 10.326928805481085, + "grad_norm": 0.018310546875, + "learning_rate": 0.016758433505213215, + "loss": 0.7883, + "num_input_tokens_seen": 40261264, + "step": 69335 + }, + { + "epoch": 10.327673518022044, + "grad_norm": 0.032470703125, + "learning_rate": 0.016756497280501888, + "loss": 0.7991, + "num_input_tokens_seen": 40264336, + "step": 69340 + }, + { + "epoch": 10.328418230563003, + "grad_norm": 0.0308837890625, + "learning_rate": 0.016754561026116322, + "loss": 0.7875, + "num_input_tokens_seen": 40267184, + "step": 69345 + }, + { + "epoch": 10.329162943103961, + "grad_norm": 0.06591796875, + "learning_rate": 0.01675262474208923, + "loss": 0.7963, + "num_input_tokens_seen": 40270000, + "step": 69350 + }, + { + "epoch": 10.329907655644922, + "grad_norm": 0.042236328125, + "learning_rate": 0.016750688428453328, + "loss": 0.7985, + "num_input_tokens_seen": 40272976, + "step": 69355 + }, + { + "epoch": 10.33065236818588, + "grad_norm": 0.028076171875, + "learning_rate": 0.016748752085241323, + "loss": 0.7922, + "num_input_tokens_seen": 40275696, + "step": 69360 + }, + { + "epoch": 10.33139708072684, + "grad_norm": 0.037841796875, + "learning_rate": 0.01674681571248593, + "loss": 0.7947, + "num_input_tokens_seen": 40278416, + "step": 69365 + }, + { + "epoch": 10.332141793267798, + "grad_norm": 0.028564453125, + "learning_rate": 0.016744879310219864, + "loss": 0.7784, + "num_input_tokens_seen": 40281392, + "step": 69370 + }, + { + "epoch": 10.332886505808759, + "grad_norm": 0.07958984375, + "learning_rate": 0.01674294287847583, + "loss": 0.8386, + "num_input_tokens_seen": 40284240, + "step": 69375 + }, + { + "epoch": 10.333631218349717, + "grad_norm": 0.03466796875, + "learning_rate": 0.01674100641728655, + "loss": 0.794, + "num_input_tokens_seen": 40287376, + "step": 69380 + }, + { + "epoch": 10.334375930890676, + "grad_norm": 0.043212890625, + "learning_rate": 0.01673906992668473, + "loss": 0.7951, + "num_input_tokens_seen": 40291056, + "step": 69385 + }, + { + "epoch": 10.335120643431635, + "grad_norm": 0.02001953125, + "learning_rate": 0.016737133406703096, + "loss": 0.7898, + "num_input_tokens_seen": 40293776, + "step": 69390 + }, + { + "epoch": 10.335865355972594, + "grad_norm": 0.031494140625, + "learning_rate": 0.01673519685737436, + "loss": 0.8078, + "num_input_tokens_seen": 40296720, + "step": 69395 + }, + { + "epoch": 10.336610068513554, + "grad_norm": 0.06591796875, + "learning_rate": 0.016733260278731236, + "loss": 0.7895, + "num_input_tokens_seen": 40299568, + "step": 69400 + }, + { + "epoch": 10.337354781054513, + "grad_norm": 0.044677734375, + "learning_rate": 0.016731323670806444, + "loss": 0.785, + "num_input_tokens_seen": 40302320, + "step": 69405 + }, + { + "epoch": 10.338099493595472, + "grad_norm": 0.0255126953125, + "learning_rate": 0.016729387033632696, + "loss": 0.7939, + "num_input_tokens_seen": 40304848, + "step": 69410 + }, + { + "epoch": 10.33884420613643, + "grad_norm": 0.04150390625, + "learning_rate": 0.01672745036724271, + "loss": 0.779, + "num_input_tokens_seen": 40307632, + "step": 69415 + }, + { + "epoch": 10.339588918677391, + "grad_norm": 0.027099609375, + "learning_rate": 0.016725513671669204, + "loss": 0.7735, + "num_input_tokens_seen": 40310960, + "step": 69420 + }, + { + "epoch": 10.34033363121835, + "grad_norm": 0.048828125, + "learning_rate": 0.016723576946944898, + "loss": 0.7717, + "num_input_tokens_seen": 40313744, + "step": 69425 + }, + { + "epoch": 10.341078343759309, + "grad_norm": 0.029541015625, + "learning_rate": 0.016721640193102513, + "loss": 0.7977, + "num_input_tokens_seen": 40316976, + "step": 69430 + }, + { + "epoch": 10.341823056300267, + "grad_norm": 0.05126953125, + "learning_rate": 0.01671970341017477, + "loss": 0.8091, + "num_input_tokens_seen": 40319664, + "step": 69435 + }, + { + "epoch": 10.342567768841228, + "grad_norm": 0.037109375, + "learning_rate": 0.01671776659819439, + "loss": 0.8043, + "num_input_tokens_seen": 40322768, + "step": 69440 + }, + { + "epoch": 10.343312481382187, + "grad_norm": 0.0498046875, + "learning_rate": 0.016715829757194076, + "loss": 0.8061, + "num_input_tokens_seen": 40325680, + "step": 69445 + }, + { + "epoch": 10.344057193923145, + "grad_norm": 0.03076171875, + "learning_rate": 0.01671389288720657, + "loss": 0.806, + "num_input_tokens_seen": 40328720, + "step": 69450 + }, + { + "epoch": 10.344801906464104, + "grad_norm": 0.044677734375, + "learning_rate": 0.016711955988264582, + "loss": 0.7886, + "num_input_tokens_seen": 40331408, + "step": 69455 + }, + { + "epoch": 10.345546619005065, + "grad_norm": 0.0296630859375, + "learning_rate": 0.01671001906040084, + "loss": 0.7881, + "num_input_tokens_seen": 40334384, + "step": 69460 + }, + { + "epoch": 10.346291331546023, + "grad_norm": 0.029541015625, + "learning_rate": 0.016708082103648065, + "loss": 0.7799, + "num_input_tokens_seen": 40337104, + "step": 69465 + }, + { + "epoch": 10.347036044086982, + "grad_norm": 0.0283203125, + "learning_rate": 0.016706145118038977, + "loss": 0.8057, + "num_input_tokens_seen": 40339888, + "step": 69470 + }, + { + "epoch": 10.347780756627941, + "grad_norm": 0.0277099609375, + "learning_rate": 0.016704208103606302, + "loss": 0.7794, + "num_input_tokens_seen": 40342992, + "step": 69475 + }, + { + "epoch": 10.348525469168901, + "grad_norm": 0.039794921875, + "learning_rate": 0.016702271060382766, + "loss": 0.7744, + "num_input_tokens_seen": 40345840, + "step": 69480 + }, + { + "epoch": 10.34927018170986, + "grad_norm": 0.037353515625, + "learning_rate": 0.016700333988401084, + "loss": 0.8245, + "num_input_tokens_seen": 40348880, + "step": 69485 + }, + { + "epoch": 10.350014894250819, + "grad_norm": 0.03515625, + "learning_rate": 0.016698396887693992, + "loss": 0.8011, + "num_input_tokens_seen": 40351664, + "step": 69490 + }, + { + "epoch": 10.350759606791778, + "grad_norm": 0.0303955078125, + "learning_rate": 0.01669645975829421, + "loss": 0.8089, + "num_input_tokens_seen": 40354576, + "step": 69495 + }, + { + "epoch": 10.351504319332738, + "grad_norm": 0.05126953125, + "learning_rate": 0.016694522600234466, + "loss": 0.8223, + "num_input_tokens_seen": 40357296, + "step": 69500 + }, + { + "epoch": 10.352249031873697, + "grad_norm": 0.03857421875, + "learning_rate": 0.01669258541354748, + "loss": 0.7925, + "num_input_tokens_seen": 40360624, + "step": 69505 + }, + { + "epoch": 10.352993744414656, + "grad_norm": 0.0263671875, + "learning_rate": 0.01669064819826599, + "loss": 0.8166, + "num_input_tokens_seen": 40363664, + "step": 69510 + }, + { + "epoch": 10.353738456955615, + "grad_norm": 0.0224609375, + "learning_rate": 0.01668871095442271, + "loss": 0.8176, + "num_input_tokens_seen": 40366416, + "step": 69515 + }, + { + "epoch": 10.354483169496575, + "grad_norm": 0.0289306640625, + "learning_rate": 0.016686773682050383, + "loss": 0.7807, + "num_input_tokens_seen": 40369168, + "step": 69520 + }, + { + "epoch": 10.355227882037534, + "grad_norm": 0.03857421875, + "learning_rate": 0.016684836381181723, + "loss": 0.8332, + "num_input_tokens_seen": 40371984, + "step": 69525 + }, + { + "epoch": 10.355972594578493, + "grad_norm": 0.04052734375, + "learning_rate": 0.016682899051849467, + "loss": 0.812, + "num_input_tokens_seen": 40374768, + "step": 69530 + }, + { + "epoch": 10.356717307119451, + "grad_norm": 0.1591796875, + "learning_rate": 0.016680961694086338, + "loss": 0.8262, + "num_input_tokens_seen": 40377648, + "step": 69535 + }, + { + "epoch": 10.357462019660412, + "grad_norm": 0.0439453125, + "learning_rate": 0.016679024307925075, + "loss": 0.7959, + "num_input_tokens_seen": 40380144, + "step": 69540 + }, + { + "epoch": 10.35820673220137, + "grad_norm": 0.0546875, + "learning_rate": 0.0166770868933984, + "loss": 0.8045, + "num_input_tokens_seen": 40383152, + "step": 69545 + }, + { + "epoch": 10.35895144474233, + "grad_norm": 0.050537109375, + "learning_rate": 0.016675149450539047, + "loss": 0.8053, + "num_input_tokens_seen": 40386064, + "step": 69550 + }, + { + "epoch": 10.359696157283288, + "grad_norm": 0.037841796875, + "learning_rate": 0.016673211979379744, + "loss": 0.8315, + "num_input_tokens_seen": 40389136, + "step": 69555 + }, + { + "epoch": 10.360440869824249, + "grad_norm": 0.027099609375, + "learning_rate": 0.01667127447995323, + "loss": 0.8082, + "num_input_tokens_seen": 40391856, + "step": 69560 + }, + { + "epoch": 10.361185582365207, + "grad_norm": 0.04150390625, + "learning_rate": 0.01666933695229223, + "loss": 0.7775, + "num_input_tokens_seen": 40394832, + "step": 69565 + }, + { + "epoch": 10.361930294906166, + "grad_norm": 0.036865234375, + "learning_rate": 0.016667399396429482, + "loss": 0.7993, + "num_input_tokens_seen": 40397776, + "step": 69570 + }, + { + "epoch": 10.362675007447125, + "grad_norm": 0.042236328125, + "learning_rate": 0.01666546181239771, + "loss": 0.8132, + "num_input_tokens_seen": 40400720, + "step": 69575 + }, + { + "epoch": 10.363419719988084, + "grad_norm": 0.03466796875, + "learning_rate": 0.016663524200229658, + "loss": 0.799, + "num_input_tokens_seen": 40403472, + "step": 69580 + }, + { + "epoch": 10.364164432529044, + "grad_norm": 0.031494140625, + "learning_rate": 0.016661586559958055, + "loss": 0.7963, + "num_input_tokens_seen": 40406160, + "step": 69585 + }, + { + "epoch": 10.364909145070003, + "grad_norm": 0.037353515625, + "learning_rate": 0.01665964889161563, + "loss": 0.7795, + "num_input_tokens_seen": 40409008, + "step": 69590 + }, + { + "epoch": 10.365653857610962, + "grad_norm": 0.041748046875, + "learning_rate": 0.016657711195235125, + "loss": 0.795, + "num_input_tokens_seen": 40411856, + "step": 69595 + }, + { + "epoch": 10.36639857015192, + "grad_norm": 0.041259765625, + "learning_rate": 0.016655773470849282, + "loss": 0.7845, + "num_input_tokens_seen": 40414928, + "step": 69600 + }, + { + "epoch": 10.367143282692881, + "grad_norm": 0.050048828125, + "learning_rate": 0.016653835718490828, + "loss": 0.7835, + "num_input_tokens_seen": 40418160, + "step": 69605 + }, + { + "epoch": 10.36788799523384, + "grad_norm": 0.04443359375, + "learning_rate": 0.016651897938192495, + "loss": 0.7881, + "num_input_tokens_seen": 40421392, + "step": 69610 + }, + { + "epoch": 10.368632707774799, + "grad_norm": 0.06005859375, + "learning_rate": 0.016649960129987026, + "loss": 0.8003, + "num_input_tokens_seen": 40424496, + "step": 69615 + }, + { + "epoch": 10.369377420315757, + "grad_norm": 0.0294189453125, + "learning_rate": 0.016648022293907157, + "loss": 0.7985, + "num_input_tokens_seen": 40427056, + "step": 69620 + }, + { + "epoch": 10.370122132856718, + "grad_norm": 0.045654296875, + "learning_rate": 0.016646084429985632, + "loss": 0.8122, + "num_input_tokens_seen": 40430000, + "step": 69625 + }, + { + "epoch": 10.370866845397677, + "grad_norm": 0.032470703125, + "learning_rate": 0.01664414653825518, + "loss": 0.7835, + "num_input_tokens_seen": 40432976, + "step": 69630 + }, + { + "epoch": 10.371611557938635, + "grad_norm": 0.0306396484375, + "learning_rate": 0.01664220861874854, + "loss": 0.7836, + "num_input_tokens_seen": 40436016, + "step": 69635 + }, + { + "epoch": 10.372356270479594, + "grad_norm": 0.05810546875, + "learning_rate": 0.01664027067149846, + "loss": 0.7934, + "num_input_tokens_seen": 40438992, + "step": 69640 + }, + { + "epoch": 10.373100983020555, + "grad_norm": 0.0654296875, + "learning_rate": 0.01663833269653767, + "loss": 0.8012, + "num_input_tokens_seen": 40441872, + "step": 69645 + }, + { + "epoch": 10.373845695561513, + "grad_norm": 0.068359375, + "learning_rate": 0.016636394693898916, + "loss": 0.7596, + "num_input_tokens_seen": 40444752, + "step": 69650 + }, + { + "epoch": 10.374590408102472, + "grad_norm": 0.041748046875, + "learning_rate": 0.016634456663614936, + "loss": 0.7943, + "num_input_tokens_seen": 40447792, + "step": 69655 + }, + { + "epoch": 10.375335120643431, + "grad_norm": 0.044921875, + "learning_rate": 0.01663251860571847, + "loss": 0.824, + "num_input_tokens_seen": 40450480, + "step": 69660 + }, + { + "epoch": 10.376079833184392, + "grad_norm": 0.07373046875, + "learning_rate": 0.01663058052024227, + "loss": 0.8127, + "num_input_tokens_seen": 40453456, + "step": 69665 + }, + { + "epoch": 10.37682454572535, + "grad_norm": 0.021484375, + "learning_rate": 0.01662864240721906, + "loss": 0.7645, + "num_input_tokens_seen": 40456432, + "step": 69670 + }, + { + "epoch": 10.377569258266309, + "grad_norm": 0.044189453125, + "learning_rate": 0.016626704266681593, + "loss": 0.8354, + "num_input_tokens_seen": 40459728, + "step": 69675 + }, + { + "epoch": 10.378313970807268, + "grad_norm": 0.0439453125, + "learning_rate": 0.016624766098662612, + "loss": 0.7927, + "num_input_tokens_seen": 40462640, + "step": 69680 + }, + { + "epoch": 10.379058683348228, + "grad_norm": 0.0228271484375, + "learning_rate": 0.016622827903194862, + "loss": 0.8011, + "num_input_tokens_seen": 40465200, + "step": 69685 + }, + { + "epoch": 10.379803395889187, + "grad_norm": 0.059814453125, + "learning_rate": 0.016620889680311085, + "loss": 0.7942, + "num_input_tokens_seen": 40468336, + "step": 69690 + }, + { + "epoch": 10.380548108430146, + "grad_norm": 0.02978515625, + "learning_rate": 0.016618951430044018, + "loss": 0.7922, + "num_input_tokens_seen": 40470992, + "step": 69695 + }, + { + "epoch": 10.381292820971105, + "grad_norm": 0.055419921875, + "learning_rate": 0.016617013152426415, + "loss": 0.7773, + "num_input_tokens_seen": 40474032, + "step": 69700 + }, + { + "epoch": 10.382037533512065, + "grad_norm": 0.08837890625, + "learning_rate": 0.01661507484749102, + "loss": 0.7879, + "num_input_tokens_seen": 40476688, + "step": 69705 + }, + { + "epoch": 10.382782246053024, + "grad_norm": 0.0303955078125, + "learning_rate": 0.016613136515270573, + "loss": 0.7995, + "num_input_tokens_seen": 40479216, + "step": 69710 + }, + { + "epoch": 10.383526958593983, + "grad_norm": 0.041259765625, + "learning_rate": 0.016611198155797827, + "loss": 0.8082, + "num_input_tokens_seen": 40481872, + "step": 69715 + }, + { + "epoch": 10.384271671134941, + "grad_norm": 0.0283203125, + "learning_rate": 0.01660925976910552, + "loss": 0.8057, + "num_input_tokens_seen": 40484528, + "step": 69720 + }, + { + "epoch": 10.3850163836759, + "grad_norm": 0.030029296875, + "learning_rate": 0.016607321355226416, + "loss": 0.8152, + "num_input_tokens_seen": 40487312, + "step": 69725 + }, + { + "epoch": 10.38576109621686, + "grad_norm": 0.045654296875, + "learning_rate": 0.016605382914193244, + "loss": 0.8156, + "num_input_tokens_seen": 40490000, + "step": 69730 + }, + { + "epoch": 10.38650580875782, + "grad_norm": 0.045166015625, + "learning_rate": 0.01660344444603876, + "loss": 0.7888, + "num_input_tokens_seen": 40493136, + "step": 69735 + }, + { + "epoch": 10.387250521298778, + "grad_norm": 0.0703125, + "learning_rate": 0.016601505950795712, + "loss": 0.7911, + "num_input_tokens_seen": 40496048, + "step": 69740 + }, + { + "epoch": 10.387995233839739, + "grad_norm": 0.06591796875, + "learning_rate": 0.016599567428496845, + "loss": 0.8054, + "num_input_tokens_seen": 40499120, + "step": 69745 + }, + { + "epoch": 10.388739946380698, + "grad_norm": 0.0238037109375, + "learning_rate": 0.01659762887917492, + "loss": 0.7998, + "num_input_tokens_seen": 40502160, + "step": 69750 + }, + { + "epoch": 10.389484658921656, + "grad_norm": 0.035888671875, + "learning_rate": 0.016595690302862672, + "loss": 0.817, + "num_input_tokens_seen": 40505168, + "step": 69755 + }, + { + "epoch": 10.390229371462615, + "grad_norm": 0.041748046875, + "learning_rate": 0.01659375169959286, + "loss": 0.8185, + "num_input_tokens_seen": 40507728, + "step": 69760 + }, + { + "epoch": 10.390974084003574, + "grad_norm": 0.04296875, + "learning_rate": 0.016591813069398234, + "loss": 0.7766, + "num_input_tokens_seen": 40510896, + "step": 69765 + }, + { + "epoch": 10.391718796544534, + "grad_norm": 0.0390625, + "learning_rate": 0.016589874412311547, + "loss": 0.8076, + "num_input_tokens_seen": 40513456, + "step": 69770 + }, + { + "epoch": 10.392463509085493, + "grad_norm": 0.053955078125, + "learning_rate": 0.016587935728365543, + "loss": 0.8019, + "num_input_tokens_seen": 40516240, + "step": 69775 + }, + { + "epoch": 10.393208221626452, + "grad_norm": 0.04248046875, + "learning_rate": 0.01658599701759298, + "loss": 0.781, + "num_input_tokens_seen": 40519120, + "step": 69780 + }, + { + "epoch": 10.39395293416741, + "grad_norm": 0.05078125, + "learning_rate": 0.016584058280026605, + "loss": 0.7621, + "num_input_tokens_seen": 40522128, + "step": 69785 + }, + { + "epoch": 10.394697646708371, + "grad_norm": 0.03173828125, + "learning_rate": 0.016582119515699184, + "loss": 0.8091, + "num_input_tokens_seen": 40524880, + "step": 69790 + }, + { + "epoch": 10.39544235924933, + "grad_norm": 0.060302734375, + "learning_rate": 0.016580180724643456, + "loss": 0.7804, + "num_input_tokens_seen": 40527600, + "step": 69795 + }, + { + "epoch": 10.396187071790289, + "grad_norm": 0.038330078125, + "learning_rate": 0.016578241906892183, + "loss": 0.818, + "num_input_tokens_seen": 40530704, + "step": 69800 + }, + { + "epoch": 10.396931784331247, + "grad_norm": 0.0294189453125, + "learning_rate": 0.01657630306247812, + "loss": 0.8034, + "num_input_tokens_seen": 40533648, + "step": 69805 + }, + { + "epoch": 10.397676496872208, + "grad_norm": 0.0458984375, + "learning_rate": 0.016574364191434012, + "loss": 0.8024, + "num_input_tokens_seen": 40536528, + "step": 69810 + }, + { + "epoch": 10.398421209413167, + "grad_norm": 0.03466796875, + "learning_rate": 0.016572425293792624, + "loss": 0.8139, + "num_input_tokens_seen": 40539504, + "step": 69815 + }, + { + "epoch": 10.399165921954125, + "grad_norm": 0.046142578125, + "learning_rate": 0.016570486369586707, + "loss": 0.7856, + "num_input_tokens_seen": 40542416, + "step": 69820 + }, + { + "epoch": 10.399910634495084, + "grad_norm": 0.046142578125, + "learning_rate": 0.01656854741884902, + "loss": 0.8052, + "num_input_tokens_seen": 40545392, + "step": 69825 + }, + { + "epoch": 10.400655347036045, + "grad_norm": 0.056396484375, + "learning_rate": 0.016566608441612323, + "loss": 0.8261, + "num_input_tokens_seen": 40548432, + "step": 69830 + }, + { + "epoch": 10.401400059577004, + "grad_norm": 0.038818359375, + "learning_rate": 0.016564669437909362, + "loss": 0.7999, + "num_input_tokens_seen": 40551696, + "step": 69835 + }, + { + "epoch": 10.402144772117962, + "grad_norm": 0.06298828125, + "learning_rate": 0.016562730407772903, + "loss": 0.8129, + "num_input_tokens_seen": 40554576, + "step": 69840 + }, + { + "epoch": 10.402889484658921, + "grad_norm": 0.04931640625, + "learning_rate": 0.01656079135123571, + "loss": 1.0187, + "num_input_tokens_seen": 40557552, + "step": 69845 + }, + { + "epoch": 10.403634197199882, + "grad_norm": 0.03173828125, + "learning_rate": 0.016558852268330523, + "loss": 0.8113, + "num_input_tokens_seen": 40560464, + "step": 69850 + }, + { + "epoch": 10.40437890974084, + "grad_norm": 0.041748046875, + "learning_rate": 0.016556913159090116, + "loss": 0.8073, + "num_input_tokens_seen": 40563280, + "step": 69855 + }, + { + "epoch": 10.405123622281799, + "grad_norm": 0.051513671875, + "learning_rate": 0.01655497402354724, + "loss": 0.8178, + "num_input_tokens_seen": 40566128, + "step": 69860 + }, + { + "epoch": 10.405868334822758, + "grad_norm": 0.0283203125, + "learning_rate": 0.016553034861734664, + "loss": 0.8049, + "num_input_tokens_seen": 40568816, + "step": 69865 + }, + { + "epoch": 10.406613047363718, + "grad_norm": 0.0341796875, + "learning_rate": 0.016551095673685137, + "loss": 0.8073, + "num_input_tokens_seen": 40571568, + "step": 69870 + }, + { + "epoch": 10.407357759904677, + "grad_norm": 0.028564453125, + "learning_rate": 0.016549156459431426, + "loss": 0.7954, + "num_input_tokens_seen": 40574128, + "step": 69875 + }, + { + "epoch": 10.408102472445636, + "grad_norm": 0.041015625, + "learning_rate": 0.016547217219006292, + "loss": 0.8024, + "num_input_tokens_seen": 40577072, + "step": 69880 + }, + { + "epoch": 10.408847184986595, + "grad_norm": 0.05126953125, + "learning_rate": 0.016545277952442497, + "loss": 0.7997, + "num_input_tokens_seen": 40580528, + "step": 69885 + }, + { + "epoch": 10.409591897527555, + "grad_norm": 0.031982421875, + "learning_rate": 0.0165433386597728, + "loss": 0.8176, + "num_input_tokens_seen": 40583408, + "step": 69890 + }, + { + "epoch": 10.410336610068514, + "grad_norm": 0.04296875, + "learning_rate": 0.016541399341029966, + "loss": 0.7999, + "num_input_tokens_seen": 40586384, + "step": 69895 + }, + { + "epoch": 10.411081322609473, + "grad_norm": 0.033203125, + "learning_rate": 0.016539459996246758, + "loss": 0.787, + "num_input_tokens_seen": 40589264, + "step": 69900 + }, + { + "epoch": 10.411826035150431, + "grad_norm": 0.060302734375, + "learning_rate": 0.016537520625455937, + "loss": 0.8051, + "num_input_tokens_seen": 40592016, + "step": 69905 + }, + { + "epoch": 10.41257074769139, + "grad_norm": 0.0255126953125, + "learning_rate": 0.016535581228690265, + "loss": 0.7865, + "num_input_tokens_seen": 40594512, + "step": 69910 + }, + { + "epoch": 10.41331546023235, + "grad_norm": 0.03369140625, + "learning_rate": 0.01653364180598251, + "loss": 0.8091, + "num_input_tokens_seen": 40597360, + "step": 69915 + }, + { + "epoch": 10.41406017277331, + "grad_norm": 0.0203857421875, + "learning_rate": 0.016531702357365435, + "loss": 0.8005, + "num_input_tokens_seen": 40600048, + "step": 69920 + }, + { + "epoch": 10.414804885314268, + "grad_norm": 0.037353515625, + "learning_rate": 0.016529762882871807, + "loss": 0.7837, + "num_input_tokens_seen": 40603216, + "step": 69925 + }, + { + "epoch": 10.415549597855227, + "grad_norm": 0.04833984375, + "learning_rate": 0.01652782338253439, + "loss": 0.801, + "num_input_tokens_seen": 40606064, + "step": 69930 + }, + { + "epoch": 10.416294310396188, + "grad_norm": 0.0262451171875, + "learning_rate": 0.016525883856385953, + "loss": 0.7967, + "num_input_tokens_seen": 40608880, + "step": 69935 + }, + { + "epoch": 10.417039022937146, + "grad_norm": 0.03662109375, + "learning_rate": 0.01652394430445925, + "loss": 0.7751, + "num_input_tokens_seen": 40611472, + "step": 69940 + }, + { + "epoch": 10.417783735478105, + "grad_norm": 0.0206298828125, + "learning_rate": 0.016522004726787064, + "loss": 0.8267, + "num_input_tokens_seen": 40614128, + "step": 69945 + }, + { + "epoch": 10.418528448019064, + "grad_norm": 0.03271484375, + "learning_rate": 0.016520065123402154, + "loss": 0.794, + "num_input_tokens_seen": 40616848, + "step": 69950 + }, + { + "epoch": 10.419273160560024, + "grad_norm": 0.072265625, + "learning_rate": 0.016518125494337284, + "loss": 0.8157, + "num_input_tokens_seen": 40619728, + "step": 69955 + }, + { + "epoch": 10.420017873100983, + "grad_norm": 0.0341796875, + "learning_rate": 0.016516185839625233, + "loss": 0.8319, + "num_input_tokens_seen": 40622608, + "step": 69960 + }, + { + "epoch": 10.420762585641942, + "grad_norm": 0.0274658203125, + "learning_rate": 0.01651424615929876, + "loss": 0.7923, + "num_input_tokens_seen": 40625648, + "step": 69965 + }, + { + "epoch": 10.4215072981829, + "grad_norm": 0.04248046875, + "learning_rate": 0.01651230645339064, + "loss": 0.8062, + "num_input_tokens_seen": 40628688, + "step": 69970 + }, + { + "epoch": 10.422252010723861, + "grad_norm": 0.0283203125, + "learning_rate": 0.016510366721933636, + "loss": 0.7873, + "num_input_tokens_seen": 40631632, + "step": 69975 + }, + { + "epoch": 10.42299672326482, + "grad_norm": 0.0439453125, + "learning_rate": 0.016508426964960524, + "loss": 0.7973, + "num_input_tokens_seen": 40634544, + "step": 69980 + }, + { + "epoch": 10.423741435805779, + "grad_norm": 0.03125, + "learning_rate": 0.016506487182504068, + "loss": 0.8049, + "num_input_tokens_seen": 40637520, + "step": 69985 + }, + { + "epoch": 10.424486148346737, + "grad_norm": 0.0218505859375, + "learning_rate": 0.016504547374597046, + "loss": 0.8101, + "num_input_tokens_seen": 40640304, + "step": 69990 + }, + { + "epoch": 10.425230860887698, + "grad_norm": 0.0281982421875, + "learning_rate": 0.016502607541272224, + "loss": 0.7849, + "num_input_tokens_seen": 40643024, + "step": 69995 + }, + { + "epoch": 10.425975573428657, + "grad_norm": 0.039794921875, + "learning_rate": 0.01650066768256237, + "loss": 0.8092, + "num_input_tokens_seen": 40645840, + "step": 70000 + }, + { + "epoch": 10.426720285969616, + "grad_norm": 0.0228271484375, + "learning_rate": 0.01649872779850027, + "loss": 0.7934, + "num_input_tokens_seen": 40648688, + "step": 70005 + }, + { + "epoch": 10.427464998510574, + "grad_norm": 0.02978515625, + "learning_rate": 0.01649678788911868, + "loss": 0.8036, + "num_input_tokens_seen": 40651632, + "step": 70010 + }, + { + "epoch": 10.428209711051535, + "grad_norm": 0.02587890625, + "learning_rate": 0.016494847954450383, + "loss": 0.8073, + "num_input_tokens_seen": 40654480, + "step": 70015 + }, + { + "epoch": 10.428954423592494, + "grad_norm": 0.0274658203125, + "learning_rate": 0.01649290799452815, + "loss": 0.8018, + "num_input_tokens_seen": 40657392, + "step": 70020 + }, + { + "epoch": 10.429699136133452, + "grad_norm": 0.03076171875, + "learning_rate": 0.016490968009384748, + "loss": 0.8184, + "num_input_tokens_seen": 40660144, + "step": 70025 + }, + { + "epoch": 10.430443848674411, + "grad_norm": 0.022705078125, + "learning_rate": 0.016489027999052965, + "loss": 0.8124, + "num_input_tokens_seen": 40662992, + "step": 70030 + }, + { + "epoch": 10.431188561215372, + "grad_norm": 0.028076171875, + "learning_rate": 0.01648708796356556, + "loss": 0.8163, + "num_input_tokens_seen": 40665648, + "step": 70035 + }, + { + "epoch": 10.43193327375633, + "grad_norm": 0.02978515625, + "learning_rate": 0.01648514790295532, + "loss": 0.8036, + "num_input_tokens_seen": 40668464, + "step": 70040 + }, + { + "epoch": 10.43267798629729, + "grad_norm": 0.033447265625, + "learning_rate": 0.016483207817255017, + "loss": 0.8077, + "num_input_tokens_seen": 40671600, + "step": 70045 + }, + { + "epoch": 10.433422698838248, + "grad_norm": 0.036376953125, + "learning_rate": 0.01648126770649742, + "loss": 0.8095, + "num_input_tokens_seen": 40674416, + "step": 70050 + }, + { + "epoch": 10.434167411379208, + "grad_norm": 0.03369140625, + "learning_rate": 0.016479327570715314, + "loss": 0.816, + "num_input_tokens_seen": 40677584, + "step": 70055 + }, + { + "epoch": 10.434912123920167, + "grad_norm": 0.033203125, + "learning_rate": 0.01647738740994147, + "loss": 0.7958, + "num_input_tokens_seen": 40680432, + "step": 70060 + }, + { + "epoch": 10.435656836461126, + "grad_norm": 0.03662109375, + "learning_rate": 0.01647544722420867, + "loss": 0.7934, + "num_input_tokens_seen": 40683120, + "step": 70065 + }, + { + "epoch": 10.436401549002085, + "grad_norm": 0.0439453125, + "learning_rate": 0.016473507013549684, + "loss": 0.7975, + "num_input_tokens_seen": 40686032, + "step": 70070 + }, + { + "epoch": 10.437146261543045, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0164715667779973, + "loss": 0.8137, + "num_input_tokens_seen": 40688912, + "step": 70075 + }, + { + "epoch": 10.437890974084004, + "grad_norm": 0.033935546875, + "learning_rate": 0.016469626517584286, + "loss": 0.7909, + "num_input_tokens_seen": 40691824, + "step": 70080 + }, + { + "epoch": 10.438635686624963, + "grad_norm": 0.0400390625, + "learning_rate": 0.01646768623234343, + "loss": 0.8105, + "num_input_tokens_seen": 40694992, + "step": 70085 + }, + { + "epoch": 10.439380399165922, + "grad_norm": 0.062255859375, + "learning_rate": 0.016465745922307505, + "loss": 0.8032, + "num_input_tokens_seen": 40697936, + "step": 70090 + }, + { + "epoch": 10.44012511170688, + "grad_norm": 0.018798828125, + "learning_rate": 0.016463805587509293, + "loss": 0.8083, + "num_input_tokens_seen": 40700816, + "step": 70095 + }, + { + "epoch": 10.44086982424784, + "grad_norm": 0.01806640625, + "learning_rate": 0.016461865227981577, + "loss": 0.793, + "num_input_tokens_seen": 40703536, + "step": 70100 + }, + { + "epoch": 10.4416145367888, + "grad_norm": 0.01611328125, + "learning_rate": 0.016459924843757127, + "loss": 0.7929, + "num_input_tokens_seen": 40706416, + "step": 70105 + }, + { + "epoch": 10.442359249329758, + "grad_norm": 0.04150390625, + "learning_rate": 0.016457984434868733, + "loss": 0.813, + "num_input_tokens_seen": 40709168, + "step": 70110 + }, + { + "epoch": 10.443103961870717, + "grad_norm": 0.032470703125, + "learning_rate": 0.016456044001349174, + "loss": 0.7982, + "num_input_tokens_seen": 40712080, + "step": 70115 + }, + { + "epoch": 10.443848674411678, + "grad_norm": 0.028076171875, + "learning_rate": 0.01645410354323123, + "loss": 0.8084, + "num_input_tokens_seen": 40714864, + "step": 70120 + }, + { + "epoch": 10.444593386952636, + "grad_norm": 0.035888671875, + "learning_rate": 0.016452163060547687, + "loss": 0.8295, + "num_input_tokens_seen": 40717936, + "step": 70125 + }, + { + "epoch": 10.445338099493595, + "grad_norm": 0.0225830078125, + "learning_rate": 0.016450222553331322, + "loss": 0.7878, + "num_input_tokens_seen": 40720752, + "step": 70130 + }, + { + "epoch": 10.446082812034554, + "grad_norm": 0.0157470703125, + "learning_rate": 0.016448282021614927, + "loss": 0.788, + "num_input_tokens_seen": 40723440, + "step": 70135 + }, + { + "epoch": 10.446827524575514, + "grad_norm": 0.04248046875, + "learning_rate": 0.016446341465431273, + "loss": 0.8083, + "num_input_tokens_seen": 40726608, + "step": 70140 + }, + { + "epoch": 10.447572237116473, + "grad_norm": 0.033447265625, + "learning_rate": 0.01644440088481315, + "loss": 0.8038, + "num_input_tokens_seen": 40729488, + "step": 70145 + }, + { + "epoch": 10.448316949657432, + "grad_norm": 0.039306640625, + "learning_rate": 0.016442460279793344, + "loss": 0.8077, + "num_input_tokens_seen": 40732208, + "step": 70150 + }, + { + "epoch": 10.44906166219839, + "grad_norm": 0.0517578125, + "learning_rate": 0.016440519650404637, + "loss": 0.7924, + "num_input_tokens_seen": 40735216, + "step": 70155 + }, + { + "epoch": 10.449806374739351, + "grad_norm": 0.0177001953125, + "learning_rate": 0.016438578996679817, + "loss": 0.7943, + "num_input_tokens_seen": 40738192, + "step": 70160 + }, + { + "epoch": 10.45055108728031, + "grad_norm": 0.01953125, + "learning_rate": 0.01643663831865167, + "loss": 0.8077, + "num_input_tokens_seen": 40740944, + "step": 70165 + }, + { + "epoch": 10.451295799821269, + "grad_norm": 0.0296630859375, + "learning_rate": 0.016434697616352974, + "loss": 0.7948, + "num_input_tokens_seen": 40743632, + "step": 70170 + }, + { + "epoch": 10.452040512362228, + "grad_norm": 0.061279296875, + "learning_rate": 0.01643275688981652, + "loss": 0.8015, + "num_input_tokens_seen": 40746544, + "step": 70175 + }, + { + "epoch": 10.452785224903188, + "grad_norm": 0.031982421875, + "learning_rate": 0.01643081613907509, + "loss": 0.8093, + "num_input_tokens_seen": 40749360, + "step": 70180 + }, + { + "epoch": 10.453529937444147, + "grad_norm": 0.0341796875, + "learning_rate": 0.016428875364161478, + "loss": 0.7992, + "num_input_tokens_seen": 40752272, + "step": 70185 + }, + { + "epoch": 10.454274649985106, + "grad_norm": 0.028564453125, + "learning_rate": 0.016426934565108475, + "loss": 0.8005, + "num_input_tokens_seen": 40755024, + "step": 70190 + }, + { + "epoch": 10.455019362526064, + "grad_norm": 0.0164794921875, + "learning_rate": 0.01642499374194886, + "loss": 0.7978, + "num_input_tokens_seen": 40758064, + "step": 70195 + }, + { + "epoch": 10.455764075067025, + "grad_norm": 0.03076171875, + "learning_rate": 0.01642305289471542, + "loss": 0.8106, + "num_input_tokens_seen": 40760784, + "step": 70200 + }, + { + "epoch": 10.456508787607984, + "grad_norm": 0.040771484375, + "learning_rate": 0.016421112023440952, + "loss": 0.7973, + "num_input_tokens_seen": 40763504, + "step": 70205 + }, + { + "epoch": 10.457253500148942, + "grad_norm": 0.033203125, + "learning_rate": 0.01641917112815824, + "loss": 0.7798, + "num_input_tokens_seen": 40766224, + "step": 70210 + }, + { + "epoch": 10.457998212689901, + "grad_norm": 0.031982421875, + "learning_rate": 0.016417230208900072, + "loss": 0.8003, + "num_input_tokens_seen": 40769200, + "step": 70215 + }, + { + "epoch": 10.458742925230862, + "grad_norm": 0.0286865234375, + "learning_rate": 0.01641528926569924, + "loss": 0.7941, + "num_input_tokens_seen": 40772240, + "step": 70220 + }, + { + "epoch": 10.45948763777182, + "grad_norm": 0.06005859375, + "learning_rate": 0.016413348298588534, + "loss": 0.7934, + "num_input_tokens_seen": 40775344, + "step": 70225 + }, + { + "epoch": 10.46023235031278, + "grad_norm": 0.031494140625, + "learning_rate": 0.01641140730760075, + "loss": 0.7858, + "num_input_tokens_seen": 40778128, + "step": 70230 + }, + { + "epoch": 10.460977062853738, + "grad_norm": 0.037353515625, + "learning_rate": 0.01640946629276867, + "loss": 0.8001, + "num_input_tokens_seen": 40780976, + "step": 70235 + }, + { + "epoch": 10.461721775394698, + "grad_norm": 0.03173828125, + "learning_rate": 0.016407525254125088, + "loss": 0.7796, + "num_input_tokens_seen": 40783824, + "step": 70240 + }, + { + "epoch": 10.462466487935657, + "grad_norm": 0.0277099609375, + "learning_rate": 0.016405584191702804, + "loss": 0.7893, + "num_input_tokens_seen": 40786416, + "step": 70245 + }, + { + "epoch": 10.463211200476616, + "grad_norm": 0.044189453125, + "learning_rate": 0.016403643105534597, + "loss": 0.7772, + "num_input_tokens_seen": 40789200, + "step": 70250 + }, + { + "epoch": 10.463955913017575, + "grad_norm": 0.03271484375, + "learning_rate": 0.01640170199565327, + "loss": 0.7868, + "num_input_tokens_seen": 40792240, + "step": 70255 + }, + { + "epoch": 10.464700625558535, + "grad_norm": 0.030517578125, + "learning_rate": 0.01639976086209161, + "loss": 0.7738, + "num_input_tokens_seen": 40795088, + "step": 70260 + }, + { + "epoch": 10.465445338099494, + "grad_norm": 0.035400390625, + "learning_rate": 0.016397819704882417, + "loss": 0.783, + "num_input_tokens_seen": 40798064, + "step": 70265 + }, + { + "epoch": 10.466190050640453, + "grad_norm": 0.027587890625, + "learning_rate": 0.016395878524058473, + "loss": 0.7923, + "num_input_tokens_seen": 40801104, + "step": 70270 + }, + { + "epoch": 10.466934763181412, + "grad_norm": 0.04345703125, + "learning_rate": 0.016393937319652586, + "loss": 0.7728, + "num_input_tokens_seen": 40803600, + "step": 70275 + }, + { + "epoch": 10.46767947572237, + "grad_norm": 0.027587890625, + "learning_rate": 0.01639199609169754, + "loss": 0.7849, + "num_input_tokens_seen": 40806320, + "step": 70280 + }, + { + "epoch": 10.46842418826333, + "grad_norm": 0.060546875, + "learning_rate": 0.01639005484022614, + "loss": 0.7951, + "num_input_tokens_seen": 40809200, + "step": 70285 + }, + { + "epoch": 10.46916890080429, + "grad_norm": 0.0279541015625, + "learning_rate": 0.016388113565271176, + "loss": 0.7932, + "num_input_tokens_seen": 40812144, + "step": 70290 + }, + { + "epoch": 10.469913613345248, + "grad_norm": 0.06103515625, + "learning_rate": 0.016386172266865444, + "loss": 0.8129, + "num_input_tokens_seen": 40815152, + "step": 70295 + }, + { + "epoch": 10.470658325886207, + "grad_norm": 0.0294189453125, + "learning_rate": 0.01638423094504174, + "loss": 0.8525, + "num_input_tokens_seen": 40817936, + "step": 70300 + }, + { + "epoch": 10.471403038427168, + "grad_norm": 0.05078125, + "learning_rate": 0.016382289599832858, + "loss": 0.8147, + "num_input_tokens_seen": 40820688, + "step": 70305 + }, + { + "epoch": 10.472147750968126, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0163803482312716, + "loss": 0.8052, + "num_input_tokens_seen": 40823728, + "step": 70310 + }, + { + "epoch": 10.472892463509085, + "grad_norm": 0.023681640625, + "learning_rate": 0.01637840683939076, + "loss": 0.8115, + "num_input_tokens_seen": 40826544, + "step": 70315 + }, + { + "epoch": 10.473637176050044, + "grad_norm": 0.027587890625, + "learning_rate": 0.01637646542422314, + "loss": 0.7935, + "num_input_tokens_seen": 40829488, + "step": 70320 + }, + { + "epoch": 10.474381888591004, + "grad_norm": 0.0224609375, + "learning_rate": 0.016374523985801536, + "loss": 0.8061, + "num_input_tokens_seen": 40832368, + "step": 70325 + }, + { + "epoch": 10.475126601131963, + "grad_norm": 0.0308837890625, + "learning_rate": 0.016372582524158748, + "loss": 0.7988, + "num_input_tokens_seen": 40835312, + "step": 70330 + }, + { + "epoch": 10.475871313672922, + "grad_norm": 0.0279541015625, + "learning_rate": 0.016370641039327567, + "loss": 0.8127, + "num_input_tokens_seen": 40838064, + "step": 70335 + }, + { + "epoch": 10.47661602621388, + "grad_norm": 0.0240478515625, + "learning_rate": 0.0163686995313408, + "loss": 0.7986, + "num_input_tokens_seen": 40840816, + "step": 70340 + }, + { + "epoch": 10.477360738754841, + "grad_norm": 0.03076171875, + "learning_rate": 0.016366758000231248, + "loss": 0.7849, + "num_input_tokens_seen": 40843472, + "step": 70345 + }, + { + "epoch": 10.4781054512958, + "grad_norm": 0.037841796875, + "learning_rate": 0.01636481644603171, + "loss": 0.8012, + "num_input_tokens_seen": 40846320, + "step": 70350 + }, + { + "epoch": 10.478850163836759, + "grad_norm": 0.0303955078125, + "learning_rate": 0.016362874868774982, + "loss": 0.8294, + "num_input_tokens_seen": 40849296, + "step": 70355 + }, + { + "epoch": 10.479594876377718, + "grad_norm": 0.0380859375, + "learning_rate": 0.01636093326849387, + "loss": 0.7902, + "num_input_tokens_seen": 40852048, + "step": 70360 + }, + { + "epoch": 10.480339588918678, + "grad_norm": 0.03173828125, + "learning_rate": 0.016358991645221175, + "loss": 0.8299, + "num_input_tokens_seen": 40855184, + "step": 70365 + }, + { + "epoch": 10.481084301459637, + "grad_norm": 0.02783203125, + "learning_rate": 0.016357049998989695, + "loss": 0.7784, + "num_input_tokens_seen": 40858032, + "step": 70370 + }, + { + "epoch": 10.481829014000596, + "grad_norm": 0.062255859375, + "learning_rate": 0.016355108329832235, + "loss": 0.8243, + "num_input_tokens_seen": 40861008, + "step": 70375 + }, + { + "epoch": 10.482573726541554, + "grad_norm": 0.048583984375, + "learning_rate": 0.016353166637781597, + "loss": 0.8195, + "num_input_tokens_seen": 40863760, + "step": 70380 + }, + { + "epoch": 10.483318439082515, + "grad_norm": 0.06787109375, + "learning_rate": 0.016351224922870583, + "loss": 0.7924, + "num_input_tokens_seen": 40866544, + "step": 70385 + }, + { + "epoch": 10.484063151623474, + "grad_norm": 0.1875, + "learning_rate": 0.016349283185132, + "loss": 0.8266, + "num_input_tokens_seen": 40869232, + "step": 70390 + }, + { + "epoch": 10.484807864164432, + "grad_norm": 0.06494140625, + "learning_rate": 0.01634734142459865, + "loss": 0.8159, + "num_input_tokens_seen": 40872112, + "step": 70395 + }, + { + "epoch": 10.485552576705391, + "grad_norm": 0.048583984375, + "learning_rate": 0.01634539964130333, + "loss": 0.8044, + "num_input_tokens_seen": 40874640, + "step": 70400 + }, + { + "epoch": 10.486297289246352, + "grad_norm": 0.0284423828125, + "learning_rate": 0.016343457835278852, + "loss": 0.7901, + "num_input_tokens_seen": 40877776, + "step": 70405 + }, + { + "epoch": 10.48704200178731, + "grad_norm": 0.03955078125, + "learning_rate": 0.016341516006558025, + "loss": 0.8103, + "num_input_tokens_seen": 40880304, + "step": 70410 + }, + { + "epoch": 10.48778671432827, + "grad_norm": 0.05322265625, + "learning_rate": 0.016339574155173644, + "loss": 0.7938, + "num_input_tokens_seen": 40883248, + "step": 70415 + }, + { + "epoch": 10.488531426869228, + "grad_norm": 0.06005859375, + "learning_rate": 0.016337632281158518, + "loss": 0.807, + "num_input_tokens_seen": 40886576, + "step": 70420 + }, + { + "epoch": 10.489276139410187, + "grad_norm": 0.03857421875, + "learning_rate": 0.016335690384545454, + "loss": 0.8164, + "num_input_tokens_seen": 40889680, + "step": 70425 + }, + { + "epoch": 10.490020851951147, + "grad_norm": 0.0203857421875, + "learning_rate": 0.016333748465367264, + "loss": 0.8111, + "num_input_tokens_seen": 40892272, + "step": 70430 + }, + { + "epoch": 10.490765564492106, + "grad_norm": 0.01953125, + "learning_rate": 0.016331806523656747, + "loss": 0.7921, + "num_input_tokens_seen": 40894928, + "step": 70435 + }, + { + "epoch": 10.491510277033065, + "grad_norm": 0.01611328125, + "learning_rate": 0.016329864559446707, + "loss": 0.7913, + "num_input_tokens_seen": 40897776, + "step": 70440 + }, + { + "epoch": 10.492254989574024, + "grad_norm": 0.03564453125, + "learning_rate": 0.01632792257276996, + "loss": 0.7943, + "num_input_tokens_seen": 40900560, + "step": 70445 + }, + { + "epoch": 10.492999702114984, + "grad_norm": 0.0302734375, + "learning_rate": 0.016325980563659317, + "loss": 0.7914, + "num_input_tokens_seen": 40903728, + "step": 70450 + }, + { + "epoch": 10.493744414655943, + "grad_norm": 0.03857421875, + "learning_rate": 0.01632403853214757, + "loss": 0.7603, + "num_input_tokens_seen": 40906512, + "step": 70455 + }, + { + "epoch": 10.494489127196902, + "grad_norm": 0.0478515625, + "learning_rate": 0.01632209647826754, + "loss": 0.8183, + "num_input_tokens_seen": 40909680, + "step": 70460 + }, + { + "epoch": 10.49523383973786, + "grad_norm": 0.051025390625, + "learning_rate": 0.01632015440205204, + "loss": 0.816, + "num_input_tokens_seen": 40912528, + "step": 70465 + }, + { + "epoch": 10.495978552278821, + "grad_norm": 0.033203125, + "learning_rate": 0.01631821230353387, + "loss": 0.7842, + "num_input_tokens_seen": 40915728, + "step": 70470 + }, + { + "epoch": 10.49672326481978, + "grad_norm": 0.0390625, + "learning_rate": 0.01631627018274584, + "loss": 0.832, + "num_input_tokens_seen": 40918416, + "step": 70475 + }, + { + "epoch": 10.497467977360738, + "grad_norm": 0.034912109375, + "learning_rate": 0.016314328039720764, + "loss": 0.7838, + "num_input_tokens_seen": 40921680, + "step": 70480 + }, + { + "epoch": 10.498212689901697, + "grad_norm": 0.0235595703125, + "learning_rate": 0.016312385874491454, + "loss": 0.7936, + "num_input_tokens_seen": 40924592, + "step": 70485 + }, + { + "epoch": 10.498957402442658, + "grad_norm": 0.0478515625, + "learning_rate": 0.01631044368709072, + "loss": 0.7556, + "num_input_tokens_seen": 40927376, + "step": 70490 + }, + { + "epoch": 10.499702114983616, + "grad_norm": 0.027587890625, + "learning_rate": 0.01630850147755137, + "loss": 0.8158, + "num_input_tokens_seen": 40930576, + "step": 70495 + }, + { + "epoch": 10.500446827524575, + "grad_norm": 0.03125, + "learning_rate": 0.016306559245906217, + "loss": 0.803, + "num_input_tokens_seen": 40933264, + "step": 70500 + }, + { + "epoch": 10.501191540065534, + "grad_norm": 0.0576171875, + "learning_rate": 0.016304616992188072, + "loss": 0.8372, + "num_input_tokens_seen": 40936208, + "step": 70505 + }, + { + "epoch": 10.501936252606495, + "grad_norm": 0.031494140625, + "learning_rate": 0.01630267471642975, + "loss": 0.8169, + "num_input_tokens_seen": 40938960, + "step": 70510 + }, + { + "epoch": 10.502680965147453, + "grad_norm": 0.03369140625, + "learning_rate": 0.01630073241866406, + "loss": 0.8335, + "num_input_tokens_seen": 40941744, + "step": 70515 + }, + { + "epoch": 10.503425677688412, + "grad_norm": 0.0419921875, + "learning_rate": 0.01629879009892382, + "loss": 0.8198, + "num_input_tokens_seen": 40944560, + "step": 70520 + }, + { + "epoch": 10.50417039022937, + "grad_norm": 0.032470703125, + "learning_rate": 0.016296847757241842, + "loss": 0.8019, + "num_input_tokens_seen": 40947280, + "step": 70525 + }, + { + "epoch": 10.504915102770331, + "grad_norm": 0.0439453125, + "learning_rate": 0.016294905393650944, + "loss": 0.8057, + "num_input_tokens_seen": 40949872, + "step": 70530 + }, + { + "epoch": 10.50565981531129, + "grad_norm": 0.023681640625, + "learning_rate": 0.016292963008183926, + "loss": 0.805, + "num_input_tokens_seen": 40952912, + "step": 70535 + }, + { + "epoch": 10.506404527852249, + "grad_norm": 0.0228271484375, + "learning_rate": 0.016291020600873618, + "loss": 0.8054, + "num_input_tokens_seen": 40955728, + "step": 70540 + }, + { + "epoch": 10.507149240393208, + "grad_norm": 0.03955078125, + "learning_rate": 0.016289078171752826, + "loss": 0.8337, + "num_input_tokens_seen": 40958736, + "step": 70545 + }, + { + "epoch": 10.507893952934168, + "grad_norm": 0.04638671875, + "learning_rate": 0.01628713572085437, + "loss": 0.7993, + "num_input_tokens_seen": 40961328, + "step": 70550 + }, + { + "epoch": 10.508638665475127, + "grad_norm": 0.0341796875, + "learning_rate": 0.016285193248211063, + "loss": 0.7956, + "num_input_tokens_seen": 40964016, + "step": 70555 + }, + { + "epoch": 10.509383378016086, + "grad_norm": 0.02783203125, + "learning_rate": 0.016283250753855723, + "loss": 0.8068, + "num_input_tokens_seen": 40966928, + "step": 70560 + }, + { + "epoch": 10.510128090557044, + "grad_norm": 0.0303955078125, + "learning_rate": 0.01628130823782117, + "loss": 0.7972, + "num_input_tokens_seen": 40969904, + "step": 70565 + }, + { + "epoch": 10.510872803098005, + "grad_norm": 0.038818359375, + "learning_rate": 0.016279365700140214, + "loss": 0.8076, + "num_input_tokens_seen": 40972976, + "step": 70570 + }, + { + "epoch": 10.511617515638964, + "grad_norm": 0.04345703125, + "learning_rate": 0.01627742314084567, + "loss": 0.7913, + "num_input_tokens_seen": 40975952, + "step": 70575 + }, + { + "epoch": 10.512362228179922, + "grad_norm": 0.0277099609375, + "learning_rate": 0.016275480559970364, + "loss": 0.8057, + "num_input_tokens_seen": 40978832, + "step": 70580 + }, + { + "epoch": 10.513106940720881, + "grad_norm": 0.036376953125, + "learning_rate": 0.01627353795754711, + "loss": 0.7958, + "num_input_tokens_seen": 40981744, + "step": 70585 + }, + { + "epoch": 10.513851653261842, + "grad_norm": 0.018310546875, + "learning_rate": 0.016271595333608727, + "loss": 0.8071, + "num_input_tokens_seen": 40984496, + "step": 70590 + }, + { + "epoch": 10.5145963658028, + "grad_norm": 0.03955078125, + "learning_rate": 0.016269652688188033, + "loss": 0.796, + "num_input_tokens_seen": 40987248, + "step": 70595 + }, + { + "epoch": 10.51534107834376, + "grad_norm": 0.042724609375, + "learning_rate": 0.016267710021317848, + "loss": 0.8103, + "num_input_tokens_seen": 40990032, + "step": 70600 + }, + { + "epoch": 10.516085790884718, + "grad_norm": 0.03857421875, + "learning_rate": 0.01626576733303099, + "loss": 0.7972, + "num_input_tokens_seen": 40992976, + "step": 70605 + }, + { + "epoch": 10.516830503425677, + "grad_norm": 0.0264892578125, + "learning_rate": 0.01626382462336028, + "loss": 0.8067, + "num_input_tokens_seen": 40995952, + "step": 70610 + }, + { + "epoch": 10.517575215966637, + "grad_norm": 0.044921875, + "learning_rate": 0.016261881892338535, + "loss": 0.8032, + "num_input_tokens_seen": 40998608, + "step": 70615 + }, + { + "epoch": 10.518319928507596, + "grad_norm": 0.026123046875, + "learning_rate": 0.01625993913999858, + "loss": 0.7851, + "num_input_tokens_seen": 41001488, + "step": 70620 + }, + { + "epoch": 10.519064641048555, + "grad_norm": 0.0289306640625, + "learning_rate": 0.01625799636637323, + "loss": 0.7986, + "num_input_tokens_seen": 41004144, + "step": 70625 + }, + { + "epoch": 10.519809353589514, + "grad_norm": 0.0341796875, + "learning_rate": 0.016256053571495316, + "loss": 0.7963, + "num_input_tokens_seen": 41006960, + "step": 70630 + }, + { + "epoch": 10.520554066130474, + "grad_norm": 0.0311279296875, + "learning_rate": 0.01625411075539765, + "loss": 0.8034, + "num_input_tokens_seen": 41009840, + "step": 70635 + }, + { + "epoch": 10.521298778671433, + "grad_norm": 0.033935546875, + "learning_rate": 0.01625216791811306, + "loss": 0.7994, + "num_input_tokens_seen": 41012688, + "step": 70640 + }, + { + "epoch": 10.522043491212392, + "grad_norm": 0.0439453125, + "learning_rate": 0.016250225059674363, + "loss": 0.7839, + "num_input_tokens_seen": 41015472, + "step": 70645 + }, + { + "epoch": 10.52278820375335, + "grad_norm": 0.0281982421875, + "learning_rate": 0.016248282180114388, + "loss": 0.8028, + "num_input_tokens_seen": 41018384, + "step": 70650 + }, + { + "epoch": 10.523532916294311, + "grad_norm": 0.0299072265625, + "learning_rate": 0.01624633927946595, + "loss": 0.8088, + "num_input_tokens_seen": 41021616, + "step": 70655 + }, + { + "epoch": 10.52427762883527, + "grad_norm": 0.045654296875, + "learning_rate": 0.01624439635776188, + "loss": 0.8088, + "num_input_tokens_seen": 41024432, + "step": 70660 + }, + { + "epoch": 10.525022341376228, + "grad_norm": 0.032470703125, + "learning_rate": 0.016242453415034993, + "loss": 0.8096, + "num_input_tokens_seen": 41026928, + "step": 70665 + }, + { + "epoch": 10.525767053917187, + "grad_norm": 0.030517578125, + "learning_rate": 0.016240510451318125, + "loss": 0.8121, + "num_input_tokens_seen": 41029744, + "step": 70670 + }, + { + "epoch": 10.526511766458148, + "grad_norm": 0.06298828125, + "learning_rate": 0.016238567466644087, + "loss": 0.8221, + "num_input_tokens_seen": 41032528, + "step": 70675 + }, + { + "epoch": 10.527256478999107, + "grad_norm": 0.0274658203125, + "learning_rate": 0.016236624461045714, + "loss": 0.8077, + "num_input_tokens_seen": 41035344, + "step": 70680 + }, + { + "epoch": 10.528001191540065, + "grad_norm": 0.041748046875, + "learning_rate": 0.016234681434555826, + "loss": 0.8163, + "num_input_tokens_seen": 41038128, + "step": 70685 + }, + { + "epoch": 10.528745904081024, + "grad_norm": 0.0281982421875, + "learning_rate": 0.016232738387207256, + "loss": 0.8006, + "num_input_tokens_seen": 41041008, + "step": 70690 + }, + { + "epoch": 10.529490616621985, + "grad_norm": 0.031494140625, + "learning_rate": 0.01623079531903282, + "loss": 0.8117, + "num_input_tokens_seen": 41043984, + "step": 70695 + }, + { + "epoch": 10.530235329162943, + "grad_norm": 0.031494140625, + "learning_rate": 0.016228852230065344, + "loss": 0.795, + "num_input_tokens_seen": 41046960, + "step": 70700 + }, + { + "epoch": 10.530980041703902, + "grad_norm": 0.05126953125, + "learning_rate": 0.01622690912033766, + "loss": 0.7835, + "num_input_tokens_seen": 41049872, + "step": 70705 + }, + { + "epoch": 10.53172475424486, + "grad_norm": 0.037353515625, + "learning_rate": 0.016224965989882597, + "loss": 0.8049, + "num_input_tokens_seen": 41052592, + "step": 70710 + }, + { + "epoch": 10.532469466785821, + "grad_norm": 0.036865234375, + "learning_rate": 0.016223022838732976, + "loss": 0.7955, + "num_input_tokens_seen": 41055952, + "step": 70715 + }, + { + "epoch": 10.53321417932678, + "grad_norm": 0.048095703125, + "learning_rate": 0.01622107966692163, + "loss": 0.8145, + "num_input_tokens_seen": 41058992, + "step": 70720 + }, + { + "epoch": 10.533958891867739, + "grad_norm": 0.0257568359375, + "learning_rate": 0.01621913647448138, + "loss": 0.8082, + "num_input_tokens_seen": 41061712, + "step": 70725 + }, + { + "epoch": 10.534703604408698, + "grad_norm": 0.036376953125, + "learning_rate": 0.01621719326144506, + "loss": 0.7861, + "num_input_tokens_seen": 41064624, + "step": 70730 + }, + { + "epoch": 10.535448316949658, + "grad_norm": 0.031494140625, + "learning_rate": 0.0162152500278455, + "loss": 0.802, + "num_input_tokens_seen": 41067664, + "step": 70735 + }, + { + "epoch": 10.536193029490617, + "grad_norm": 0.028076171875, + "learning_rate": 0.016213306773715522, + "loss": 0.8236, + "num_input_tokens_seen": 41070544, + "step": 70740 + }, + { + "epoch": 10.536937742031576, + "grad_norm": 0.05126953125, + "learning_rate": 0.01621136349908796, + "loss": 0.8063, + "num_input_tokens_seen": 41074064, + "step": 70745 + }, + { + "epoch": 10.537682454572534, + "grad_norm": 0.04248046875, + "learning_rate": 0.016209420203995642, + "loss": 0.8029, + "num_input_tokens_seen": 41076656, + "step": 70750 + }, + { + "epoch": 10.538427167113493, + "grad_norm": 0.04296875, + "learning_rate": 0.0162074768884714, + "loss": 0.8033, + "num_input_tokens_seen": 41079504, + "step": 70755 + }, + { + "epoch": 10.539171879654454, + "grad_norm": 0.03955078125, + "learning_rate": 0.016205533552548067, + "loss": 0.8143, + "num_input_tokens_seen": 41082320, + "step": 70760 + }, + { + "epoch": 10.539916592195413, + "grad_norm": 0.03173828125, + "learning_rate": 0.016203590196258465, + "loss": 0.7886, + "num_input_tokens_seen": 41085072, + "step": 70765 + }, + { + "epoch": 10.540661304736371, + "grad_norm": 0.054443359375, + "learning_rate": 0.016201646819635433, + "loss": 0.7965, + "num_input_tokens_seen": 41087760, + "step": 70770 + }, + { + "epoch": 10.541406017277332, + "grad_norm": 0.05126953125, + "learning_rate": 0.016199703422711795, + "loss": 0.8106, + "num_input_tokens_seen": 41090320, + "step": 70775 + }, + { + "epoch": 10.54215072981829, + "grad_norm": 0.0556640625, + "learning_rate": 0.01619776000552039, + "loss": 0.8103, + "num_input_tokens_seen": 41093456, + "step": 70780 + }, + { + "epoch": 10.54289544235925, + "grad_norm": 0.03173828125, + "learning_rate": 0.016195816568094046, + "loss": 0.7931, + "num_input_tokens_seen": 41096272, + "step": 70785 + }, + { + "epoch": 10.543640154900208, + "grad_norm": 0.028076171875, + "learning_rate": 0.016193873110465596, + "loss": 0.7919, + "num_input_tokens_seen": 41099408, + "step": 70790 + }, + { + "epoch": 10.544384867441167, + "grad_norm": 0.03857421875, + "learning_rate": 0.016191929632667877, + "loss": 0.7753, + "num_input_tokens_seen": 41102256, + "step": 70795 + }, + { + "epoch": 10.545129579982127, + "grad_norm": 0.061279296875, + "learning_rate": 0.016189986134733714, + "loss": 0.8304, + "num_input_tokens_seen": 41105040, + "step": 70800 + }, + { + "epoch": 10.545874292523086, + "grad_norm": 0.0284423828125, + "learning_rate": 0.016188042616695944, + "loss": 0.8026, + "num_input_tokens_seen": 41107760, + "step": 70805 + }, + { + "epoch": 10.546619005064045, + "grad_norm": 0.035888671875, + "learning_rate": 0.016186099078587406, + "loss": 0.803, + "num_input_tokens_seen": 41110608, + "step": 70810 + }, + { + "epoch": 10.547363717605004, + "grad_norm": 0.039306640625, + "learning_rate": 0.01618415552044093, + "loss": 0.8088, + "num_input_tokens_seen": 41113680, + "step": 70815 + }, + { + "epoch": 10.548108430145964, + "grad_norm": 0.1201171875, + "learning_rate": 0.016182211942289346, + "loss": 0.809, + "num_input_tokens_seen": 41116720, + "step": 70820 + }, + { + "epoch": 10.548853142686923, + "grad_norm": 0.031982421875, + "learning_rate": 0.016180268344165496, + "loss": 0.7921, + "num_input_tokens_seen": 41119600, + "step": 70825 + }, + { + "epoch": 10.549597855227882, + "grad_norm": 0.040283203125, + "learning_rate": 0.016178324726102207, + "loss": 0.792, + "num_input_tokens_seen": 41122672, + "step": 70830 + }, + { + "epoch": 10.55034256776884, + "grad_norm": 0.039306640625, + "learning_rate": 0.01617638108813232, + "loss": 0.7862, + "num_input_tokens_seen": 41125840, + "step": 70835 + }, + { + "epoch": 10.551087280309801, + "grad_norm": 0.02587890625, + "learning_rate": 0.016174437430288676, + "loss": 0.8146, + "num_input_tokens_seen": 41129136, + "step": 70840 + }, + { + "epoch": 10.55183199285076, + "grad_norm": 0.035400390625, + "learning_rate": 0.0161724937526041, + "loss": 0.799, + "num_input_tokens_seen": 41131664, + "step": 70845 + }, + { + "epoch": 10.552576705391719, + "grad_norm": 0.031494140625, + "learning_rate": 0.016170550055111435, + "loss": 0.7926, + "num_input_tokens_seen": 41134256, + "step": 70850 + }, + { + "epoch": 10.553321417932677, + "grad_norm": 0.02685546875, + "learning_rate": 0.01616860633784352, + "loss": 0.8043, + "num_input_tokens_seen": 41137200, + "step": 70855 + }, + { + "epoch": 10.554066130473638, + "grad_norm": 0.04248046875, + "learning_rate": 0.016166662600833187, + "loss": 0.8401, + "num_input_tokens_seen": 41140016, + "step": 70860 + }, + { + "epoch": 10.554810843014597, + "grad_norm": 0.017822265625, + "learning_rate": 0.01616471884411327, + "loss": 0.7927, + "num_input_tokens_seen": 41143344, + "step": 70865 + }, + { + "epoch": 10.555555555555555, + "grad_norm": 0.0250244140625, + "learning_rate": 0.016162775067716614, + "loss": 0.8111, + "num_input_tokens_seen": 41146416, + "step": 70870 + }, + { + "epoch": 10.556300268096514, + "grad_norm": 0.0341796875, + "learning_rate": 0.016160831271676057, + "loss": 0.8289, + "num_input_tokens_seen": 41149264, + "step": 70875 + }, + { + "epoch": 10.557044980637475, + "grad_norm": 0.03466796875, + "learning_rate": 0.016158887456024434, + "loss": 0.7987, + "num_input_tokens_seen": 41152272, + "step": 70880 + }, + { + "epoch": 10.557789693178433, + "grad_norm": 0.02001953125, + "learning_rate": 0.016156943620794586, + "loss": 0.7899, + "num_input_tokens_seen": 41154864, + "step": 70885 + }, + { + "epoch": 10.558534405719392, + "grad_norm": 0.0250244140625, + "learning_rate": 0.016154999766019355, + "loss": 0.8144, + "num_input_tokens_seen": 41157968, + "step": 70890 + }, + { + "epoch": 10.559279118260351, + "grad_norm": 0.0260009765625, + "learning_rate": 0.016153055891731577, + "loss": 0.7916, + "num_input_tokens_seen": 41160656, + "step": 70895 + }, + { + "epoch": 10.560023830801311, + "grad_norm": 0.0177001953125, + "learning_rate": 0.016151111997964086, + "loss": 0.7972, + "num_input_tokens_seen": 41163408, + "step": 70900 + }, + { + "epoch": 10.56076854334227, + "grad_norm": 0.0260009765625, + "learning_rate": 0.016149168084749727, + "loss": 0.8072, + "num_input_tokens_seen": 41166448, + "step": 70905 + }, + { + "epoch": 10.561513255883229, + "grad_norm": 0.02734375, + "learning_rate": 0.016147224152121345, + "loss": 0.7848, + "num_input_tokens_seen": 41169200, + "step": 70910 + }, + { + "epoch": 10.562257968424188, + "grad_norm": 0.03662109375, + "learning_rate": 0.016145280200111772, + "loss": 0.7918, + "num_input_tokens_seen": 41172176, + "step": 70915 + }, + { + "epoch": 10.563002680965148, + "grad_norm": 0.039794921875, + "learning_rate": 0.016143336228753857, + "loss": 0.8226, + "num_input_tokens_seen": 41175024, + "step": 70920 + }, + { + "epoch": 10.563747393506107, + "grad_norm": 0.03125, + "learning_rate": 0.01614139223808044, + "loss": 0.7764, + "num_input_tokens_seen": 41177936, + "step": 70925 + }, + { + "epoch": 10.564492106047066, + "grad_norm": 0.041259765625, + "learning_rate": 0.016139448228124364, + "loss": 0.7918, + "num_input_tokens_seen": 41181040, + "step": 70930 + }, + { + "epoch": 10.565236818588025, + "grad_norm": 0.0286865234375, + "learning_rate": 0.016137504198918465, + "loss": 0.8305, + "num_input_tokens_seen": 41184016, + "step": 70935 + }, + { + "epoch": 10.565981531128983, + "grad_norm": 0.0272216796875, + "learning_rate": 0.01613556015049559, + "loss": 0.792, + "num_input_tokens_seen": 41186736, + "step": 70940 + }, + { + "epoch": 10.566726243669944, + "grad_norm": 0.04248046875, + "learning_rate": 0.016133616082888574, + "loss": 0.797, + "num_input_tokens_seen": 41189712, + "step": 70945 + }, + { + "epoch": 10.567470956210903, + "grad_norm": 0.04248046875, + "learning_rate": 0.016131671996130272, + "loss": 0.8442, + "num_input_tokens_seen": 41192368, + "step": 70950 + }, + { + "epoch": 10.568215668751861, + "grad_norm": 0.035400390625, + "learning_rate": 0.016129727890253524, + "loss": 0.7853, + "num_input_tokens_seen": 41195088, + "step": 70955 + }, + { + "epoch": 10.568960381292822, + "grad_norm": 0.041015625, + "learning_rate": 0.01612778376529117, + "loss": 0.8024, + "num_input_tokens_seen": 41198000, + "step": 70960 + }, + { + "epoch": 10.56970509383378, + "grad_norm": 0.037841796875, + "learning_rate": 0.016125839621276054, + "loss": 0.8147, + "num_input_tokens_seen": 41200528, + "step": 70965 + }, + { + "epoch": 10.57044980637474, + "grad_norm": 0.039306640625, + "learning_rate": 0.01612389545824102, + "loss": 0.8083, + "num_input_tokens_seen": 41203472, + "step": 70970 + }, + { + "epoch": 10.571194518915698, + "grad_norm": 0.03271484375, + "learning_rate": 0.016121951276218918, + "loss": 0.8056, + "num_input_tokens_seen": 41206224, + "step": 70975 + }, + { + "epoch": 10.571939231456657, + "grad_norm": 0.036376953125, + "learning_rate": 0.01612000707524259, + "loss": 0.7929, + "num_input_tokens_seen": 41209264, + "step": 70980 + }, + { + "epoch": 10.572683943997617, + "grad_norm": 0.0537109375, + "learning_rate": 0.016118062855344874, + "loss": 0.8066, + "num_input_tokens_seen": 41211984, + "step": 70985 + }, + { + "epoch": 10.573428656538576, + "grad_norm": 0.02490234375, + "learning_rate": 0.016116118616558633, + "loss": 0.7893, + "num_input_tokens_seen": 41214928, + "step": 70990 + }, + { + "epoch": 10.574173369079535, + "grad_norm": 0.055419921875, + "learning_rate": 0.016114174358916693, + "loss": 0.8183, + "num_input_tokens_seen": 41217872, + "step": 70995 + }, + { + "epoch": 10.574918081620494, + "grad_norm": 0.03564453125, + "learning_rate": 0.016112230082451915, + "loss": 0.8047, + "num_input_tokens_seen": 41220656, + "step": 71000 + }, + { + "epoch": 10.575662794161454, + "grad_norm": 0.026611328125, + "learning_rate": 0.016110285787197136, + "loss": 0.8123, + "num_input_tokens_seen": 41223760, + "step": 71005 + }, + { + "epoch": 10.576407506702413, + "grad_norm": 0.040283203125, + "learning_rate": 0.016108341473185208, + "loss": 0.796, + "num_input_tokens_seen": 41226544, + "step": 71010 + }, + { + "epoch": 10.577152219243372, + "grad_norm": 0.031005859375, + "learning_rate": 0.01610639714044898, + "loss": 0.8025, + "num_input_tokens_seen": 41229360, + "step": 71015 + }, + { + "epoch": 10.57789693178433, + "grad_norm": 0.0286865234375, + "learning_rate": 0.016104452789021294, + "loss": 0.8106, + "num_input_tokens_seen": 41232080, + "step": 71020 + }, + { + "epoch": 10.578641644325291, + "grad_norm": 0.0322265625, + "learning_rate": 0.016102508418935006, + "loss": 0.7802, + "num_input_tokens_seen": 41234640, + "step": 71025 + }, + { + "epoch": 10.57938635686625, + "grad_norm": 0.033447265625, + "learning_rate": 0.016100564030222954, + "loss": 0.8116, + "num_input_tokens_seen": 41237776, + "step": 71030 + }, + { + "epoch": 10.580131069407209, + "grad_norm": 0.0194091796875, + "learning_rate": 0.016098619622917993, + "loss": 0.8089, + "num_input_tokens_seen": 41240560, + "step": 71035 + }, + { + "epoch": 10.580875781948167, + "grad_norm": 0.03173828125, + "learning_rate": 0.016096675197052965, + "loss": 0.7972, + "num_input_tokens_seen": 41243472, + "step": 71040 + }, + { + "epoch": 10.581620494489128, + "grad_norm": 0.037841796875, + "learning_rate": 0.01609473075266073, + "loss": 0.8122, + "num_input_tokens_seen": 41246384, + "step": 71045 + }, + { + "epoch": 10.582365207030087, + "grad_norm": 0.04150390625, + "learning_rate": 0.01609278628977413, + "loss": 0.8074, + "num_input_tokens_seen": 41249392, + "step": 71050 + }, + { + "epoch": 10.583109919571045, + "grad_norm": 0.02734375, + "learning_rate": 0.01609084180842602, + "loss": 0.8006, + "num_input_tokens_seen": 41252112, + "step": 71055 + }, + { + "epoch": 10.583854632112004, + "grad_norm": 0.0242919921875, + "learning_rate": 0.016088897308649243, + "loss": 0.7888, + "num_input_tokens_seen": 41254960, + "step": 71060 + }, + { + "epoch": 10.584599344652965, + "grad_norm": 0.042236328125, + "learning_rate": 0.01608695279047665, + "loss": 0.802, + "num_input_tokens_seen": 41257872, + "step": 71065 + }, + { + "epoch": 10.585344057193923, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0160850082539411, + "loss": 0.7957, + "num_input_tokens_seen": 41261104, + "step": 71070 + }, + { + "epoch": 10.586088769734882, + "grad_norm": 0.035400390625, + "learning_rate": 0.01608306369907543, + "loss": 0.8165, + "num_input_tokens_seen": 41264208, + "step": 71075 + }, + { + "epoch": 10.586833482275841, + "grad_norm": 0.043212890625, + "learning_rate": 0.016081119125912504, + "loss": 0.8095, + "num_input_tokens_seen": 41266960, + "step": 71080 + }, + { + "epoch": 10.587578194816802, + "grad_norm": 0.03955078125, + "learning_rate": 0.016079174534485173, + "loss": 0.8025, + "num_input_tokens_seen": 41269904, + "step": 71085 + }, + { + "epoch": 10.58832290735776, + "grad_norm": 0.0286865234375, + "learning_rate": 0.016077229924826283, + "loss": 0.7817, + "num_input_tokens_seen": 41272752, + "step": 71090 + }, + { + "epoch": 10.589067619898719, + "grad_norm": 0.0380859375, + "learning_rate": 0.016075285296968687, + "loss": 0.784, + "num_input_tokens_seen": 41275472, + "step": 71095 + }, + { + "epoch": 10.589812332439678, + "grad_norm": 0.0390625, + "learning_rate": 0.01607334065094524, + "loss": 0.8004, + "num_input_tokens_seen": 41278032, + "step": 71100 + }, + { + "epoch": 10.590557044980638, + "grad_norm": 0.032470703125, + "learning_rate": 0.016071395986788792, + "loss": 0.8192, + "num_input_tokens_seen": 41281136, + "step": 71105 + }, + { + "epoch": 10.591301757521597, + "grad_norm": 0.04736328125, + "learning_rate": 0.016069451304532197, + "loss": 0.7795, + "num_input_tokens_seen": 41284240, + "step": 71110 + }, + { + "epoch": 10.592046470062556, + "grad_norm": 0.048583984375, + "learning_rate": 0.016067506604208307, + "loss": 0.8001, + "num_input_tokens_seen": 41287184, + "step": 71115 + }, + { + "epoch": 10.592791182603515, + "grad_norm": 0.0211181640625, + "learning_rate": 0.01606556188584998, + "loss": 0.8116, + "num_input_tokens_seen": 41290128, + "step": 71120 + }, + { + "epoch": 10.593535895144473, + "grad_norm": 0.0283203125, + "learning_rate": 0.016063617149490075, + "loss": 0.7875, + "num_input_tokens_seen": 41293040, + "step": 71125 + }, + { + "epoch": 10.594280607685434, + "grad_norm": 0.030029296875, + "learning_rate": 0.01606167239516143, + "loss": 0.7889, + "num_input_tokens_seen": 41296112, + "step": 71130 + }, + { + "epoch": 10.595025320226393, + "grad_norm": 0.029541015625, + "learning_rate": 0.016059727622896914, + "loss": 0.808, + "num_input_tokens_seen": 41298800, + "step": 71135 + }, + { + "epoch": 10.595770032767351, + "grad_norm": 0.03125, + "learning_rate": 0.016057782832729378, + "loss": 0.7873, + "num_input_tokens_seen": 41301616, + "step": 71140 + }, + { + "epoch": 10.59651474530831, + "grad_norm": 0.0216064453125, + "learning_rate": 0.01605583802469167, + "loss": 0.8131, + "num_input_tokens_seen": 41304496, + "step": 71145 + }, + { + "epoch": 10.59725945784927, + "grad_norm": 0.04541015625, + "learning_rate": 0.016053893198816654, + "loss": 0.8101, + "num_input_tokens_seen": 41307408, + "step": 71150 + }, + { + "epoch": 10.59800417039023, + "grad_norm": 0.03564453125, + "learning_rate": 0.016051948355137187, + "loss": 0.7894, + "num_input_tokens_seen": 41310288, + "step": 71155 + }, + { + "epoch": 10.598748882931188, + "grad_norm": 0.027099609375, + "learning_rate": 0.016050003493686116, + "loss": 0.7874, + "num_input_tokens_seen": 41312816, + "step": 71160 + }, + { + "epoch": 10.599493595472147, + "grad_norm": 0.041015625, + "learning_rate": 0.01604805861449631, + "loss": 0.82, + "num_input_tokens_seen": 41315952, + "step": 71165 + }, + { + "epoch": 10.600238308013108, + "grad_norm": 0.032470703125, + "learning_rate": 0.01604611371760061, + "loss": 0.7985, + "num_input_tokens_seen": 41318768, + "step": 71170 + }, + { + "epoch": 10.600983020554066, + "grad_norm": 0.0263671875, + "learning_rate": 0.01604416880303189, + "loss": 0.8211, + "num_input_tokens_seen": 41321744, + "step": 71175 + }, + { + "epoch": 10.601727733095025, + "grad_norm": 0.030517578125, + "learning_rate": 0.016042223870823, + "loss": 0.7902, + "num_input_tokens_seen": 41324528, + "step": 71180 + }, + { + "epoch": 10.602472445635984, + "grad_norm": 0.031982421875, + "learning_rate": 0.01604027892100679, + "loss": 0.7861, + "num_input_tokens_seen": 41327440, + "step": 71185 + }, + { + "epoch": 10.603217158176944, + "grad_norm": 0.0361328125, + "learning_rate": 0.01603833395361613, + "loss": 0.7849, + "num_input_tokens_seen": 41330416, + "step": 71190 + }, + { + "epoch": 10.603961870717903, + "grad_norm": 0.031982421875, + "learning_rate": 0.01603638896868387, + "loss": 0.8078, + "num_input_tokens_seen": 41333296, + "step": 71195 + }, + { + "epoch": 10.604706583258862, + "grad_norm": 0.05810546875, + "learning_rate": 0.016034443966242873, + "loss": 0.7984, + "num_input_tokens_seen": 41336240, + "step": 71200 + }, + { + "epoch": 10.60545129579982, + "grad_norm": 0.03125, + "learning_rate": 0.016032498946325996, + "loss": 0.7814, + "num_input_tokens_seen": 41338832, + "step": 71205 + }, + { + "epoch": 10.606196008340781, + "grad_norm": 0.028076171875, + "learning_rate": 0.0160305539089661, + "loss": 0.8272, + "num_input_tokens_seen": 41341552, + "step": 71210 + }, + { + "epoch": 10.60694072088174, + "grad_norm": 0.0286865234375, + "learning_rate": 0.016028608854196044, + "loss": 0.8065, + "num_input_tokens_seen": 41344560, + "step": 71215 + }, + { + "epoch": 10.607685433422699, + "grad_norm": 0.0296630859375, + "learning_rate": 0.016026663782048686, + "loss": 0.8124, + "num_input_tokens_seen": 41347728, + "step": 71220 + }, + { + "epoch": 10.608430145963657, + "grad_norm": 0.025390625, + "learning_rate": 0.016024718692556887, + "loss": 0.7918, + "num_input_tokens_seen": 41350544, + "step": 71225 + }, + { + "epoch": 10.609174858504618, + "grad_norm": 0.0220947265625, + "learning_rate": 0.016022773585753505, + "loss": 0.8035, + "num_input_tokens_seen": 41353488, + "step": 71230 + }, + { + "epoch": 10.609919571045577, + "grad_norm": 0.02587890625, + "learning_rate": 0.016020828461671407, + "loss": 0.8025, + "num_input_tokens_seen": 41356816, + "step": 71235 + }, + { + "epoch": 10.610664283586535, + "grad_norm": 0.046630859375, + "learning_rate": 0.016018883320343444, + "loss": 0.807, + "num_input_tokens_seen": 41359888, + "step": 71240 + }, + { + "epoch": 10.611408996127494, + "grad_norm": 0.02685546875, + "learning_rate": 0.016016938161802486, + "loss": 0.8048, + "num_input_tokens_seen": 41362704, + "step": 71245 + }, + { + "epoch": 10.612153708668455, + "grad_norm": 0.036376953125, + "learning_rate": 0.01601499298608139, + "loss": 0.7981, + "num_input_tokens_seen": 41365552, + "step": 71250 + }, + { + "epoch": 10.612898421209414, + "grad_norm": 0.035400390625, + "learning_rate": 0.016013047793213022, + "loss": 0.7938, + "num_input_tokens_seen": 41368848, + "step": 71255 + }, + { + "epoch": 10.613643133750372, + "grad_norm": 0.041748046875, + "learning_rate": 0.01601110258323024, + "loss": 0.7992, + "num_input_tokens_seen": 41371632, + "step": 71260 + }, + { + "epoch": 10.614387846291331, + "grad_norm": 0.021484375, + "learning_rate": 0.01600915735616591, + "loss": 0.8052, + "num_input_tokens_seen": 41374544, + "step": 71265 + }, + { + "epoch": 10.615132558832292, + "grad_norm": 0.01904296875, + "learning_rate": 0.016007212112052888, + "loss": 0.7882, + "num_input_tokens_seen": 41377552, + "step": 71270 + }, + { + "epoch": 10.61587727137325, + "grad_norm": 0.0289306640625, + "learning_rate": 0.01600526685092404, + "loss": 0.7856, + "num_input_tokens_seen": 41380656, + "step": 71275 + }, + { + "epoch": 10.616621983914209, + "grad_norm": 0.029052734375, + "learning_rate": 0.01600332157281223, + "loss": 0.7992, + "num_input_tokens_seen": 41383984, + "step": 71280 + }, + { + "epoch": 10.617366696455168, + "grad_norm": 0.033935546875, + "learning_rate": 0.016001376277750324, + "loss": 0.7944, + "num_input_tokens_seen": 41386704, + "step": 71285 + }, + { + "epoch": 10.618111408996128, + "grad_norm": 0.037353515625, + "learning_rate": 0.015999430965771187, + "loss": 0.7847, + "num_input_tokens_seen": 41389712, + "step": 71290 + }, + { + "epoch": 10.618856121537087, + "grad_norm": 0.0341796875, + "learning_rate": 0.015997485636907678, + "loss": 0.8044, + "num_input_tokens_seen": 41392912, + "step": 71295 + }, + { + "epoch": 10.619600834078046, + "grad_norm": 0.0269775390625, + "learning_rate": 0.01599554029119266, + "loss": 0.7884, + "num_input_tokens_seen": 41395856, + "step": 71300 + }, + { + "epoch": 10.620345546619005, + "grad_norm": 0.03759765625, + "learning_rate": 0.015993594928659, + "loss": 0.7955, + "num_input_tokens_seen": 41398896, + "step": 71305 + }, + { + "epoch": 10.621090259159963, + "grad_norm": 0.031494140625, + "learning_rate": 0.015991649549339567, + "loss": 0.7917, + "num_input_tokens_seen": 41401840, + "step": 71310 + }, + { + "epoch": 10.621834971700924, + "grad_norm": 0.050048828125, + "learning_rate": 0.01598970415326722, + "loss": 0.7964, + "num_input_tokens_seen": 41404720, + "step": 71315 + }, + { + "epoch": 10.622579684241883, + "grad_norm": 0.03515625, + "learning_rate": 0.01598775874047483, + "loss": 0.7897, + "num_input_tokens_seen": 41407888, + "step": 71320 + }, + { + "epoch": 10.623324396782841, + "grad_norm": 0.01953125, + "learning_rate": 0.015985813310995255, + "loss": 0.8048, + "num_input_tokens_seen": 41410736, + "step": 71325 + }, + { + "epoch": 10.6240691093238, + "grad_norm": 0.042724609375, + "learning_rate": 0.01598386786486137, + "loss": 0.8271, + "num_input_tokens_seen": 41413520, + "step": 71330 + }, + { + "epoch": 10.62481382186476, + "grad_norm": 0.02099609375, + "learning_rate": 0.01598192240210604, + "loss": 0.7774, + "num_input_tokens_seen": 41416336, + "step": 71335 + }, + { + "epoch": 10.62555853440572, + "grad_norm": 0.051025390625, + "learning_rate": 0.015979976922762124, + "loss": 0.7842, + "num_input_tokens_seen": 41419280, + "step": 71340 + }, + { + "epoch": 10.626303246946678, + "grad_norm": 0.06689453125, + "learning_rate": 0.015978031426862493, + "loss": 0.7762, + "num_input_tokens_seen": 41422256, + "step": 71345 + }, + { + "epoch": 10.627047959487637, + "grad_norm": 0.0595703125, + "learning_rate": 0.015976085914440018, + "loss": 0.8329, + "num_input_tokens_seen": 41424944, + "step": 71350 + }, + { + "epoch": 10.627792672028598, + "grad_norm": 0.036865234375, + "learning_rate": 0.015974140385527566, + "loss": 0.7913, + "num_input_tokens_seen": 41427888, + "step": 71355 + }, + { + "epoch": 10.628537384569556, + "grad_norm": 0.032470703125, + "learning_rate": 0.015972194840157997, + "loss": 0.8113, + "num_input_tokens_seen": 41430864, + "step": 71360 + }, + { + "epoch": 10.629282097110515, + "grad_norm": 0.0216064453125, + "learning_rate": 0.015970249278364185, + "loss": 0.8132, + "num_input_tokens_seen": 41433744, + "step": 71365 + }, + { + "epoch": 10.630026809651474, + "grad_norm": 0.0303955078125, + "learning_rate": 0.015968303700179, + "loss": 0.8001, + "num_input_tokens_seen": 41436368, + "step": 71370 + }, + { + "epoch": 10.630771522192434, + "grad_norm": 0.028076171875, + "learning_rate": 0.015966358105635313, + "loss": 0.7689, + "num_input_tokens_seen": 41439056, + "step": 71375 + }, + { + "epoch": 10.631516234733393, + "grad_norm": 0.031005859375, + "learning_rate": 0.01596441249476598, + "loss": 0.794, + "num_input_tokens_seen": 41442064, + "step": 71380 + }, + { + "epoch": 10.632260947274352, + "grad_norm": 0.030517578125, + "learning_rate": 0.015962466867603876, + "loss": 0.779, + "num_input_tokens_seen": 41444688, + "step": 71385 + }, + { + "epoch": 10.63300565981531, + "grad_norm": 0.032958984375, + "learning_rate": 0.015960521224181883, + "loss": 0.8038, + "num_input_tokens_seen": 41447632, + "step": 71390 + }, + { + "epoch": 10.633750372356271, + "grad_norm": 0.032470703125, + "learning_rate": 0.01595857556453285, + "loss": 0.7895, + "num_input_tokens_seen": 41450640, + "step": 71395 + }, + { + "epoch": 10.63449508489723, + "grad_norm": 0.049560546875, + "learning_rate": 0.01595662988868966, + "loss": 0.8095, + "num_input_tokens_seen": 41453648, + "step": 71400 + }, + { + "epoch": 10.635239797438189, + "grad_norm": 0.04345703125, + "learning_rate": 0.015954684196685182, + "loss": 0.826, + "num_input_tokens_seen": 41456464, + "step": 71405 + }, + { + "epoch": 10.635984509979147, + "grad_norm": 0.024658203125, + "learning_rate": 0.015952738488552283, + "loss": 0.7988, + "num_input_tokens_seen": 41459632, + "step": 71410 + }, + { + "epoch": 10.636729222520108, + "grad_norm": 0.03369140625, + "learning_rate": 0.015950792764323842, + "loss": 0.804, + "num_input_tokens_seen": 41462448, + "step": 71415 + }, + { + "epoch": 10.637473935061067, + "grad_norm": 0.038818359375, + "learning_rate": 0.015948847024032715, + "loss": 0.8039, + "num_input_tokens_seen": 41465232, + "step": 71420 + }, + { + "epoch": 10.638218647602026, + "grad_norm": 0.0267333984375, + "learning_rate": 0.01594690126771179, + "loss": 0.8044, + "num_input_tokens_seen": 41468112, + "step": 71425 + }, + { + "epoch": 10.638963360142984, + "grad_norm": 0.023193359375, + "learning_rate": 0.01594495549539392, + "loss": 0.8104, + "num_input_tokens_seen": 41470992, + "step": 71430 + }, + { + "epoch": 10.639708072683945, + "grad_norm": 0.0303955078125, + "learning_rate": 0.015943009707111995, + "loss": 0.8029, + "num_input_tokens_seen": 41474032, + "step": 71435 + }, + { + "epoch": 10.640452785224904, + "grad_norm": 0.02490234375, + "learning_rate": 0.01594106390289888, + "loss": 0.7871, + "num_input_tokens_seen": 41476848, + "step": 71440 + }, + { + "epoch": 10.641197497765862, + "grad_norm": 0.0306396484375, + "learning_rate": 0.015939118082787442, + "loss": 0.8026, + "num_input_tokens_seen": 41479664, + "step": 71445 + }, + { + "epoch": 10.641942210306821, + "grad_norm": 0.038330078125, + "learning_rate": 0.015937172246810558, + "loss": 0.7987, + "num_input_tokens_seen": 41482352, + "step": 71450 + }, + { + "epoch": 10.64268692284778, + "grad_norm": 0.031005859375, + "learning_rate": 0.015935226395001105, + "loss": 0.8035, + "num_input_tokens_seen": 41485360, + "step": 71455 + }, + { + "epoch": 10.64343163538874, + "grad_norm": 0.02880859375, + "learning_rate": 0.015933280527391958, + "loss": 0.7965, + "num_input_tokens_seen": 41488048, + "step": 71460 + }, + { + "epoch": 10.6441763479297, + "grad_norm": 0.029052734375, + "learning_rate": 0.015931334644015974, + "loss": 0.7927, + "num_input_tokens_seen": 41490992, + "step": 71465 + }, + { + "epoch": 10.644921060470658, + "grad_norm": 0.03173828125, + "learning_rate": 0.015929388744906042, + "loss": 0.8154, + "num_input_tokens_seen": 41494128, + "step": 71470 + }, + { + "epoch": 10.645665773011618, + "grad_norm": 0.02099609375, + "learning_rate": 0.015927442830095034, + "loss": 0.8117, + "num_input_tokens_seen": 41497072, + "step": 71475 + }, + { + "epoch": 10.646410485552577, + "grad_norm": 0.04296875, + "learning_rate": 0.015925496899615817, + "loss": 0.8, + "num_input_tokens_seen": 41500016, + "step": 71480 + }, + { + "epoch": 10.647155198093536, + "grad_norm": 0.036376953125, + "learning_rate": 0.01592355095350128, + "loss": 0.7893, + "num_input_tokens_seen": 41502640, + "step": 71485 + }, + { + "epoch": 10.647899910634495, + "grad_norm": 0.028076171875, + "learning_rate": 0.01592160499178428, + "loss": 0.7934, + "num_input_tokens_seen": 41505584, + "step": 71490 + }, + { + "epoch": 10.648644623175453, + "grad_norm": 0.06005859375, + "learning_rate": 0.0159196590144977, + "loss": 0.7841, + "num_input_tokens_seen": 41508400, + "step": 71495 + }, + { + "epoch": 10.649389335716414, + "grad_norm": 0.03076171875, + "learning_rate": 0.015917713021674423, + "loss": 0.8373, + "num_input_tokens_seen": 41511216, + "step": 71500 + }, + { + "epoch": 10.650134048257373, + "grad_norm": 0.035888671875, + "learning_rate": 0.015915767013347307, + "loss": 0.806, + "num_input_tokens_seen": 41514224, + "step": 71505 + }, + { + "epoch": 10.650878760798332, + "grad_norm": 0.03369140625, + "learning_rate": 0.015913820989549247, + "loss": 0.7981, + "num_input_tokens_seen": 41517200, + "step": 71510 + }, + { + "epoch": 10.65162347333929, + "grad_norm": 0.049072265625, + "learning_rate": 0.015911874950313102, + "loss": 0.8005, + "num_input_tokens_seen": 41520304, + "step": 71515 + }, + { + "epoch": 10.65236818588025, + "grad_norm": 0.0380859375, + "learning_rate": 0.015909928895671763, + "loss": 0.7969, + "num_input_tokens_seen": 41523024, + "step": 71520 + }, + { + "epoch": 10.65311289842121, + "grad_norm": 0.031494140625, + "learning_rate": 0.015907982825658102, + "loss": 0.7896, + "num_input_tokens_seen": 41525744, + "step": 71525 + }, + { + "epoch": 10.653857610962168, + "grad_norm": 0.0234375, + "learning_rate": 0.01590603674030499, + "loss": 0.8019, + "num_input_tokens_seen": 41528400, + "step": 71530 + }, + { + "epoch": 10.654602323503127, + "grad_norm": 0.06787109375, + "learning_rate": 0.0159040906396453, + "loss": 0.8135, + "num_input_tokens_seen": 41531440, + "step": 71535 + }, + { + "epoch": 10.655347036044088, + "grad_norm": 0.03271484375, + "learning_rate": 0.01590214452371193, + "loss": 0.7954, + "num_input_tokens_seen": 41534384, + "step": 71540 + }, + { + "epoch": 10.656091748585046, + "grad_norm": 0.046875, + "learning_rate": 0.015900198392537743, + "loss": 0.8024, + "num_input_tokens_seen": 41537552, + "step": 71545 + }, + { + "epoch": 10.656836461126005, + "grad_norm": 0.043212890625, + "learning_rate": 0.01589825224615562, + "loss": 0.7906, + "num_input_tokens_seen": 41540304, + "step": 71550 + }, + { + "epoch": 10.657581173666964, + "grad_norm": 0.02783203125, + "learning_rate": 0.015896306084598435, + "loss": 0.7994, + "num_input_tokens_seen": 41543440, + "step": 71555 + }, + { + "epoch": 10.658325886207924, + "grad_norm": 0.0206298828125, + "learning_rate": 0.01589435990789907, + "loss": 0.8, + "num_input_tokens_seen": 41546128, + "step": 71560 + }, + { + "epoch": 10.659070598748883, + "grad_norm": 0.02294921875, + "learning_rate": 0.0158924137160904, + "loss": 0.8067, + "num_input_tokens_seen": 41548848, + "step": 71565 + }, + { + "epoch": 10.659815311289842, + "grad_norm": 0.0380859375, + "learning_rate": 0.015890467509205308, + "loss": 0.7945, + "num_input_tokens_seen": 41551504, + "step": 71570 + }, + { + "epoch": 10.6605600238308, + "grad_norm": 0.028564453125, + "learning_rate": 0.015888521287276677, + "loss": 0.808, + "num_input_tokens_seen": 41554192, + "step": 71575 + }, + { + "epoch": 10.661304736371761, + "grad_norm": 0.035888671875, + "learning_rate": 0.01588657505033738, + "loss": 0.803, + "num_input_tokens_seen": 41557136, + "step": 71580 + }, + { + "epoch": 10.66204944891272, + "grad_norm": 0.03564453125, + "learning_rate": 0.015884628798420296, + "loss": 0.814, + "num_input_tokens_seen": 41560016, + "step": 71585 + }, + { + "epoch": 10.662794161453679, + "grad_norm": 0.056396484375, + "learning_rate": 0.015882682531558313, + "loss": 0.7974, + "num_input_tokens_seen": 41563056, + "step": 71590 + }, + { + "epoch": 10.663538873994638, + "grad_norm": 0.034423828125, + "learning_rate": 0.015880736249784304, + "loss": 0.7798, + "num_input_tokens_seen": 41565648, + "step": 71595 + }, + { + "epoch": 10.664283586535598, + "grad_norm": 0.0225830078125, + "learning_rate": 0.015878789953131148, + "loss": 0.7762, + "num_input_tokens_seen": 41568624, + "step": 71600 + }, + { + "epoch": 10.665028299076557, + "grad_norm": 0.034423828125, + "learning_rate": 0.01587684364163173, + "loss": 0.8142, + "num_input_tokens_seen": 41571728, + "step": 71605 + }, + { + "epoch": 10.665773011617516, + "grad_norm": 0.038330078125, + "learning_rate": 0.015874897315318933, + "loss": 0.7868, + "num_input_tokens_seen": 41574864, + "step": 71610 + }, + { + "epoch": 10.666517724158474, + "grad_norm": 0.057861328125, + "learning_rate": 0.01587295097422563, + "loss": 0.8349, + "num_input_tokens_seen": 41577840, + "step": 71615 + }, + { + "epoch": 10.667262436699435, + "grad_norm": 0.025146484375, + "learning_rate": 0.015871004618384713, + "loss": 0.7856, + "num_input_tokens_seen": 41580848, + "step": 71620 + }, + { + "epoch": 10.668007149240394, + "grad_norm": 0.03271484375, + "learning_rate": 0.015869058247829058, + "loss": 0.8201, + "num_input_tokens_seen": 41583824, + "step": 71625 + }, + { + "epoch": 10.668751861781352, + "grad_norm": 0.04736328125, + "learning_rate": 0.015867111862591543, + "loss": 0.8282, + "num_input_tokens_seen": 41586544, + "step": 71630 + }, + { + "epoch": 10.669496574322311, + "grad_norm": 0.03369140625, + "learning_rate": 0.015865165462705055, + "loss": 0.7972, + "num_input_tokens_seen": 41589232, + "step": 71635 + }, + { + "epoch": 10.67024128686327, + "grad_norm": 0.04296875, + "learning_rate": 0.015863219048202478, + "loss": 0.8069, + "num_input_tokens_seen": 41592080, + "step": 71640 + }, + { + "epoch": 10.67098599940423, + "grad_norm": 0.0517578125, + "learning_rate": 0.015861272619116688, + "loss": 0.8083, + "num_input_tokens_seen": 41595056, + "step": 71645 + }, + { + "epoch": 10.67173071194519, + "grad_norm": 0.052490234375, + "learning_rate": 0.01585932617548058, + "loss": 0.8077, + "num_input_tokens_seen": 41597776, + "step": 71650 + }, + { + "epoch": 10.672475424486148, + "grad_norm": 0.021240234375, + "learning_rate": 0.015857379717327022, + "loss": 0.8015, + "num_input_tokens_seen": 41600496, + "step": 71655 + }, + { + "epoch": 10.673220137027108, + "grad_norm": 0.04052734375, + "learning_rate": 0.015855433244688914, + "loss": 0.8131, + "num_input_tokens_seen": 41603472, + "step": 71660 + }, + { + "epoch": 10.673964849568067, + "grad_norm": 0.052490234375, + "learning_rate": 0.015853486757599122, + "loss": 0.7922, + "num_input_tokens_seen": 41606352, + "step": 71665 + }, + { + "epoch": 10.674709562109026, + "grad_norm": 0.03857421875, + "learning_rate": 0.015851540256090542, + "loss": 0.7836, + "num_input_tokens_seen": 41609296, + "step": 71670 + }, + { + "epoch": 10.675454274649985, + "grad_norm": 0.03125, + "learning_rate": 0.015849593740196056, + "loss": 0.7987, + "num_input_tokens_seen": 41612016, + "step": 71675 + }, + { + "epoch": 10.676198987190944, + "grad_norm": 0.0341796875, + "learning_rate": 0.015847647209948544, + "loss": 0.7786, + "num_input_tokens_seen": 41614864, + "step": 71680 + }, + { + "epoch": 10.676943699731904, + "grad_norm": 0.040771484375, + "learning_rate": 0.015845700665380897, + "loss": 0.8137, + "num_input_tokens_seen": 41617712, + "step": 71685 + }, + { + "epoch": 10.677688412272863, + "grad_norm": 0.0361328125, + "learning_rate": 0.015843754106525998, + "loss": 0.7702, + "num_input_tokens_seen": 41620656, + "step": 71690 + }, + { + "epoch": 10.678433124813822, + "grad_norm": 0.0400390625, + "learning_rate": 0.015841807533416727, + "loss": 0.7863, + "num_input_tokens_seen": 41623504, + "step": 71695 + }, + { + "epoch": 10.67917783735478, + "grad_norm": 0.039306640625, + "learning_rate": 0.015839860946085978, + "loss": 0.8091, + "num_input_tokens_seen": 41626480, + "step": 71700 + }, + { + "epoch": 10.67992254989574, + "grad_norm": 0.0654296875, + "learning_rate": 0.015837914344566627, + "loss": 0.7839, + "num_input_tokens_seen": 41629328, + "step": 71705 + }, + { + "epoch": 10.6806672624367, + "grad_norm": 0.0245361328125, + "learning_rate": 0.015835967728891565, + "loss": 0.7872, + "num_input_tokens_seen": 41632368, + "step": 71710 + }, + { + "epoch": 10.681411974977658, + "grad_norm": 0.047607421875, + "learning_rate": 0.01583402109909368, + "loss": 0.7619, + "num_input_tokens_seen": 41635216, + "step": 71715 + }, + { + "epoch": 10.682156687518617, + "grad_norm": 0.03857421875, + "learning_rate": 0.015832074455205857, + "loss": 0.83, + "num_input_tokens_seen": 41638032, + "step": 71720 + }, + { + "epoch": 10.682901400059578, + "grad_norm": 0.03955078125, + "learning_rate": 0.015830127797260982, + "loss": 0.8076, + "num_input_tokens_seen": 41640848, + "step": 71725 + }, + { + "epoch": 10.683646112600536, + "grad_norm": 0.0341796875, + "learning_rate": 0.01582818112529194, + "loss": 0.7876, + "num_input_tokens_seen": 41643760, + "step": 71730 + }, + { + "epoch": 10.684390825141495, + "grad_norm": 0.05078125, + "learning_rate": 0.015826234439331618, + "loss": 0.8173, + "num_input_tokens_seen": 41646416, + "step": 71735 + }, + { + "epoch": 10.685135537682454, + "grad_norm": 0.032958984375, + "learning_rate": 0.01582428773941291, + "loss": 0.8372, + "num_input_tokens_seen": 41649328, + "step": 71740 + }, + { + "epoch": 10.685880250223414, + "grad_norm": 0.0238037109375, + "learning_rate": 0.015822341025568693, + "loss": 0.7678, + "num_input_tokens_seen": 41652272, + "step": 71745 + }, + { + "epoch": 10.686624962764373, + "grad_norm": 0.03466796875, + "learning_rate": 0.01582039429783186, + "loss": 0.7979, + "num_input_tokens_seen": 41655344, + "step": 71750 + }, + { + "epoch": 10.687369675305332, + "grad_norm": 0.045654296875, + "learning_rate": 0.015818447556235304, + "loss": 0.7778, + "num_input_tokens_seen": 41658096, + "step": 71755 + }, + { + "epoch": 10.68811438784629, + "grad_norm": 0.0390625, + "learning_rate": 0.015816500800811907, + "loss": 0.7874, + "num_input_tokens_seen": 41660880, + "step": 71760 + }, + { + "epoch": 10.688859100387251, + "grad_norm": 0.03466796875, + "learning_rate": 0.015814554031594555, + "loss": 0.8157, + "num_input_tokens_seen": 41663632, + "step": 71765 + }, + { + "epoch": 10.68960381292821, + "grad_norm": 0.02734375, + "learning_rate": 0.01581260724861614, + "loss": 0.8025, + "num_input_tokens_seen": 41666448, + "step": 71770 + }, + { + "epoch": 10.690348525469169, + "grad_norm": 0.0322265625, + "learning_rate": 0.015810660451909556, + "loss": 0.8001, + "num_input_tokens_seen": 41669392, + "step": 71775 + }, + { + "epoch": 10.691093238010128, + "grad_norm": 0.02880859375, + "learning_rate": 0.01580871364150769, + "loss": 0.8031, + "num_input_tokens_seen": 41672784, + "step": 71780 + }, + { + "epoch": 10.691837950551088, + "grad_norm": 0.033935546875, + "learning_rate": 0.015806766817443422, + "loss": 0.7951, + "num_input_tokens_seen": 41675536, + "step": 71785 + }, + { + "epoch": 10.692582663092047, + "grad_norm": 0.044189453125, + "learning_rate": 0.015804819979749654, + "loss": 0.8172, + "num_input_tokens_seen": 41678352, + "step": 71790 + }, + { + "epoch": 10.693327375633006, + "grad_norm": 0.0341796875, + "learning_rate": 0.015802873128459266, + "loss": 0.7811, + "num_input_tokens_seen": 41680976, + "step": 71795 + }, + { + "epoch": 10.694072088173964, + "grad_norm": 0.026611328125, + "learning_rate": 0.015800926263605155, + "loss": 0.8066, + "num_input_tokens_seen": 41683952, + "step": 71800 + }, + { + "epoch": 10.694816800714925, + "grad_norm": 0.035400390625, + "learning_rate": 0.015798979385220208, + "loss": 0.7834, + "num_input_tokens_seen": 41686864, + "step": 71805 + }, + { + "epoch": 10.695561513255884, + "grad_norm": 0.03125, + "learning_rate": 0.015797032493337314, + "loss": 0.8123, + "num_input_tokens_seen": 41689840, + "step": 71810 + }, + { + "epoch": 10.696306225796842, + "grad_norm": 0.03125, + "learning_rate": 0.015795085587989373, + "loss": 0.8174, + "num_input_tokens_seen": 41692784, + "step": 71815 + }, + { + "epoch": 10.697050938337801, + "grad_norm": 0.0311279296875, + "learning_rate": 0.015793138669209263, + "loss": 0.8209, + "num_input_tokens_seen": 41695280, + "step": 71820 + }, + { + "epoch": 10.69779565087876, + "grad_norm": 0.043212890625, + "learning_rate": 0.015791191737029885, + "loss": 0.8011, + "num_input_tokens_seen": 41698320, + "step": 71825 + }, + { + "epoch": 10.69854036341972, + "grad_norm": 0.025634765625, + "learning_rate": 0.015789244791484126, + "loss": 0.8053, + "num_input_tokens_seen": 41701136, + "step": 71830 + }, + { + "epoch": 10.69928507596068, + "grad_norm": 0.048095703125, + "learning_rate": 0.01578729783260488, + "loss": 0.7985, + "num_input_tokens_seen": 41703984, + "step": 71835 + }, + { + "epoch": 10.700029788501638, + "grad_norm": 0.045166015625, + "learning_rate": 0.01578535086042503, + "loss": 0.8025, + "num_input_tokens_seen": 41707184, + "step": 71840 + }, + { + "epoch": 10.700774501042597, + "grad_norm": 0.024169921875, + "learning_rate": 0.015783403874977477, + "loss": 0.8188, + "num_input_tokens_seen": 41710096, + "step": 71845 + }, + { + "epoch": 10.701519213583557, + "grad_norm": 0.0228271484375, + "learning_rate": 0.01578145687629512, + "loss": 0.7814, + "num_input_tokens_seen": 41713008, + "step": 71850 + }, + { + "epoch": 10.702263926124516, + "grad_norm": 0.039794921875, + "learning_rate": 0.015779509864410835, + "loss": 0.8214, + "num_input_tokens_seen": 41715728, + "step": 71855 + }, + { + "epoch": 10.703008638665475, + "grad_norm": 0.035888671875, + "learning_rate": 0.01577756283935753, + "loss": 0.7946, + "num_input_tokens_seen": 41718448, + "step": 71860 + }, + { + "epoch": 10.703753351206434, + "grad_norm": 0.03564453125, + "learning_rate": 0.015775615801168087, + "loss": 0.7821, + "num_input_tokens_seen": 41721232, + "step": 71865 + }, + { + "epoch": 10.704498063747394, + "grad_norm": 0.0203857421875, + "learning_rate": 0.015773668749875402, + "loss": 0.7887, + "num_input_tokens_seen": 41724016, + "step": 71870 + }, + { + "epoch": 10.705242776288353, + "grad_norm": 0.039306640625, + "learning_rate": 0.01577172168551237, + "loss": 0.8008, + "num_input_tokens_seen": 41726960, + "step": 71875 + }, + { + "epoch": 10.705987488829312, + "grad_norm": 0.037353515625, + "learning_rate": 0.015769774608111884, + "loss": 0.7879, + "num_input_tokens_seen": 41730128, + "step": 71880 + }, + { + "epoch": 10.70673220137027, + "grad_norm": 0.033935546875, + "learning_rate": 0.015767827517706844, + "loss": 0.8109, + "num_input_tokens_seen": 41733072, + "step": 71885 + }, + { + "epoch": 10.707476913911231, + "grad_norm": 0.028564453125, + "learning_rate": 0.015765880414330136, + "loss": 0.7805, + "num_input_tokens_seen": 41735888, + "step": 71890 + }, + { + "epoch": 10.70822162645219, + "grad_norm": 0.03857421875, + "learning_rate": 0.015763933298014652, + "loss": 0.7983, + "num_input_tokens_seen": 41738736, + "step": 71895 + }, + { + "epoch": 10.708966338993148, + "grad_norm": 0.0286865234375, + "learning_rate": 0.015761986168793297, + "loss": 0.8127, + "num_input_tokens_seen": 41741584, + "step": 71900 + }, + { + "epoch": 10.709711051534107, + "grad_norm": 0.051025390625, + "learning_rate": 0.01576003902669896, + "loss": 0.779, + "num_input_tokens_seen": 41744880, + "step": 71905 + }, + { + "epoch": 10.710455764075068, + "grad_norm": 0.035888671875, + "learning_rate": 0.01575809187176453, + "loss": 0.8009, + "num_input_tokens_seen": 41747984, + "step": 71910 + }, + { + "epoch": 10.711200476616026, + "grad_norm": 0.03173828125, + "learning_rate": 0.015756144704022913, + "loss": 0.7908, + "num_input_tokens_seen": 41751440, + "step": 71915 + }, + { + "epoch": 10.711945189156985, + "grad_norm": 0.044677734375, + "learning_rate": 0.015754197523507, + "loss": 0.7845, + "num_input_tokens_seen": 41754512, + "step": 71920 + }, + { + "epoch": 10.712689901697944, + "grad_norm": 0.04150390625, + "learning_rate": 0.015752250330249684, + "loss": 0.7773, + "num_input_tokens_seen": 41757232, + "step": 71925 + }, + { + "epoch": 10.713434614238905, + "grad_norm": 0.050537109375, + "learning_rate": 0.015750303124283868, + "loss": 0.7922, + "num_input_tokens_seen": 41760176, + "step": 71930 + }, + { + "epoch": 10.714179326779863, + "grad_norm": 0.034912109375, + "learning_rate": 0.015748355905642438, + "loss": 0.7947, + "num_input_tokens_seen": 41762992, + "step": 71935 + }, + { + "epoch": 10.714924039320822, + "grad_norm": 0.06396484375, + "learning_rate": 0.0157464086743583, + "loss": 0.8013, + "num_input_tokens_seen": 41766224, + "step": 71940 + }, + { + "epoch": 10.71566875186178, + "grad_norm": 0.035400390625, + "learning_rate": 0.015744461430464343, + "loss": 0.7933, + "num_input_tokens_seen": 41768944, + "step": 71945 + }, + { + "epoch": 10.716413464402741, + "grad_norm": 0.034423828125, + "learning_rate": 0.015742514173993468, + "loss": 0.8127, + "num_input_tokens_seen": 41772048, + "step": 71950 + }, + { + "epoch": 10.7171581769437, + "grad_norm": 0.037109375, + "learning_rate": 0.015740566904978572, + "loss": 0.819, + "num_input_tokens_seen": 41774864, + "step": 71955 + }, + { + "epoch": 10.717902889484659, + "grad_norm": 0.043701171875, + "learning_rate": 0.015738619623452545, + "loss": 0.8307, + "num_input_tokens_seen": 41777712, + "step": 71960 + }, + { + "epoch": 10.718647602025618, + "grad_norm": 0.04296875, + "learning_rate": 0.01573667232944829, + "loss": 0.8194, + "num_input_tokens_seen": 41780752, + "step": 71965 + }, + { + "epoch": 10.719392314566576, + "grad_norm": 0.031982421875, + "learning_rate": 0.015734725022998712, + "loss": 0.8107, + "num_input_tokens_seen": 41783536, + "step": 71970 + }, + { + "epoch": 10.720137027107537, + "grad_norm": 0.038818359375, + "learning_rate": 0.015732777704136697, + "loss": 0.7878, + "num_input_tokens_seen": 41786160, + "step": 71975 + }, + { + "epoch": 10.720881739648496, + "grad_norm": 0.030517578125, + "learning_rate": 0.015730830372895153, + "loss": 0.8039, + "num_input_tokens_seen": 41788816, + "step": 71980 + }, + { + "epoch": 10.721626452189454, + "grad_norm": 0.055419921875, + "learning_rate": 0.01572888302930697, + "loss": 0.7927, + "num_input_tokens_seen": 41791632, + "step": 71985 + }, + { + "epoch": 10.722371164730415, + "grad_norm": 0.0634765625, + "learning_rate": 0.015726935673405044, + "loss": 0.8106, + "num_input_tokens_seen": 41794480, + "step": 71990 + }, + { + "epoch": 10.723115877271374, + "grad_norm": 0.052001953125, + "learning_rate": 0.015724988305222284, + "loss": 0.804, + "num_input_tokens_seen": 41797424, + "step": 71995 + }, + { + "epoch": 10.723860589812332, + "grad_norm": 0.05908203125, + "learning_rate": 0.015723040924791577, + "loss": 0.8162, + "num_input_tokens_seen": 41800176, + "step": 72000 + }, + { + "epoch": 10.724605302353291, + "grad_norm": 0.047607421875, + "learning_rate": 0.015721093532145836, + "loss": 0.7936, + "num_input_tokens_seen": 41802992, + "step": 72005 + }, + { + "epoch": 10.72535001489425, + "grad_norm": 0.11474609375, + "learning_rate": 0.01571914612731795, + "loss": 0.8042, + "num_input_tokens_seen": 41805808, + "step": 72010 + }, + { + "epoch": 10.72609472743521, + "grad_norm": 0.1005859375, + "learning_rate": 0.015717198710340823, + "loss": 0.819, + "num_input_tokens_seen": 41808880, + "step": 72015 + }, + { + "epoch": 10.72683943997617, + "grad_norm": 0.1884765625, + "learning_rate": 0.01571525128124735, + "loss": 0.7825, + "num_input_tokens_seen": 41811888, + "step": 72020 + }, + { + "epoch": 10.727584152517128, + "grad_norm": 0.04736328125, + "learning_rate": 0.015713303840070435, + "loss": 0.8232, + "num_input_tokens_seen": 41814640, + "step": 72025 + }, + { + "epoch": 10.728328865058087, + "grad_norm": 0.05224609375, + "learning_rate": 0.015711356386842974, + "loss": 0.807, + "num_input_tokens_seen": 41817232, + "step": 72030 + }, + { + "epoch": 10.729073577599047, + "grad_norm": 0.06298828125, + "learning_rate": 0.015709408921597872, + "loss": 0.8073, + "num_input_tokens_seen": 41820016, + "step": 72035 + }, + { + "epoch": 10.729818290140006, + "grad_norm": 0.072265625, + "learning_rate": 0.015707461444368024, + "loss": 0.7879, + "num_input_tokens_seen": 41822864, + "step": 72040 + }, + { + "epoch": 10.730563002680965, + "grad_norm": 0.05908203125, + "learning_rate": 0.015705513955186334, + "loss": 0.8086, + "num_input_tokens_seen": 41826064, + "step": 72045 + }, + { + "epoch": 10.731307715221924, + "grad_norm": 0.05029296875, + "learning_rate": 0.01570356645408571, + "loss": 0.8331, + "num_input_tokens_seen": 41828944, + "step": 72050 + }, + { + "epoch": 10.732052427762884, + "grad_norm": 0.052978515625, + "learning_rate": 0.01570161894109904, + "loss": 0.8027, + "num_input_tokens_seen": 41831664, + "step": 72055 + }, + { + "epoch": 10.732797140303843, + "grad_norm": 0.06201171875, + "learning_rate": 0.015699671416259234, + "loss": 0.8153, + "num_input_tokens_seen": 41834896, + "step": 72060 + }, + { + "epoch": 10.733541852844802, + "grad_norm": 0.03955078125, + "learning_rate": 0.01569772387959919, + "loss": 0.794, + "num_input_tokens_seen": 41837680, + "step": 72065 + }, + { + "epoch": 10.73428656538576, + "grad_norm": 0.0546875, + "learning_rate": 0.015695776331151805, + "loss": 0.7932, + "num_input_tokens_seen": 41840496, + "step": 72070 + }, + { + "epoch": 10.735031277926721, + "grad_norm": 0.031494140625, + "learning_rate": 0.01569382877094999, + "loss": 0.8187, + "num_input_tokens_seen": 41843728, + "step": 72075 + }, + { + "epoch": 10.73577599046768, + "grad_norm": 0.05712890625, + "learning_rate": 0.01569188119902664, + "loss": 0.7974, + "num_input_tokens_seen": 41846896, + "step": 72080 + }, + { + "epoch": 10.736520703008638, + "grad_norm": 0.0439453125, + "learning_rate": 0.015689933615414667, + "loss": 0.7816, + "num_input_tokens_seen": 41850096, + "step": 72085 + }, + { + "epoch": 10.737265415549597, + "grad_norm": 0.044189453125, + "learning_rate": 0.015687986020146965, + "loss": 0.8092, + "num_input_tokens_seen": 41853040, + "step": 72090 + }, + { + "epoch": 10.738010128090558, + "grad_norm": 0.051025390625, + "learning_rate": 0.015686038413256432, + "loss": 0.8217, + "num_input_tokens_seen": 41856208, + "step": 72095 + }, + { + "epoch": 10.738754840631517, + "grad_norm": 0.053466796875, + "learning_rate": 0.015684090794775986, + "loss": 0.8157, + "num_input_tokens_seen": 41859408, + "step": 72100 + }, + { + "epoch": 10.739499553172475, + "grad_norm": 0.060546875, + "learning_rate": 0.015682143164738517, + "loss": 0.8109, + "num_input_tokens_seen": 41862448, + "step": 72105 + }, + { + "epoch": 10.740244265713434, + "grad_norm": 0.04443359375, + "learning_rate": 0.015680195523176933, + "loss": 0.8119, + "num_input_tokens_seen": 41865392, + "step": 72110 + }, + { + "epoch": 10.740988978254395, + "grad_norm": 0.044921875, + "learning_rate": 0.015678247870124135, + "loss": 0.8008, + "num_input_tokens_seen": 41868304, + "step": 72115 + }, + { + "epoch": 10.741733690795353, + "grad_norm": 0.0654296875, + "learning_rate": 0.015676300205613034, + "loss": 0.7917, + "num_input_tokens_seen": 41871216, + "step": 72120 + }, + { + "epoch": 10.742478403336312, + "grad_norm": 0.03271484375, + "learning_rate": 0.015674352529676524, + "loss": 0.7931, + "num_input_tokens_seen": 41874064, + "step": 72125 + }, + { + "epoch": 10.74322311587727, + "grad_norm": 0.037841796875, + "learning_rate": 0.015672404842347512, + "loss": 0.7933, + "num_input_tokens_seen": 41876560, + "step": 72130 + }, + { + "epoch": 10.743967828418231, + "grad_norm": 0.033203125, + "learning_rate": 0.015670457143658903, + "loss": 0.7873, + "num_input_tokens_seen": 41879280, + "step": 72135 + }, + { + "epoch": 10.74471254095919, + "grad_norm": 0.052490234375, + "learning_rate": 0.015668509433643604, + "loss": 0.8287, + "num_input_tokens_seen": 41882064, + "step": 72140 + }, + { + "epoch": 10.745457253500149, + "grad_norm": 0.0556640625, + "learning_rate": 0.01566656171233452, + "loss": 0.8276, + "num_input_tokens_seen": 41885072, + "step": 72145 + }, + { + "epoch": 10.746201966041108, + "grad_norm": 0.046875, + "learning_rate": 0.01566461397976455, + "loss": 0.8186, + "num_input_tokens_seen": 41887952, + "step": 72150 + }, + { + "epoch": 10.746946678582066, + "grad_norm": 0.040771484375, + "learning_rate": 0.01566266623596661, + "loss": 0.8188, + "num_input_tokens_seen": 41890960, + "step": 72155 + }, + { + "epoch": 10.747691391123027, + "grad_norm": 0.040771484375, + "learning_rate": 0.015660718480973588, + "loss": 0.8156, + "num_input_tokens_seen": 41893840, + "step": 72160 + }, + { + "epoch": 10.748436103663986, + "grad_norm": 0.02392578125, + "learning_rate": 0.0156587707148184, + "loss": 0.8086, + "num_input_tokens_seen": 41896656, + "step": 72165 + }, + { + "epoch": 10.749180816204944, + "grad_norm": 0.0224609375, + "learning_rate": 0.015656822937533954, + "loss": 0.7939, + "num_input_tokens_seen": 41899408, + "step": 72170 + }, + { + "epoch": 10.749925528745905, + "grad_norm": 0.0255126953125, + "learning_rate": 0.01565487514915315, + "loss": 0.8064, + "num_input_tokens_seen": 41902224, + "step": 72175 + }, + { + "epoch": 10.750670241286864, + "grad_norm": 0.037353515625, + "learning_rate": 0.015652927349708898, + "loss": 0.7985, + "num_input_tokens_seen": 41905392, + "step": 72180 + }, + { + "epoch": 10.751414953827823, + "grad_norm": 0.039794921875, + "learning_rate": 0.015650979539234104, + "loss": 0.8098, + "num_input_tokens_seen": 41908208, + "step": 72185 + }, + { + "epoch": 10.752159666368781, + "grad_norm": 0.041015625, + "learning_rate": 0.015649031717761663, + "loss": 0.8091, + "num_input_tokens_seen": 41910896, + "step": 72190 + }, + { + "epoch": 10.75290437890974, + "grad_norm": 0.041259765625, + "learning_rate": 0.015647083885324497, + "loss": 0.8101, + "num_input_tokens_seen": 41913744, + "step": 72195 + }, + { + "epoch": 10.7536490914507, + "grad_norm": 0.02001953125, + "learning_rate": 0.015645136041955506, + "loss": 0.8056, + "num_input_tokens_seen": 41916208, + "step": 72200 + }, + { + "epoch": 10.75439380399166, + "grad_norm": 0.0238037109375, + "learning_rate": 0.015643188187687597, + "loss": 0.8048, + "num_input_tokens_seen": 41919184, + "step": 72205 + }, + { + "epoch": 10.755138516532618, + "grad_norm": 0.03955078125, + "learning_rate": 0.015641240322553675, + "loss": 0.7958, + "num_input_tokens_seen": 41922096, + "step": 72210 + }, + { + "epoch": 10.755883229073577, + "grad_norm": 0.0228271484375, + "learning_rate": 0.015639292446586656, + "loss": 0.8046, + "num_input_tokens_seen": 41925072, + "step": 72215 + }, + { + "epoch": 10.756627941614537, + "grad_norm": 0.03857421875, + "learning_rate": 0.015637344559819435, + "loss": 0.7839, + "num_input_tokens_seen": 41927728, + "step": 72220 + }, + { + "epoch": 10.757372654155496, + "grad_norm": 0.0299072265625, + "learning_rate": 0.01563539666228493, + "loss": 0.806, + "num_input_tokens_seen": 41930640, + "step": 72225 + }, + { + "epoch": 10.758117366696455, + "grad_norm": 0.03369140625, + "learning_rate": 0.01563344875401604, + "loss": 0.791, + "num_input_tokens_seen": 41933680, + "step": 72230 + }, + { + "epoch": 10.758862079237414, + "grad_norm": 0.0380859375, + "learning_rate": 0.015631500835045678, + "loss": 0.8108, + "num_input_tokens_seen": 41936496, + "step": 72235 + }, + { + "epoch": 10.759606791778374, + "grad_norm": 0.046875, + "learning_rate": 0.01562955290540675, + "loss": 0.8002, + "num_input_tokens_seen": 41939472, + "step": 72240 + }, + { + "epoch": 10.760351504319333, + "grad_norm": 0.037353515625, + "learning_rate": 0.01562760496513217, + "loss": 0.8082, + "num_input_tokens_seen": 41942320, + "step": 72245 + }, + { + "epoch": 10.761096216860292, + "grad_norm": 0.051513671875, + "learning_rate": 0.015625657014254838, + "loss": 0.7969, + "num_input_tokens_seen": 41945232, + "step": 72250 + }, + { + "epoch": 10.76184092940125, + "grad_norm": 0.033935546875, + "learning_rate": 0.01562370905280767, + "loss": 0.795, + "num_input_tokens_seen": 41948432, + "step": 72255 + }, + { + "epoch": 10.762585641942211, + "grad_norm": 0.0478515625, + "learning_rate": 0.015621761080823568, + "loss": 0.8262, + "num_input_tokens_seen": 41951248, + "step": 72260 + }, + { + "epoch": 10.76333035448317, + "grad_norm": 0.0262451171875, + "learning_rate": 0.015619813098335445, + "loss": 0.8058, + "num_input_tokens_seen": 41954032, + "step": 72265 + }, + { + "epoch": 10.764075067024129, + "grad_norm": 0.0693359375, + "learning_rate": 0.015617865105376213, + "loss": 0.7926, + "num_input_tokens_seen": 41956848, + "step": 72270 + }, + { + "epoch": 10.764819779565087, + "grad_norm": 0.053955078125, + "learning_rate": 0.015615917101978774, + "loss": 0.8071, + "num_input_tokens_seen": 41959536, + "step": 72275 + }, + { + "epoch": 10.765564492106048, + "grad_norm": 0.053955078125, + "learning_rate": 0.015613969088176041, + "loss": 0.8354, + "num_input_tokens_seen": 41962416, + "step": 72280 + }, + { + "epoch": 10.766309204647007, + "grad_norm": 0.04833984375, + "learning_rate": 0.015612021064000928, + "loss": 0.8043, + "num_input_tokens_seen": 41965200, + "step": 72285 + }, + { + "epoch": 10.767053917187965, + "grad_norm": 0.0228271484375, + "learning_rate": 0.01561007302948634, + "loss": 0.7953, + "num_input_tokens_seen": 41968176, + "step": 72290 + }, + { + "epoch": 10.767798629728924, + "grad_norm": 0.0400390625, + "learning_rate": 0.015608124984665187, + "loss": 0.8004, + "num_input_tokens_seen": 41971056, + "step": 72295 + }, + { + "epoch": 10.768543342269885, + "grad_norm": 0.10693359375, + "learning_rate": 0.01560617692957038, + "loss": 0.8002, + "num_input_tokens_seen": 41973872, + "step": 72300 + }, + { + "epoch": 10.769288054810843, + "grad_norm": 0.04248046875, + "learning_rate": 0.015604228864234833, + "loss": 0.7963, + "num_input_tokens_seen": 41976848, + "step": 72305 + }, + { + "epoch": 10.770032767351802, + "grad_norm": 0.057373046875, + "learning_rate": 0.01560228078869145, + "loss": 0.8097, + "num_input_tokens_seen": 41979568, + "step": 72310 + }, + { + "epoch": 10.770777479892761, + "grad_norm": 0.06689453125, + "learning_rate": 0.015600332702973148, + "loss": 0.808, + "num_input_tokens_seen": 41982576, + "step": 72315 + }, + { + "epoch": 10.771522192433721, + "grad_norm": 0.04638671875, + "learning_rate": 0.015598384607112834, + "loss": 0.8136, + "num_input_tokens_seen": 41985584, + "step": 72320 + }, + { + "epoch": 10.77226690497468, + "grad_norm": 0.039306640625, + "learning_rate": 0.01559643650114342, + "loss": 0.7958, + "num_input_tokens_seen": 41989168, + "step": 72325 + }, + { + "epoch": 10.773011617515639, + "grad_norm": 0.05419921875, + "learning_rate": 0.015594488385097817, + "loss": 0.7994, + "num_input_tokens_seen": 41992080, + "step": 72330 + }, + { + "epoch": 10.773756330056598, + "grad_norm": 0.057861328125, + "learning_rate": 0.015592540259008938, + "loss": 0.7895, + "num_input_tokens_seen": 41994928, + "step": 72335 + }, + { + "epoch": 10.774501042597556, + "grad_norm": 0.037841796875, + "learning_rate": 0.015590592122909691, + "loss": 0.8154, + "num_input_tokens_seen": 41997744, + "step": 72340 + }, + { + "epoch": 10.775245755138517, + "grad_norm": 0.04443359375, + "learning_rate": 0.015588643976832994, + "loss": 0.8113, + "num_input_tokens_seen": 42000400, + "step": 72345 + }, + { + "epoch": 10.775990467679476, + "grad_norm": 0.026611328125, + "learning_rate": 0.015586695820811756, + "loss": 0.7982, + "num_input_tokens_seen": 42003120, + "step": 72350 + }, + { + "epoch": 10.776735180220435, + "grad_norm": 0.061767578125, + "learning_rate": 0.015584747654878887, + "loss": 0.7809, + "num_input_tokens_seen": 42006032, + "step": 72355 + }, + { + "epoch": 10.777479892761393, + "grad_norm": 0.0888671875, + "learning_rate": 0.015582799479067298, + "loss": 0.8253, + "num_input_tokens_seen": 42009232, + "step": 72360 + }, + { + "epoch": 10.778224605302354, + "grad_norm": 0.05126953125, + "learning_rate": 0.015580851293409904, + "loss": 0.7996, + "num_input_tokens_seen": 42012464, + "step": 72365 + }, + { + "epoch": 10.778969317843313, + "grad_norm": 0.046875, + "learning_rate": 0.01557890309793962, + "loss": 0.8058, + "num_input_tokens_seen": 42015440, + "step": 72370 + }, + { + "epoch": 10.779714030384271, + "grad_norm": 0.04296875, + "learning_rate": 0.015576954892689352, + "loss": 0.8005, + "num_input_tokens_seen": 42018320, + "step": 72375 + }, + { + "epoch": 10.78045874292523, + "grad_norm": 0.061767578125, + "learning_rate": 0.015575006677692025, + "loss": 0.8005, + "num_input_tokens_seen": 42021136, + "step": 72380 + }, + { + "epoch": 10.78120345546619, + "grad_norm": 0.08837890625, + "learning_rate": 0.015573058452980546, + "loss": 0.7919, + "num_input_tokens_seen": 42024272, + "step": 72385 + }, + { + "epoch": 10.78194816800715, + "grad_norm": 0.02294921875, + "learning_rate": 0.01557111021858782, + "loss": 0.8028, + "num_input_tokens_seen": 42026960, + "step": 72390 + }, + { + "epoch": 10.782692880548108, + "grad_norm": 0.051513671875, + "learning_rate": 0.015569161974546768, + "loss": 0.804, + "num_input_tokens_seen": 42030096, + "step": 72395 + }, + { + "epoch": 10.783437593089067, + "grad_norm": 0.0361328125, + "learning_rate": 0.015567213720890302, + "loss": 0.8003, + "num_input_tokens_seen": 42032880, + "step": 72400 + }, + { + "epoch": 10.784182305630027, + "grad_norm": 0.03515625, + "learning_rate": 0.015565265457651337, + "loss": 0.8009, + "num_input_tokens_seen": 42035696, + "step": 72405 + }, + { + "epoch": 10.784927018170986, + "grad_norm": 0.03466796875, + "learning_rate": 0.015563317184862788, + "loss": 0.7961, + "num_input_tokens_seen": 42038352, + "step": 72410 + }, + { + "epoch": 10.785671730711945, + "grad_norm": 0.02880859375, + "learning_rate": 0.015561368902557573, + "loss": 0.8041, + "num_input_tokens_seen": 42041168, + "step": 72415 + }, + { + "epoch": 10.786416443252904, + "grad_norm": 0.0576171875, + "learning_rate": 0.015559420610768596, + "loss": 0.815, + "num_input_tokens_seen": 42044016, + "step": 72420 + }, + { + "epoch": 10.787161155793864, + "grad_norm": 0.043212890625, + "learning_rate": 0.015557472309528779, + "loss": 0.8261, + "num_input_tokens_seen": 42046800, + "step": 72425 + }, + { + "epoch": 10.787905868334823, + "grad_norm": 0.04150390625, + "learning_rate": 0.01555552399887103, + "loss": 0.7764, + "num_input_tokens_seen": 42049456, + "step": 72430 + }, + { + "epoch": 10.788650580875782, + "grad_norm": 0.0306396484375, + "learning_rate": 0.015553575678828266, + "loss": 0.8125, + "num_input_tokens_seen": 42052240, + "step": 72435 + }, + { + "epoch": 10.78939529341674, + "grad_norm": 0.044189453125, + "learning_rate": 0.015551627349433408, + "loss": 0.8121, + "num_input_tokens_seen": 42055024, + "step": 72440 + }, + { + "epoch": 10.790140005957701, + "grad_norm": 0.04052734375, + "learning_rate": 0.015549679010719363, + "loss": 0.8014, + "num_input_tokens_seen": 42057648, + "step": 72445 + }, + { + "epoch": 10.79088471849866, + "grad_norm": 0.06201171875, + "learning_rate": 0.015547730662719054, + "loss": 0.7925, + "num_input_tokens_seen": 42060752, + "step": 72450 + }, + { + "epoch": 10.791629431039619, + "grad_norm": 0.056640625, + "learning_rate": 0.01554578230546539, + "loss": 0.8144, + "num_input_tokens_seen": 42063728, + "step": 72455 + }, + { + "epoch": 10.792374143580577, + "grad_norm": 0.039794921875, + "learning_rate": 0.01554383393899129, + "loss": 0.8075, + "num_input_tokens_seen": 42066352, + "step": 72460 + }, + { + "epoch": 10.793118856121538, + "grad_norm": 0.04638671875, + "learning_rate": 0.015541885563329666, + "loss": 0.8041, + "num_input_tokens_seen": 42069328, + "step": 72465 + }, + { + "epoch": 10.793863568662497, + "grad_norm": 0.045654296875, + "learning_rate": 0.015539937178513437, + "loss": 0.8103, + "num_input_tokens_seen": 42071984, + "step": 72470 + }, + { + "epoch": 10.794608281203455, + "grad_norm": 0.054443359375, + "learning_rate": 0.015537988784575518, + "loss": 0.8082, + "num_input_tokens_seen": 42075280, + "step": 72475 + }, + { + "epoch": 10.795352993744414, + "grad_norm": 0.035400390625, + "learning_rate": 0.015536040381548825, + "loss": 0.8307, + "num_input_tokens_seen": 42078288, + "step": 72480 + }, + { + "epoch": 10.796097706285373, + "grad_norm": 0.050048828125, + "learning_rate": 0.015534091969466276, + "loss": 0.8089, + "num_input_tokens_seen": 42081328, + "step": 72485 + }, + { + "epoch": 10.796842418826333, + "grad_norm": 0.04296875, + "learning_rate": 0.015532143548360786, + "loss": 0.7942, + "num_input_tokens_seen": 42084400, + "step": 72490 + }, + { + "epoch": 10.797587131367292, + "grad_norm": 0.0234375, + "learning_rate": 0.01553019511826527, + "loss": 0.82, + "num_input_tokens_seen": 42087440, + "step": 72495 + }, + { + "epoch": 10.798331843908251, + "grad_norm": 0.025146484375, + "learning_rate": 0.015528246679212647, + "loss": 0.8167, + "num_input_tokens_seen": 42090416, + "step": 72500 + }, + { + "epoch": 10.799076556449211, + "grad_norm": 0.034912109375, + "learning_rate": 0.015526298231235835, + "loss": 0.8018, + "num_input_tokens_seen": 42093200, + "step": 72505 + }, + { + "epoch": 10.79982126899017, + "grad_norm": 0.0361328125, + "learning_rate": 0.015524349774367749, + "loss": 0.7976, + "num_input_tokens_seen": 42096304, + "step": 72510 + }, + { + "epoch": 10.800565981531129, + "grad_norm": 0.028076171875, + "learning_rate": 0.015522401308641307, + "loss": 0.8024, + "num_input_tokens_seen": 42099472, + "step": 72515 + }, + { + "epoch": 10.801310694072088, + "grad_norm": 0.04443359375, + "learning_rate": 0.015520452834089421, + "loss": 0.7981, + "num_input_tokens_seen": 42102352, + "step": 72520 + }, + { + "epoch": 10.802055406613047, + "grad_norm": 0.033935546875, + "learning_rate": 0.015518504350745017, + "loss": 0.7992, + "num_input_tokens_seen": 42105200, + "step": 72525 + }, + { + "epoch": 10.802800119154007, + "grad_norm": 0.043701171875, + "learning_rate": 0.015516555858641005, + "loss": 0.8092, + "num_input_tokens_seen": 42108240, + "step": 72530 + }, + { + "epoch": 10.803544831694966, + "grad_norm": 0.039794921875, + "learning_rate": 0.01551460735781031, + "loss": 0.8103, + "num_input_tokens_seen": 42110928, + "step": 72535 + }, + { + "epoch": 10.804289544235925, + "grad_norm": 0.0267333984375, + "learning_rate": 0.015512658848285848, + "loss": 0.7922, + "num_input_tokens_seen": 42113776, + "step": 72540 + }, + { + "epoch": 10.805034256776883, + "grad_norm": 0.03759765625, + "learning_rate": 0.015510710330100536, + "loss": 0.8049, + "num_input_tokens_seen": 42116592, + "step": 72545 + }, + { + "epoch": 10.805778969317844, + "grad_norm": 0.0478515625, + "learning_rate": 0.015508761803287292, + "loss": 0.8054, + "num_input_tokens_seen": 42119248, + "step": 72550 + }, + { + "epoch": 10.806523681858803, + "grad_norm": 0.050537109375, + "learning_rate": 0.015506813267879034, + "loss": 0.8189, + "num_input_tokens_seen": 42122256, + "step": 72555 + }, + { + "epoch": 10.807268394399761, + "grad_norm": 0.0301513671875, + "learning_rate": 0.015504864723908678, + "loss": 0.8099, + "num_input_tokens_seen": 42125168, + "step": 72560 + }, + { + "epoch": 10.80801310694072, + "grad_norm": 0.0419921875, + "learning_rate": 0.01550291617140915, + "loss": 0.8073, + "num_input_tokens_seen": 42128016, + "step": 72565 + }, + { + "epoch": 10.80875781948168, + "grad_norm": 0.04541015625, + "learning_rate": 0.015500967610413358, + "loss": 0.8071, + "num_input_tokens_seen": 42131152, + "step": 72570 + }, + { + "epoch": 10.80950253202264, + "grad_norm": 0.0419921875, + "learning_rate": 0.01549901904095423, + "loss": 0.8112, + "num_input_tokens_seen": 42134128, + "step": 72575 + }, + { + "epoch": 10.810247244563598, + "grad_norm": 0.042236328125, + "learning_rate": 0.015497070463064687, + "loss": 0.7937, + "num_input_tokens_seen": 42136976, + "step": 72580 + }, + { + "epoch": 10.810991957104557, + "grad_norm": 0.032470703125, + "learning_rate": 0.015495121876777642, + "loss": 0.8016, + "num_input_tokens_seen": 42140080, + "step": 72585 + }, + { + "epoch": 10.811736669645517, + "grad_norm": 0.04345703125, + "learning_rate": 0.015493173282126016, + "loss": 0.8082, + "num_input_tokens_seen": 42142608, + "step": 72590 + }, + { + "epoch": 10.812481382186476, + "grad_norm": 0.05322265625, + "learning_rate": 0.015491224679142726, + "loss": 0.7849, + "num_input_tokens_seen": 42145392, + "step": 72595 + }, + { + "epoch": 10.813226094727435, + "grad_norm": 0.0556640625, + "learning_rate": 0.015489276067860694, + "loss": 0.8041, + "num_input_tokens_seen": 42148560, + "step": 72600 + }, + { + "epoch": 10.813970807268394, + "grad_norm": 0.039306640625, + "learning_rate": 0.01548732744831284, + "loss": 0.7985, + "num_input_tokens_seen": 42151280, + "step": 72605 + }, + { + "epoch": 10.814715519809354, + "grad_norm": 0.035888671875, + "learning_rate": 0.015485378820532086, + "loss": 0.8014, + "num_input_tokens_seen": 42154320, + "step": 72610 + }, + { + "epoch": 10.815460232350313, + "grad_norm": 0.06591796875, + "learning_rate": 0.015483430184551355, + "loss": 0.8007, + "num_input_tokens_seen": 42157264, + "step": 72615 + }, + { + "epoch": 10.816204944891272, + "grad_norm": 0.0517578125, + "learning_rate": 0.015481481540403555, + "loss": 0.7891, + "num_input_tokens_seen": 42160016, + "step": 72620 + }, + { + "epoch": 10.81694965743223, + "grad_norm": 0.045166015625, + "learning_rate": 0.015479532888121619, + "loss": 0.815, + "num_input_tokens_seen": 42163088, + "step": 72625 + }, + { + "epoch": 10.817694369973191, + "grad_norm": 0.05517578125, + "learning_rate": 0.015477584227738457, + "loss": 0.8101, + "num_input_tokens_seen": 42166000, + "step": 72630 + }, + { + "epoch": 10.81843908251415, + "grad_norm": 0.0380859375, + "learning_rate": 0.015475635559286997, + "loss": 0.7706, + "num_input_tokens_seen": 42168656, + "step": 72635 + }, + { + "epoch": 10.819183795055109, + "grad_norm": 0.0306396484375, + "learning_rate": 0.015473686882800156, + "loss": 0.7959, + "num_input_tokens_seen": 42171472, + "step": 72640 + }, + { + "epoch": 10.819928507596067, + "grad_norm": 0.04296875, + "learning_rate": 0.015471738198310856, + "loss": 0.811, + "num_input_tokens_seen": 42174288, + "step": 72645 + }, + { + "epoch": 10.820673220137028, + "grad_norm": 0.044921875, + "learning_rate": 0.015469789505852026, + "loss": 0.7873, + "num_input_tokens_seen": 42177328, + "step": 72650 + }, + { + "epoch": 10.821417932677987, + "grad_norm": 0.052978515625, + "learning_rate": 0.015467840805456573, + "loss": 0.8047, + "num_input_tokens_seen": 42180080, + "step": 72655 + }, + { + "epoch": 10.822162645218945, + "grad_norm": 0.055419921875, + "learning_rate": 0.015465892097157426, + "loss": 0.8073, + "num_input_tokens_seen": 42182992, + "step": 72660 + }, + { + "epoch": 10.822907357759904, + "grad_norm": 0.07666015625, + "learning_rate": 0.015463943380987506, + "loss": 0.8092, + "num_input_tokens_seen": 42185968, + "step": 72665 + }, + { + "epoch": 10.823652070300863, + "grad_norm": 0.08544921875, + "learning_rate": 0.015461994656979736, + "loss": 0.8001, + "num_input_tokens_seen": 42188656, + "step": 72670 + }, + { + "epoch": 10.824396782841823, + "grad_norm": 0.043701171875, + "learning_rate": 0.015460045925167035, + "loss": 0.7991, + "num_input_tokens_seen": 42191696, + "step": 72675 + }, + { + "epoch": 10.825141495382782, + "grad_norm": 0.06103515625, + "learning_rate": 0.015458097185582328, + "loss": 0.7717, + "num_input_tokens_seen": 42194352, + "step": 72680 + }, + { + "epoch": 10.825886207923741, + "grad_norm": 0.047119140625, + "learning_rate": 0.015456148438258531, + "loss": 0.815, + "num_input_tokens_seen": 42197520, + "step": 72685 + }, + { + "epoch": 10.826630920464702, + "grad_norm": 0.039794921875, + "learning_rate": 0.015454199683228568, + "loss": 0.7928, + "num_input_tokens_seen": 42200176, + "step": 72690 + }, + { + "epoch": 10.82737563300566, + "grad_norm": 0.07470703125, + "learning_rate": 0.015452250920525363, + "loss": 0.7916, + "num_input_tokens_seen": 42203024, + "step": 72695 + }, + { + "epoch": 10.828120345546619, + "grad_norm": 0.049072265625, + "learning_rate": 0.015450302150181842, + "loss": 0.8218, + "num_input_tokens_seen": 42205840, + "step": 72700 + }, + { + "epoch": 10.828865058087578, + "grad_norm": 0.055908203125, + "learning_rate": 0.01544835337223092, + "loss": 0.7854, + "num_input_tokens_seen": 42208720, + "step": 72705 + }, + { + "epoch": 10.829609770628537, + "grad_norm": 0.0361328125, + "learning_rate": 0.015446404586705528, + "loss": 0.7868, + "num_input_tokens_seen": 42211600, + "step": 72710 + }, + { + "epoch": 10.830354483169497, + "grad_norm": 0.0625, + "learning_rate": 0.015444455793638586, + "loss": 0.8063, + "num_input_tokens_seen": 42214640, + "step": 72715 + }, + { + "epoch": 10.831099195710456, + "grad_norm": 0.0380859375, + "learning_rate": 0.015442506993063009, + "loss": 0.792, + "num_input_tokens_seen": 42217520, + "step": 72720 + }, + { + "epoch": 10.831843908251415, + "grad_norm": 0.0419921875, + "learning_rate": 0.01544055818501173, + "loss": 0.7807, + "num_input_tokens_seen": 42220464, + "step": 72725 + }, + { + "epoch": 10.832588620792373, + "grad_norm": 0.07373046875, + "learning_rate": 0.015438609369517664, + "loss": 0.8152, + "num_input_tokens_seen": 42223568, + "step": 72730 + }, + { + "epoch": 10.833333333333334, + "grad_norm": 0.043212890625, + "learning_rate": 0.015436660546613742, + "loss": 0.78, + "num_input_tokens_seen": 42226288, + "step": 72735 + }, + { + "epoch": 10.834078045874293, + "grad_norm": 0.043212890625, + "learning_rate": 0.01543471171633288, + "loss": 0.8044, + "num_input_tokens_seen": 42228880, + "step": 72740 + }, + { + "epoch": 10.834822758415251, + "grad_norm": 0.04248046875, + "learning_rate": 0.01543276287870801, + "loss": 0.7695, + "num_input_tokens_seen": 42232016, + "step": 72745 + }, + { + "epoch": 10.83556747095621, + "grad_norm": 0.05029296875, + "learning_rate": 0.015430814033772052, + "loss": 0.7765, + "num_input_tokens_seen": 42235024, + "step": 72750 + }, + { + "epoch": 10.83631218349717, + "grad_norm": 0.0625, + "learning_rate": 0.015428865181557925, + "loss": 0.8029, + "num_input_tokens_seen": 42237936, + "step": 72755 + }, + { + "epoch": 10.83705689603813, + "grad_norm": 0.1240234375, + "learning_rate": 0.015426916322098558, + "loss": 0.8048, + "num_input_tokens_seen": 42240976, + "step": 72760 + }, + { + "epoch": 10.837801608579088, + "grad_norm": 0.044189453125, + "learning_rate": 0.015424967455426873, + "loss": 0.8255, + "num_input_tokens_seen": 42243824, + "step": 72765 + }, + { + "epoch": 10.838546321120047, + "grad_norm": 0.0986328125, + "learning_rate": 0.015423018581575795, + "loss": 0.7978, + "num_input_tokens_seen": 42246544, + "step": 72770 + }, + { + "epoch": 10.839291033661008, + "grad_norm": 0.0810546875, + "learning_rate": 0.015421069700578246, + "loss": 0.8019, + "num_input_tokens_seen": 42249488, + "step": 72775 + }, + { + "epoch": 10.840035746201966, + "grad_norm": 0.08203125, + "learning_rate": 0.015419120812467157, + "loss": 0.7917, + "num_input_tokens_seen": 42252656, + "step": 72780 + }, + { + "epoch": 10.840780458742925, + "grad_norm": 0.0771484375, + "learning_rate": 0.015417171917275445, + "loss": 0.835, + "num_input_tokens_seen": 42255408, + "step": 72785 + }, + { + "epoch": 10.841525171283884, + "grad_norm": 0.055908203125, + "learning_rate": 0.015415223015036039, + "loss": 0.7873, + "num_input_tokens_seen": 42258288, + "step": 72790 + }, + { + "epoch": 10.842269883824844, + "grad_norm": 0.06640625, + "learning_rate": 0.01541327410578186, + "loss": 0.7962, + "num_input_tokens_seen": 42260880, + "step": 72795 + }, + { + "epoch": 10.843014596365803, + "grad_norm": 0.0849609375, + "learning_rate": 0.015411325189545836, + "loss": 0.817, + "num_input_tokens_seen": 42263664, + "step": 72800 + }, + { + "epoch": 10.843759308906762, + "grad_norm": 0.053955078125, + "learning_rate": 0.015409376266360889, + "loss": 0.8199, + "num_input_tokens_seen": 42266672, + "step": 72805 + }, + { + "epoch": 10.84450402144772, + "grad_norm": 0.0908203125, + "learning_rate": 0.01540742733625995, + "loss": 0.7933, + "num_input_tokens_seen": 42269392, + "step": 72810 + }, + { + "epoch": 10.845248733988681, + "grad_norm": 0.052978515625, + "learning_rate": 0.015405478399275939, + "loss": 0.8157, + "num_input_tokens_seen": 42272240, + "step": 72815 + }, + { + "epoch": 10.84599344652964, + "grad_norm": 0.05859375, + "learning_rate": 0.01540352945544178, + "loss": 0.8072, + "num_input_tokens_seen": 42274992, + "step": 72820 + }, + { + "epoch": 10.846738159070599, + "grad_norm": 0.12890625, + "learning_rate": 0.015401580504790402, + "loss": 0.7861, + "num_input_tokens_seen": 42278160, + "step": 72825 + }, + { + "epoch": 10.847482871611557, + "grad_norm": 0.039306640625, + "learning_rate": 0.015399631547354735, + "loss": 0.7932, + "num_input_tokens_seen": 42280880, + "step": 72830 + }, + { + "epoch": 10.848227584152518, + "grad_norm": 0.0400390625, + "learning_rate": 0.01539768258316769, + "loss": 0.7671, + "num_input_tokens_seen": 42283728, + "step": 72835 + }, + { + "epoch": 10.848972296693477, + "grad_norm": 0.03564453125, + "learning_rate": 0.015395733612262208, + "loss": 0.7929, + "num_input_tokens_seen": 42286416, + "step": 72840 + }, + { + "epoch": 10.849717009234435, + "grad_norm": 0.025390625, + "learning_rate": 0.015393784634671209, + "loss": 0.7882, + "num_input_tokens_seen": 42289264, + "step": 72845 + }, + { + "epoch": 10.850461721775394, + "grad_norm": 0.030517578125, + "learning_rate": 0.015391835650427614, + "loss": 0.8079, + "num_input_tokens_seen": 42291824, + "step": 72850 + }, + { + "epoch": 10.851206434316353, + "grad_norm": 0.035888671875, + "learning_rate": 0.01538988665956436, + "loss": 0.7996, + "num_input_tokens_seen": 42294448, + "step": 72855 + }, + { + "epoch": 10.851951146857314, + "grad_norm": 0.0439453125, + "learning_rate": 0.015387937662114362, + "loss": 0.8209, + "num_input_tokens_seen": 42297424, + "step": 72860 + }, + { + "epoch": 10.852695859398272, + "grad_norm": 0.0478515625, + "learning_rate": 0.015385988658110552, + "loss": 0.7805, + "num_input_tokens_seen": 42300272, + "step": 72865 + }, + { + "epoch": 10.853440571939231, + "grad_norm": 0.055419921875, + "learning_rate": 0.015384039647585862, + "loss": 0.8003, + "num_input_tokens_seen": 42303120, + "step": 72870 + }, + { + "epoch": 10.85418528448019, + "grad_norm": 0.057373046875, + "learning_rate": 0.015382090630573208, + "loss": 0.8313, + "num_input_tokens_seen": 42306128, + "step": 72875 + }, + { + "epoch": 10.85492999702115, + "grad_norm": 0.045166015625, + "learning_rate": 0.015380141607105522, + "loss": 0.7767, + "num_input_tokens_seen": 42309104, + "step": 72880 + }, + { + "epoch": 10.85567470956211, + "grad_norm": 0.04052734375, + "learning_rate": 0.015378192577215729, + "loss": 0.8143, + "num_input_tokens_seen": 42312336, + "step": 72885 + }, + { + "epoch": 10.856419422103068, + "grad_norm": 0.05517578125, + "learning_rate": 0.015376243540936755, + "loss": 0.8151, + "num_input_tokens_seen": 42315312, + "step": 72890 + }, + { + "epoch": 10.857164134644027, + "grad_norm": 0.059814453125, + "learning_rate": 0.01537429449830153, + "loss": 0.7853, + "num_input_tokens_seen": 42318512, + "step": 72895 + }, + { + "epoch": 10.857908847184987, + "grad_norm": 0.049072265625, + "learning_rate": 0.01537234544934298, + "loss": 0.8159, + "num_input_tokens_seen": 42321584, + "step": 72900 + }, + { + "epoch": 10.858653559725946, + "grad_norm": 0.0556640625, + "learning_rate": 0.015370396394094031, + "loss": 0.824, + "num_input_tokens_seen": 42324400, + "step": 72905 + }, + { + "epoch": 10.859398272266905, + "grad_norm": 0.0703125, + "learning_rate": 0.015368447332587614, + "loss": 0.8051, + "num_input_tokens_seen": 42327312, + "step": 72910 + }, + { + "epoch": 10.860142984807863, + "grad_norm": 0.044189453125, + "learning_rate": 0.015366498264856657, + "loss": 0.8161, + "num_input_tokens_seen": 42330384, + "step": 72915 + }, + { + "epoch": 10.860887697348824, + "grad_norm": 0.06640625, + "learning_rate": 0.01536454919093408, + "loss": 0.8121, + "num_input_tokens_seen": 42333328, + "step": 72920 + }, + { + "epoch": 10.861632409889783, + "grad_norm": 0.064453125, + "learning_rate": 0.015362600110852814, + "loss": 0.7974, + "num_input_tokens_seen": 42336400, + "step": 72925 + }, + { + "epoch": 10.862377122430741, + "grad_norm": 0.06494140625, + "learning_rate": 0.015360651024645785, + "loss": 0.8159, + "num_input_tokens_seen": 42339312, + "step": 72930 + }, + { + "epoch": 10.8631218349717, + "grad_norm": 0.052734375, + "learning_rate": 0.015358701932345929, + "loss": 0.8163, + "num_input_tokens_seen": 42342032, + "step": 72935 + }, + { + "epoch": 10.86386654751266, + "grad_norm": 0.04931640625, + "learning_rate": 0.015356752833986164, + "loss": 0.8065, + "num_input_tokens_seen": 42345072, + "step": 72940 + }, + { + "epoch": 10.86461126005362, + "grad_norm": 0.035400390625, + "learning_rate": 0.015354803729599428, + "loss": 0.7869, + "num_input_tokens_seen": 42347760, + "step": 72945 + }, + { + "epoch": 10.865355972594578, + "grad_norm": 0.0556640625, + "learning_rate": 0.015352854619218644, + "loss": 0.8036, + "num_input_tokens_seen": 42350608, + "step": 72950 + }, + { + "epoch": 10.866100685135537, + "grad_norm": 0.0947265625, + "learning_rate": 0.015350905502876735, + "loss": 0.8089, + "num_input_tokens_seen": 42353968, + "step": 72955 + }, + { + "epoch": 10.866845397676498, + "grad_norm": 0.041015625, + "learning_rate": 0.015348956380606633, + "loss": 0.8043, + "num_input_tokens_seen": 42356720, + "step": 72960 + }, + { + "epoch": 10.867590110217456, + "grad_norm": 0.06640625, + "learning_rate": 0.015347007252441269, + "loss": 0.788, + "num_input_tokens_seen": 42359440, + "step": 72965 + }, + { + "epoch": 10.868334822758415, + "grad_norm": 0.036376953125, + "learning_rate": 0.01534505811841357, + "loss": 0.8066, + "num_input_tokens_seen": 42362224, + "step": 72970 + }, + { + "epoch": 10.869079535299374, + "grad_norm": 0.040283203125, + "learning_rate": 0.015343108978556468, + "loss": 0.812, + "num_input_tokens_seen": 42365040, + "step": 72975 + }, + { + "epoch": 10.869824247840334, + "grad_norm": 0.040771484375, + "learning_rate": 0.015341159832902887, + "loss": 0.7929, + "num_input_tokens_seen": 42367952, + "step": 72980 + }, + { + "epoch": 10.870568960381293, + "grad_norm": 0.039794921875, + "learning_rate": 0.015339210681485756, + "loss": 0.7952, + "num_input_tokens_seen": 42370960, + "step": 72985 + }, + { + "epoch": 10.871313672922252, + "grad_norm": 0.047607421875, + "learning_rate": 0.015337261524338012, + "loss": 0.7949, + "num_input_tokens_seen": 42373968, + "step": 72990 + }, + { + "epoch": 10.87205838546321, + "grad_norm": 0.04150390625, + "learning_rate": 0.015335312361492567, + "loss": 0.7985, + "num_input_tokens_seen": 42376752, + "step": 72995 + }, + { + "epoch": 10.872803098004171, + "grad_norm": 0.0380859375, + "learning_rate": 0.015333363192982365, + "loss": 0.7752, + "num_input_tokens_seen": 42379280, + "step": 73000 + }, + { + "epoch": 10.87354781054513, + "grad_norm": 0.035888671875, + "learning_rate": 0.01533141401884033, + "loss": 0.7752, + "num_input_tokens_seen": 42381936, + "step": 73005 + }, + { + "epoch": 10.874292523086089, + "grad_norm": 0.039794921875, + "learning_rate": 0.015329464839099395, + "loss": 0.8194, + "num_input_tokens_seen": 42384624, + "step": 73010 + }, + { + "epoch": 10.875037235627047, + "grad_norm": 0.11669921875, + "learning_rate": 0.015327515653792485, + "loss": 0.8481, + "num_input_tokens_seen": 42387376, + "step": 73015 + }, + { + "epoch": 10.875781948168008, + "grad_norm": 0.03564453125, + "learning_rate": 0.01532556646295253, + "loss": 0.7949, + "num_input_tokens_seen": 42389904, + "step": 73020 + }, + { + "epoch": 10.876526660708967, + "grad_norm": 0.042724609375, + "learning_rate": 0.015323617266612457, + "loss": 0.7943, + "num_input_tokens_seen": 42392912, + "step": 73025 + }, + { + "epoch": 10.877271373249926, + "grad_norm": 0.045166015625, + "learning_rate": 0.015321668064805206, + "loss": 0.7837, + "num_input_tokens_seen": 42395856, + "step": 73030 + }, + { + "epoch": 10.878016085790884, + "grad_norm": 0.03759765625, + "learning_rate": 0.015319718857563697, + "loss": 0.7962, + "num_input_tokens_seen": 42398608, + "step": 73035 + }, + { + "epoch": 10.878760798331843, + "grad_norm": 0.034912109375, + "learning_rate": 0.01531776964492086, + "loss": 0.8127, + "num_input_tokens_seen": 42401328, + "step": 73040 + }, + { + "epoch": 10.879505510872804, + "grad_norm": 0.047119140625, + "learning_rate": 0.015315820426909633, + "loss": 0.7839, + "num_input_tokens_seen": 42404432, + "step": 73045 + }, + { + "epoch": 10.880250223413762, + "grad_norm": 0.0311279296875, + "learning_rate": 0.015313871203562939, + "loss": 0.8196, + "num_input_tokens_seen": 42407280, + "step": 73050 + }, + { + "epoch": 10.880994935954721, + "grad_norm": 0.029296875, + "learning_rate": 0.015311921974913706, + "loss": 0.8066, + "num_input_tokens_seen": 42410160, + "step": 73055 + }, + { + "epoch": 10.88173964849568, + "grad_norm": 0.03173828125, + "learning_rate": 0.015309972740994873, + "loss": 0.7919, + "num_input_tokens_seen": 42413232, + "step": 73060 + }, + { + "epoch": 10.88248436103664, + "grad_norm": 0.043701171875, + "learning_rate": 0.015308023501839364, + "loss": 0.7927, + "num_input_tokens_seen": 42416080, + "step": 73065 + }, + { + "epoch": 10.8832290735776, + "grad_norm": 0.03173828125, + "learning_rate": 0.015306074257480111, + "loss": 0.8089, + "num_input_tokens_seen": 42418928, + "step": 73070 + }, + { + "epoch": 10.883973786118558, + "grad_norm": 0.058349609375, + "learning_rate": 0.015304125007950043, + "loss": 0.8047, + "num_input_tokens_seen": 42421776, + "step": 73075 + }, + { + "epoch": 10.884718498659517, + "grad_norm": 0.03466796875, + "learning_rate": 0.015302175753282098, + "loss": 0.7963, + "num_input_tokens_seen": 42424624, + "step": 73080 + }, + { + "epoch": 10.885463211200477, + "grad_norm": 0.05859375, + "learning_rate": 0.015300226493509194, + "loss": 0.8157, + "num_input_tokens_seen": 42427920, + "step": 73085 + }, + { + "epoch": 10.886207923741436, + "grad_norm": 0.042236328125, + "learning_rate": 0.015298277228664272, + "loss": 0.8052, + "num_input_tokens_seen": 42430768, + "step": 73090 + }, + { + "epoch": 10.886952636282395, + "grad_norm": 0.05029296875, + "learning_rate": 0.015296327958780257, + "loss": 0.7642, + "num_input_tokens_seen": 42433616, + "step": 73095 + }, + { + "epoch": 10.887697348823353, + "grad_norm": 0.05615234375, + "learning_rate": 0.015294378683890083, + "loss": 0.8061, + "num_input_tokens_seen": 42436560, + "step": 73100 + }, + { + "epoch": 10.888442061364314, + "grad_norm": 0.042724609375, + "learning_rate": 0.01529242940402668, + "loss": 0.7836, + "num_input_tokens_seen": 42439216, + "step": 73105 + }, + { + "epoch": 10.889186773905273, + "grad_norm": 0.0908203125, + "learning_rate": 0.015290480119222983, + "loss": 0.8138, + "num_input_tokens_seen": 42442704, + "step": 73110 + }, + { + "epoch": 10.889931486446232, + "grad_norm": 0.042236328125, + "learning_rate": 0.015288530829511917, + "loss": 0.8237, + "num_input_tokens_seen": 42445680, + "step": 73115 + }, + { + "epoch": 10.89067619898719, + "grad_norm": 0.05615234375, + "learning_rate": 0.015286581534926418, + "loss": 0.8157, + "num_input_tokens_seen": 42448432, + "step": 73120 + }, + { + "epoch": 10.89142091152815, + "grad_norm": 0.04931640625, + "learning_rate": 0.015284632235499411, + "loss": 0.8175, + "num_input_tokens_seen": 42451248, + "step": 73125 + }, + { + "epoch": 10.89216562406911, + "grad_norm": 0.04541015625, + "learning_rate": 0.015282682931263836, + "loss": 0.7931, + "num_input_tokens_seen": 42454000, + "step": 73130 + }, + { + "epoch": 10.892910336610068, + "grad_norm": 0.02734375, + "learning_rate": 0.015280733622252615, + "loss": 0.7957, + "num_input_tokens_seen": 42456816, + "step": 73135 + }, + { + "epoch": 10.893655049151027, + "grad_norm": 0.041015625, + "learning_rate": 0.015278784308498685, + "loss": 0.8215, + "num_input_tokens_seen": 42459536, + "step": 73140 + }, + { + "epoch": 10.894399761691988, + "grad_norm": 0.039306640625, + "learning_rate": 0.015276834990034984, + "loss": 0.8125, + "num_input_tokens_seen": 42462416, + "step": 73145 + }, + { + "epoch": 10.895144474232946, + "grad_norm": 0.038818359375, + "learning_rate": 0.015274885666894436, + "loss": 0.7896, + "num_input_tokens_seen": 42465328, + "step": 73150 + }, + { + "epoch": 10.895889186773905, + "grad_norm": 0.0458984375, + "learning_rate": 0.01527293633910997, + "loss": 0.7995, + "num_input_tokens_seen": 42468080, + "step": 73155 + }, + { + "epoch": 10.896633899314864, + "grad_norm": 0.0281982421875, + "learning_rate": 0.015270987006714523, + "loss": 0.7859, + "num_input_tokens_seen": 42470864, + "step": 73160 + }, + { + "epoch": 10.897378611855824, + "grad_norm": 0.034423828125, + "learning_rate": 0.015269037669741026, + "loss": 0.8192, + "num_input_tokens_seen": 42473968, + "step": 73165 + }, + { + "epoch": 10.898123324396783, + "grad_norm": 0.0712890625, + "learning_rate": 0.01526708832822241, + "loss": 0.833, + "num_input_tokens_seen": 42476656, + "step": 73170 + }, + { + "epoch": 10.898868036937742, + "grad_norm": 0.054931640625, + "learning_rate": 0.015265138982191612, + "loss": 0.8322, + "num_input_tokens_seen": 42479472, + "step": 73175 + }, + { + "epoch": 10.8996127494787, + "grad_norm": 0.046630859375, + "learning_rate": 0.015263189631681558, + "loss": 0.8136, + "num_input_tokens_seen": 42482096, + "step": 73180 + }, + { + "epoch": 10.90035746201966, + "grad_norm": 0.045654296875, + "learning_rate": 0.01526124027672518, + "loss": 0.8084, + "num_input_tokens_seen": 42485104, + "step": 73185 + }, + { + "epoch": 10.90110217456062, + "grad_norm": 0.03125, + "learning_rate": 0.015259290917355419, + "loss": 0.8272, + "num_input_tokens_seen": 42488240, + "step": 73190 + }, + { + "epoch": 10.901846887101579, + "grad_norm": 0.043701171875, + "learning_rate": 0.015257341553605197, + "loss": 0.8198, + "num_input_tokens_seen": 42491312, + "step": 73195 + }, + { + "epoch": 10.902591599642538, + "grad_norm": 0.047607421875, + "learning_rate": 0.015255392185507451, + "loss": 0.8192, + "num_input_tokens_seen": 42494288, + "step": 73200 + }, + { + "epoch": 10.903336312183498, + "grad_norm": 0.032470703125, + "learning_rate": 0.015253442813095113, + "loss": 0.8, + "num_input_tokens_seen": 42497040, + "step": 73205 + }, + { + "epoch": 10.904081024724457, + "grad_norm": 0.055908203125, + "learning_rate": 0.015251493436401121, + "loss": 0.8078, + "num_input_tokens_seen": 42499952, + "step": 73210 + }, + { + "epoch": 10.904825737265416, + "grad_norm": 0.04541015625, + "learning_rate": 0.0152495440554584, + "loss": 0.8128, + "num_input_tokens_seen": 42502640, + "step": 73215 + }, + { + "epoch": 10.905570449806374, + "grad_norm": 0.044677734375, + "learning_rate": 0.015247594670299885, + "loss": 0.7962, + "num_input_tokens_seen": 42505584, + "step": 73220 + }, + { + "epoch": 10.906315162347333, + "grad_norm": 0.06396484375, + "learning_rate": 0.015245645280958509, + "loss": 0.8111, + "num_input_tokens_seen": 42508432, + "step": 73225 + }, + { + "epoch": 10.907059874888294, + "grad_norm": 0.03369140625, + "learning_rate": 0.015243695887467209, + "loss": 0.7973, + "num_input_tokens_seen": 42511440, + "step": 73230 + }, + { + "epoch": 10.907804587429252, + "grad_norm": 0.046630859375, + "learning_rate": 0.015241746489858913, + "loss": 0.7886, + "num_input_tokens_seen": 42514192, + "step": 73235 + }, + { + "epoch": 10.908549299970211, + "grad_norm": 0.03466796875, + "learning_rate": 0.015239797088166553, + "loss": 0.7964, + "num_input_tokens_seen": 42516848, + "step": 73240 + }, + { + "epoch": 10.90929401251117, + "grad_norm": 0.046875, + "learning_rate": 0.015237847682423072, + "loss": 0.798, + "num_input_tokens_seen": 42519920, + "step": 73245 + }, + { + "epoch": 10.91003872505213, + "grad_norm": 0.042236328125, + "learning_rate": 0.015235898272661391, + "loss": 0.7804, + "num_input_tokens_seen": 42522576, + "step": 73250 + }, + { + "epoch": 10.91078343759309, + "grad_norm": 0.042724609375, + "learning_rate": 0.015233948858914447, + "loss": 0.7978, + "num_input_tokens_seen": 42525424, + "step": 73255 + }, + { + "epoch": 10.911528150134048, + "grad_norm": 0.03271484375, + "learning_rate": 0.015231999441215175, + "loss": 0.8138, + "num_input_tokens_seen": 42528240, + "step": 73260 + }, + { + "epoch": 10.912272862675007, + "grad_norm": 0.031494140625, + "learning_rate": 0.015230050019596512, + "loss": 0.8171, + "num_input_tokens_seen": 42531152, + "step": 73265 + }, + { + "epoch": 10.913017575215967, + "grad_norm": 0.0299072265625, + "learning_rate": 0.015228100594091385, + "loss": 0.7935, + "num_input_tokens_seen": 42533808, + "step": 73270 + }, + { + "epoch": 10.913762287756926, + "grad_norm": 0.0341796875, + "learning_rate": 0.015226151164732733, + "loss": 0.7903, + "num_input_tokens_seen": 42536624, + "step": 73275 + }, + { + "epoch": 10.914507000297885, + "grad_norm": 0.0400390625, + "learning_rate": 0.015224201731553486, + "loss": 0.802, + "num_input_tokens_seen": 42539472, + "step": 73280 + }, + { + "epoch": 10.915251712838844, + "grad_norm": 0.03857421875, + "learning_rate": 0.015222252294586578, + "loss": 0.8132, + "num_input_tokens_seen": 42542320, + "step": 73285 + }, + { + "epoch": 10.915996425379804, + "grad_norm": 0.041259765625, + "learning_rate": 0.015220302853864942, + "loss": 0.7851, + "num_input_tokens_seen": 42545040, + "step": 73290 + }, + { + "epoch": 10.916741137920763, + "grad_norm": 0.043212890625, + "learning_rate": 0.015218353409421513, + "loss": 0.8012, + "num_input_tokens_seen": 42548016, + "step": 73295 + }, + { + "epoch": 10.917485850461722, + "grad_norm": 0.05615234375, + "learning_rate": 0.015216403961289225, + "loss": 0.8031, + "num_input_tokens_seen": 42550736, + "step": 73300 + }, + { + "epoch": 10.91823056300268, + "grad_norm": 0.038330078125, + "learning_rate": 0.015214454509501011, + "loss": 0.8067, + "num_input_tokens_seen": 42553520, + "step": 73305 + }, + { + "epoch": 10.918975275543641, + "grad_norm": 0.034423828125, + "learning_rate": 0.015212505054089812, + "loss": 0.8039, + "num_input_tokens_seen": 42556528, + "step": 73310 + }, + { + "epoch": 10.9197199880846, + "grad_norm": 0.0380859375, + "learning_rate": 0.015210555595088555, + "loss": 0.7875, + "num_input_tokens_seen": 42559440, + "step": 73315 + }, + { + "epoch": 10.920464700625558, + "grad_norm": 0.04150390625, + "learning_rate": 0.01520860613253017, + "loss": 0.8, + "num_input_tokens_seen": 42562320, + "step": 73320 + }, + { + "epoch": 10.921209413166517, + "grad_norm": 0.0308837890625, + "learning_rate": 0.015206656666447601, + "loss": 0.8008, + "num_input_tokens_seen": 42564976, + "step": 73325 + }, + { + "epoch": 10.921954125707478, + "grad_norm": 0.032958984375, + "learning_rate": 0.015204707196873776, + "loss": 0.8047, + "num_input_tokens_seen": 42568112, + "step": 73330 + }, + { + "epoch": 10.922698838248436, + "grad_norm": 0.035888671875, + "learning_rate": 0.015202757723841628, + "loss": 0.8184, + "num_input_tokens_seen": 42571088, + "step": 73335 + }, + { + "epoch": 10.923443550789395, + "grad_norm": 0.04296875, + "learning_rate": 0.015200808247384099, + "loss": 0.7911, + "num_input_tokens_seen": 42573904, + "step": 73340 + }, + { + "epoch": 10.924188263330354, + "grad_norm": 0.037109375, + "learning_rate": 0.015198858767534115, + "loss": 0.8139, + "num_input_tokens_seen": 42577360, + "step": 73345 + }, + { + "epoch": 10.924932975871315, + "grad_norm": 0.033447265625, + "learning_rate": 0.015196909284324616, + "loss": 0.8185, + "num_input_tokens_seen": 42580464, + "step": 73350 + }, + { + "epoch": 10.925677688412273, + "grad_norm": 0.043212890625, + "learning_rate": 0.015194959797788538, + "loss": 0.7962, + "num_input_tokens_seen": 42583376, + "step": 73355 + }, + { + "epoch": 10.926422400953232, + "grad_norm": 0.0306396484375, + "learning_rate": 0.015193010307958808, + "loss": 0.8055, + "num_input_tokens_seen": 42586672, + "step": 73360 + }, + { + "epoch": 10.92716711349419, + "grad_norm": 0.031982421875, + "learning_rate": 0.015191060814868367, + "loss": 0.7775, + "num_input_tokens_seen": 42589648, + "step": 73365 + }, + { + "epoch": 10.92791182603515, + "grad_norm": 0.037841796875, + "learning_rate": 0.015189111318550143, + "loss": 0.8, + "num_input_tokens_seen": 42592496, + "step": 73370 + }, + { + "epoch": 10.92865653857611, + "grad_norm": 0.03173828125, + "learning_rate": 0.015187161819037082, + "loss": 0.7736, + "num_input_tokens_seen": 42595504, + "step": 73375 + }, + { + "epoch": 10.929401251117069, + "grad_norm": 0.05029296875, + "learning_rate": 0.01518521231636211, + "loss": 0.8011, + "num_input_tokens_seen": 42598640, + "step": 73380 + }, + { + "epoch": 10.930145963658028, + "grad_norm": 0.02099609375, + "learning_rate": 0.015183262810558162, + "loss": 0.7846, + "num_input_tokens_seen": 42601424, + "step": 73385 + }, + { + "epoch": 10.930890676198988, + "grad_norm": 0.028076171875, + "learning_rate": 0.015181313301658176, + "loss": 0.788, + "num_input_tokens_seen": 42603952, + "step": 73390 + }, + { + "epoch": 10.931635388739947, + "grad_norm": 0.03564453125, + "learning_rate": 0.015179363789695088, + "loss": 0.8111, + "num_input_tokens_seen": 42606672, + "step": 73395 + }, + { + "epoch": 10.932380101280906, + "grad_norm": 0.035888671875, + "learning_rate": 0.01517741427470183, + "loss": 0.7905, + "num_input_tokens_seen": 42609968, + "step": 73400 + }, + { + "epoch": 10.933124813821864, + "grad_norm": 0.0294189453125, + "learning_rate": 0.015175464756711336, + "loss": 0.81, + "num_input_tokens_seen": 42612752, + "step": 73405 + }, + { + "epoch": 10.933869526362823, + "grad_norm": 0.02734375, + "learning_rate": 0.01517351523575655, + "loss": 0.8157, + "num_input_tokens_seen": 42615536, + "step": 73410 + }, + { + "epoch": 10.934614238903784, + "grad_norm": 0.0291748046875, + "learning_rate": 0.015171565711870396, + "loss": 0.7898, + "num_input_tokens_seen": 42618224, + "step": 73415 + }, + { + "epoch": 10.935358951444742, + "grad_norm": 0.0458984375, + "learning_rate": 0.01516961618508581, + "loss": 0.8133, + "num_input_tokens_seen": 42621168, + "step": 73420 + }, + { + "epoch": 10.936103663985701, + "grad_norm": 0.034912109375, + "learning_rate": 0.015167666655435737, + "loss": 0.8069, + "num_input_tokens_seen": 42623856, + "step": 73425 + }, + { + "epoch": 10.93684837652666, + "grad_norm": 0.0361328125, + "learning_rate": 0.015165717122953101, + "loss": 0.7993, + "num_input_tokens_seen": 42626576, + "step": 73430 + }, + { + "epoch": 10.93759308906762, + "grad_norm": 0.0291748046875, + "learning_rate": 0.015163767587670846, + "loss": 0.8169, + "num_input_tokens_seen": 42629680, + "step": 73435 + }, + { + "epoch": 10.93833780160858, + "grad_norm": 0.0238037109375, + "learning_rate": 0.015161818049621904, + "loss": 0.8045, + "num_input_tokens_seen": 42632400, + "step": 73440 + }, + { + "epoch": 10.939082514149538, + "grad_norm": 0.037353515625, + "learning_rate": 0.015159868508839212, + "loss": 0.8207, + "num_input_tokens_seen": 42635184, + "step": 73445 + }, + { + "epoch": 10.939827226690497, + "grad_norm": 0.040771484375, + "learning_rate": 0.015157918965355701, + "loss": 0.7952, + "num_input_tokens_seen": 42638256, + "step": 73450 + }, + { + "epoch": 10.940571939231457, + "grad_norm": 0.0322265625, + "learning_rate": 0.01515596941920431, + "loss": 0.7875, + "num_input_tokens_seen": 42640976, + "step": 73455 + }, + { + "epoch": 10.941316651772416, + "grad_norm": 0.035400390625, + "learning_rate": 0.015154019870417973, + "loss": 0.8008, + "num_input_tokens_seen": 42643792, + "step": 73460 + }, + { + "epoch": 10.942061364313375, + "grad_norm": 0.03271484375, + "learning_rate": 0.01515207031902963, + "loss": 0.8108, + "num_input_tokens_seen": 42646768, + "step": 73465 + }, + { + "epoch": 10.942806076854334, + "grad_norm": 0.044677734375, + "learning_rate": 0.015150120765072208, + "loss": 0.8, + "num_input_tokens_seen": 42649744, + "step": 73470 + }, + { + "epoch": 10.943550789395294, + "grad_norm": 0.041259765625, + "learning_rate": 0.015148171208578655, + "loss": 0.7994, + "num_input_tokens_seen": 42652336, + "step": 73475 + }, + { + "epoch": 10.944295501936253, + "grad_norm": 0.044189453125, + "learning_rate": 0.015146221649581902, + "loss": 0.817, + "num_input_tokens_seen": 42655280, + "step": 73480 + }, + { + "epoch": 10.945040214477212, + "grad_norm": 0.05224609375, + "learning_rate": 0.015144272088114875, + "loss": 0.7754, + "num_input_tokens_seen": 42657840, + "step": 73485 + }, + { + "epoch": 10.94578492701817, + "grad_norm": 0.046875, + "learning_rate": 0.015142322524210518, + "loss": 0.7998, + "num_input_tokens_seen": 42660912, + "step": 73490 + }, + { + "epoch": 10.946529639559131, + "grad_norm": 0.0277099609375, + "learning_rate": 0.015140372957901773, + "loss": 0.8037, + "num_input_tokens_seen": 42664112, + "step": 73495 + }, + { + "epoch": 10.94727435210009, + "grad_norm": 0.04638671875, + "learning_rate": 0.015138423389221563, + "loss": 0.8113, + "num_input_tokens_seen": 42666832, + "step": 73500 + }, + { + "epoch": 10.948019064641048, + "grad_norm": 0.031982421875, + "learning_rate": 0.015136473818202838, + "loss": 0.7763, + "num_input_tokens_seen": 42669712, + "step": 73505 + }, + { + "epoch": 10.948763777182007, + "grad_norm": 0.0400390625, + "learning_rate": 0.01513452424487852, + "loss": 0.8044, + "num_input_tokens_seen": 42672656, + "step": 73510 + }, + { + "epoch": 10.949508489722968, + "grad_norm": 0.033203125, + "learning_rate": 0.015132574669281557, + "loss": 0.778, + "num_input_tokens_seen": 42675408, + "step": 73515 + }, + { + "epoch": 10.950253202263927, + "grad_norm": 0.046142578125, + "learning_rate": 0.015130625091444874, + "loss": 0.8068, + "num_input_tokens_seen": 42678512, + "step": 73520 + }, + { + "epoch": 10.950997914804885, + "grad_norm": 0.03076171875, + "learning_rate": 0.015128675511401414, + "loss": 0.8165, + "num_input_tokens_seen": 42681200, + "step": 73525 + }, + { + "epoch": 10.951742627345844, + "grad_norm": 0.031005859375, + "learning_rate": 0.015126725929184114, + "loss": 0.7782, + "num_input_tokens_seen": 42684880, + "step": 73530 + }, + { + "epoch": 10.952487339886805, + "grad_norm": 0.03271484375, + "learning_rate": 0.015124776344825905, + "loss": 0.7979, + "num_input_tokens_seen": 42687888, + "step": 73535 + }, + { + "epoch": 10.953232052427763, + "grad_norm": 0.04345703125, + "learning_rate": 0.015122826758359733, + "loss": 0.8233, + "num_input_tokens_seen": 42690768, + "step": 73540 + }, + { + "epoch": 10.953976764968722, + "grad_norm": 0.033203125, + "learning_rate": 0.01512087716981852, + "loss": 0.784, + "num_input_tokens_seen": 42693616, + "step": 73545 + }, + { + "epoch": 10.95472147750968, + "grad_norm": 0.032958984375, + "learning_rate": 0.015118927579235214, + "loss": 0.8313, + "num_input_tokens_seen": 42696592, + "step": 73550 + }, + { + "epoch": 10.95546619005064, + "grad_norm": 0.045654296875, + "learning_rate": 0.01511697798664275, + "loss": 0.8025, + "num_input_tokens_seen": 42699472, + "step": 73555 + }, + { + "epoch": 10.9562109025916, + "grad_norm": 0.038330078125, + "learning_rate": 0.015115028392074058, + "loss": 0.7834, + "num_input_tokens_seen": 42702416, + "step": 73560 + }, + { + "epoch": 10.956955615132559, + "grad_norm": 0.0289306640625, + "learning_rate": 0.015113078795562078, + "loss": 0.8121, + "num_input_tokens_seen": 42705520, + "step": 73565 + }, + { + "epoch": 10.957700327673518, + "grad_norm": 0.030517578125, + "learning_rate": 0.015111129197139746, + "loss": 0.79, + "num_input_tokens_seen": 42708368, + "step": 73570 + }, + { + "epoch": 10.958445040214476, + "grad_norm": 0.036376953125, + "learning_rate": 0.015109179596840003, + "loss": 0.8115, + "num_input_tokens_seen": 42711248, + "step": 73575 + }, + { + "epoch": 10.959189752755437, + "grad_norm": 0.0283203125, + "learning_rate": 0.015107229994695779, + "loss": 0.7891, + "num_input_tokens_seen": 42714256, + "step": 73580 + }, + { + "epoch": 10.959934465296396, + "grad_norm": 0.08642578125, + "learning_rate": 0.015105280390740014, + "loss": 0.8309, + "num_input_tokens_seen": 42717328, + "step": 73585 + }, + { + "epoch": 10.960679177837354, + "grad_norm": 0.0260009765625, + "learning_rate": 0.015103330785005645, + "loss": 0.8154, + "num_input_tokens_seen": 42720528, + "step": 73590 + }, + { + "epoch": 10.961423890378313, + "grad_norm": 0.05078125, + "learning_rate": 0.01510138117752561, + "loss": 0.8072, + "num_input_tokens_seen": 42723472, + "step": 73595 + }, + { + "epoch": 10.962168602919274, + "grad_norm": 0.034423828125, + "learning_rate": 0.015099431568332836, + "loss": 0.7858, + "num_input_tokens_seen": 42726576, + "step": 73600 + }, + { + "epoch": 10.962913315460233, + "grad_norm": 0.033935546875, + "learning_rate": 0.01509748195746027, + "loss": 0.7912, + "num_input_tokens_seen": 42729872, + "step": 73605 + }, + { + "epoch": 10.963658028001191, + "grad_norm": 0.06591796875, + "learning_rate": 0.015095532344940849, + "loss": 0.8107, + "num_input_tokens_seen": 42732720, + "step": 73610 + }, + { + "epoch": 10.96440274054215, + "grad_norm": 0.041015625, + "learning_rate": 0.0150935827308075, + "loss": 0.7913, + "num_input_tokens_seen": 42735600, + "step": 73615 + }, + { + "epoch": 10.96514745308311, + "grad_norm": 0.0263671875, + "learning_rate": 0.015091633115093168, + "loss": 0.8272, + "num_input_tokens_seen": 42738576, + "step": 73620 + }, + { + "epoch": 10.96589216562407, + "grad_norm": 0.041748046875, + "learning_rate": 0.015089683497830787, + "loss": 0.8207, + "num_input_tokens_seen": 42741328, + "step": 73625 + }, + { + "epoch": 10.966636878165028, + "grad_norm": 0.03466796875, + "learning_rate": 0.015087733879053294, + "loss": 0.8036, + "num_input_tokens_seen": 42744336, + "step": 73630 + }, + { + "epoch": 10.967381590705987, + "grad_norm": 0.0286865234375, + "learning_rate": 0.01508578425879363, + "loss": 0.7944, + "num_input_tokens_seen": 42747280, + "step": 73635 + }, + { + "epoch": 10.968126303246947, + "grad_norm": 0.0283203125, + "learning_rate": 0.015083834637084723, + "loss": 0.8086, + "num_input_tokens_seen": 42750512, + "step": 73640 + }, + { + "epoch": 10.968871015787906, + "grad_norm": 0.039306640625, + "learning_rate": 0.01508188501395952, + "loss": 0.7969, + "num_input_tokens_seen": 42753296, + "step": 73645 + }, + { + "epoch": 10.969615728328865, + "grad_norm": 0.06640625, + "learning_rate": 0.01507993538945095, + "loss": 0.8155, + "num_input_tokens_seen": 42756016, + "step": 73650 + }, + { + "epoch": 10.970360440869824, + "grad_norm": 0.028076171875, + "learning_rate": 0.015077985763591956, + "loss": 0.7943, + "num_input_tokens_seen": 42759088, + "step": 73655 + }, + { + "epoch": 10.971105153410784, + "grad_norm": 0.035400390625, + "learning_rate": 0.015076036136415468, + "loss": 0.8036, + "num_input_tokens_seen": 42761968, + "step": 73660 + }, + { + "epoch": 10.971849865951743, + "grad_norm": 0.0272216796875, + "learning_rate": 0.015074086507954424, + "loss": 0.778, + "num_input_tokens_seen": 42764624, + "step": 73665 + }, + { + "epoch": 10.972594578492702, + "grad_norm": 0.031494140625, + "learning_rate": 0.015072136878241773, + "loss": 0.7996, + "num_input_tokens_seen": 42767536, + "step": 73670 + }, + { + "epoch": 10.97333929103366, + "grad_norm": 0.03466796875, + "learning_rate": 0.015070187247310435, + "loss": 0.7938, + "num_input_tokens_seen": 42770480, + "step": 73675 + }, + { + "epoch": 10.974084003574621, + "grad_norm": 0.03076171875, + "learning_rate": 0.01506823761519336, + "loss": 0.7996, + "num_input_tokens_seen": 42773360, + "step": 73680 + }, + { + "epoch": 10.97482871611558, + "grad_norm": 0.0235595703125, + "learning_rate": 0.015066287981923478, + "loss": 0.7893, + "num_input_tokens_seen": 42776112, + "step": 73685 + }, + { + "epoch": 10.975573428656539, + "grad_norm": 0.034423828125, + "learning_rate": 0.015064338347533726, + "loss": 0.8234, + "num_input_tokens_seen": 42779216, + "step": 73690 + }, + { + "epoch": 10.976318141197497, + "grad_norm": 0.044189453125, + "learning_rate": 0.015062388712057041, + "loss": 0.7991, + "num_input_tokens_seen": 42782288, + "step": 73695 + }, + { + "epoch": 10.977062853738456, + "grad_norm": 0.03076171875, + "learning_rate": 0.015060439075526367, + "loss": 0.7837, + "num_input_tokens_seen": 42785072, + "step": 73700 + }, + { + "epoch": 10.977807566279417, + "grad_norm": 0.042724609375, + "learning_rate": 0.015058489437974635, + "loss": 0.7828, + "num_input_tokens_seen": 42787888, + "step": 73705 + }, + { + "epoch": 10.978552278820375, + "grad_norm": 0.034912109375, + "learning_rate": 0.015056539799434785, + "loss": 0.8325, + "num_input_tokens_seen": 42791024, + "step": 73710 + }, + { + "epoch": 10.979296991361334, + "grad_norm": 0.035888671875, + "learning_rate": 0.01505459015993975, + "loss": 0.7618, + "num_input_tokens_seen": 42794032, + "step": 73715 + }, + { + "epoch": 10.980041703902295, + "grad_norm": 0.042724609375, + "learning_rate": 0.015052640519522471, + "loss": 0.7853, + "num_input_tokens_seen": 42796688, + "step": 73720 + }, + { + "epoch": 10.980786416443253, + "grad_norm": 0.06689453125, + "learning_rate": 0.01505069087821588, + "loss": 0.8105, + "num_input_tokens_seen": 42799760, + "step": 73725 + }, + { + "epoch": 10.981531128984212, + "grad_norm": 0.035400390625, + "learning_rate": 0.015048741236052921, + "loss": 0.7796, + "num_input_tokens_seen": 42802576, + "step": 73730 + }, + { + "epoch": 10.982275841525171, + "grad_norm": 0.044921875, + "learning_rate": 0.01504679159306653, + "loss": 0.7773, + "num_input_tokens_seen": 42805456, + "step": 73735 + }, + { + "epoch": 10.98302055406613, + "grad_norm": 0.044677734375, + "learning_rate": 0.015044841949289641, + "loss": 0.7715, + "num_input_tokens_seen": 42808336, + "step": 73740 + }, + { + "epoch": 10.98376526660709, + "grad_norm": 0.03076171875, + "learning_rate": 0.015042892304755192, + "loss": 0.7917, + "num_input_tokens_seen": 42811376, + "step": 73745 + }, + { + "epoch": 10.984509979148049, + "grad_norm": 0.037109375, + "learning_rate": 0.015040942659496122, + "loss": 0.762, + "num_input_tokens_seen": 42814352, + "step": 73750 + }, + { + "epoch": 10.985254691689008, + "grad_norm": 0.036376953125, + "learning_rate": 0.01503899301354537, + "loss": 0.764, + "num_input_tokens_seen": 42817168, + "step": 73755 + }, + { + "epoch": 10.985999404229966, + "grad_norm": 0.047607421875, + "learning_rate": 0.015037043366935867, + "loss": 0.8561, + "num_input_tokens_seen": 42819952, + "step": 73760 + }, + { + "epoch": 10.986744116770927, + "grad_norm": 0.048583984375, + "learning_rate": 0.015035093719700555, + "loss": 0.8002, + "num_input_tokens_seen": 42823056, + "step": 73765 + }, + { + "epoch": 10.987488829311886, + "grad_norm": 0.0419921875, + "learning_rate": 0.015033144071872371, + "loss": 0.7928, + "num_input_tokens_seen": 42826000, + "step": 73770 + }, + { + "epoch": 10.988233541852845, + "grad_norm": 0.0283203125, + "learning_rate": 0.015031194423484254, + "loss": 0.7373, + "num_input_tokens_seen": 42828912, + "step": 73775 + }, + { + "epoch": 10.988978254393803, + "grad_norm": 0.046630859375, + "learning_rate": 0.015029244774569133, + "loss": 0.7872, + "num_input_tokens_seen": 42832016, + "step": 73780 + }, + { + "epoch": 10.989722966934764, + "grad_norm": 0.02685546875, + "learning_rate": 0.015027295125159953, + "loss": 0.811, + "num_input_tokens_seen": 42834736, + "step": 73785 + }, + { + "epoch": 10.990467679475723, + "grad_norm": 0.043701171875, + "learning_rate": 0.015025345475289652, + "loss": 0.7884, + "num_input_tokens_seen": 42837776, + "step": 73790 + }, + { + "epoch": 10.991212392016681, + "grad_norm": 0.050048828125, + "learning_rate": 0.015023395824991168, + "loss": 0.8083, + "num_input_tokens_seen": 42840400, + "step": 73795 + }, + { + "epoch": 10.99195710455764, + "grad_norm": 0.047607421875, + "learning_rate": 0.015021446174297428, + "loss": 0.8184, + "num_input_tokens_seen": 42843408, + "step": 73800 + }, + { + "epoch": 10.9927018170986, + "grad_norm": 0.06494140625, + "learning_rate": 0.01501949652324138, + "loss": 0.7914, + "num_input_tokens_seen": 42846384, + "step": 73805 + }, + { + "epoch": 10.99344652963956, + "grad_norm": 0.038818359375, + "learning_rate": 0.01501754687185596, + "loss": 0.7948, + "num_input_tokens_seen": 42849264, + "step": 73810 + }, + { + "epoch": 10.994191242180518, + "grad_norm": 0.05908203125, + "learning_rate": 0.0150155972201741, + "loss": 0.8442, + "num_input_tokens_seen": 42852080, + "step": 73815 + }, + { + "epoch": 10.994935954721477, + "grad_norm": 0.047119140625, + "learning_rate": 0.015013647568228742, + "loss": 0.7689, + "num_input_tokens_seen": 42855504, + "step": 73820 + }, + { + "epoch": 10.995680667262437, + "grad_norm": 0.0400390625, + "learning_rate": 0.015011697916052821, + "loss": 0.8165, + "num_input_tokens_seen": 42858832, + "step": 73825 + }, + { + "epoch": 10.996425379803396, + "grad_norm": 0.0654296875, + "learning_rate": 0.015009748263679276, + "loss": 0.8111, + "num_input_tokens_seen": 42861712, + "step": 73830 + }, + { + "epoch": 10.997170092344355, + "grad_norm": 0.041259765625, + "learning_rate": 0.015007798611141047, + "loss": 0.7984, + "num_input_tokens_seen": 42864368, + "step": 73835 + }, + { + "epoch": 10.997914804885314, + "grad_norm": 0.032958984375, + "learning_rate": 0.015005848958471065, + "loss": 0.8032, + "num_input_tokens_seen": 42867184, + "step": 73840 + }, + { + "epoch": 10.998659517426274, + "grad_norm": 0.033203125, + "learning_rate": 0.015003899305702275, + "loss": 0.8066, + "num_input_tokens_seen": 42870352, + "step": 73845 + }, + { + "epoch": 10.999404229967233, + "grad_norm": 0.041748046875, + "learning_rate": 0.015001949652867606, + "loss": 0.7992, + "num_input_tokens_seen": 42873520, + "step": 73850 + }, + { + "epoch": 11.0, + "eval_loss": 0.8003315925598145, + "eval_runtime": 70.6735, + "eval_samples_per_second": 42.222, + "eval_steps_per_second": 10.556, + "num_input_tokens_seen": 42875264, + "step": 73854 + }, + { + "epoch": 11.000148942508192, + "grad_norm": 0.051025390625, + "learning_rate": 0.015, + "loss": 0.7945, + "num_input_tokens_seen": 42875808, + "step": 73855 + }, + { + "epoch": 11.00089365504915, + "grad_norm": 0.048583984375, + "learning_rate": 0.014998050347132398, + "loss": 0.7884, + "num_input_tokens_seen": 42878944, + "step": 73860 + }, + { + "epoch": 11.001638367590111, + "grad_norm": 0.0419921875, + "learning_rate": 0.014996100694297727, + "loss": 0.8115, + "num_input_tokens_seen": 42881696, + "step": 73865 + }, + { + "epoch": 11.00238308013107, + "grad_norm": 0.037841796875, + "learning_rate": 0.014994151041528936, + "loss": 0.7803, + "num_input_tokens_seen": 42884480, + "step": 73870 + }, + { + "epoch": 11.003127792672029, + "grad_norm": 0.0301513671875, + "learning_rate": 0.014992201388858954, + "loss": 0.7922, + "num_input_tokens_seen": 42887488, + "step": 73875 + }, + { + "epoch": 11.003872505212987, + "grad_norm": 0.06494140625, + "learning_rate": 0.014990251736320724, + "loss": 0.7991, + "num_input_tokens_seen": 42890112, + "step": 73880 + }, + { + "epoch": 11.004617217753948, + "grad_norm": 0.03857421875, + "learning_rate": 0.014988302083947178, + "loss": 0.8083, + "num_input_tokens_seen": 42893248, + "step": 73885 + }, + { + "epoch": 11.005361930294907, + "grad_norm": 0.035400390625, + "learning_rate": 0.014986352431771259, + "loss": 0.7776, + "num_input_tokens_seen": 42895744, + "step": 73890 + }, + { + "epoch": 11.006106642835865, + "grad_norm": 0.044189453125, + "learning_rate": 0.014984402779825899, + "loss": 0.7946, + "num_input_tokens_seen": 42898528, + "step": 73895 + }, + { + "epoch": 11.006851355376824, + "grad_norm": 0.0303955078125, + "learning_rate": 0.014982453128144042, + "loss": 0.7931, + "num_input_tokens_seen": 42901600, + "step": 73900 + }, + { + "epoch": 11.007596067917783, + "grad_norm": 0.037841796875, + "learning_rate": 0.014980503476758621, + "loss": 0.7777, + "num_input_tokens_seen": 42904768, + "step": 73905 + }, + { + "epoch": 11.008340780458743, + "grad_norm": 0.043212890625, + "learning_rate": 0.014978553825702573, + "loss": 0.773, + "num_input_tokens_seen": 42907584, + "step": 73910 + }, + { + "epoch": 11.009085492999702, + "grad_norm": 0.035888671875, + "learning_rate": 0.014976604175008838, + "loss": 0.8241, + "num_input_tokens_seen": 42910848, + "step": 73915 + }, + { + "epoch": 11.009830205540661, + "grad_norm": 0.07080078125, + "learning_rate": 0.014974654524710347, + "loss": 0.8103, + "num_input_tokens_seen": 42913984, + "step": 73920 + }, + { + "epoch": 11.01057491808162, + "grad_norm": 0.040771484375, + "learning_rate": 0.014972704874840046, + "loss": 0.776, + "num_input_tokens_seen": 42917376, + "step": 73925 + }, + { + "epoch": 11.01131963062258, + "grad_norm": 0.07373046875, + "learning_rate": 0.014970755225430862, + "loss": 0.8131, + "num_input_tokens_seen": 42920448, + "step": 73930 + }, + { + "epoch": 11.012064343163539, + "grad_norm": 0.02783203125, + "learning_rate": 0.014968805576515747, + "loss": 0.7924, + "num_input_tokens_seen": 42923584, + "step": 73935 + }, + { + "epoch": 11.012809055704498, + "grad_norm": 0.0380859375, + "learning_rate": 0.01496685592812763, + "loss": 0.7985, + "num_input_tokens_seen": 42926144, + "step": 73940 + }, + { + "epoch": 11.013553768245457, + "grad_norm": 0.07421875, + "learning_rate": 0.014964906280299445, + "loss": 0.8169, + "num_input_tokens_seen": 42929216, + "step": 73945 + }, + { + "epoch": 11.014298480786417, + "grad_norm": 0.03369140625, + "learning_rate": 0.014962956633064135, + "loss": 0.8023, + "num_input_tokens_seen": 42932192, + "step": 73950 + }, + { + "epoch": 11.015043193327376, + "grad_norm": 0.04541015625, + "learning_rate": 0.014961006986454629, + "loss": 0.8189, + "num_input_tokens_seen": 42935104, + "step": 73955 + }, + { + "epoch": 11.015787905868335, + "grad_norm": 0.044189453125, + "learning_rate": 0.014959057340503878, + "loss": 0.8033, + "num_input_tokens_seen": 42937920, + "step": 73960 + }, + { + "epoch": 11.016532618409293, + "grad_norm": 0.05224609375, + "learning_rate": 0.014957107695244803, + "loss": 0.7788, + "num_input_tokens_seen": 42940608, + "step": 73965 + }, + { + "epoch": 11.017277330950254, + "grad_norm": 0.031005859375, + "learning_rate": 0.014955158050710357, + "loss": 0.7906, + "num_input_tokens_seen": 42943840, + "step": 73970 + }, + { + "epoch": 11.018022043491213, + "grad_norm": 0.039306640625, + "learning_rate": 0.014953208406933469, + "loss": 0.7618, + "num_input_tokens_seen": 42946592, + "step": 73975 + }, + { + "epoch": 11.018766756032171, + "grad_norm": 0.026611328125, + "learning_rate": 0.014951258763947078, + "loss": 0.7984, + "num_input_tokens_seen": 42949760, + "step": 73980 + }, + { + "epoch": 11.01951146857313, + "grad_norm": 0.0286865234375, + "learning_rate": 0.014949309121784122, + "loss": 0.814, + "num_input_tokens_seen": 42952864, + "step": 73985 + }, + { + "epoch": 11.02025618111409, + "grad_norm": 0.05810546875, + "learning_rate": 0.014947359480477531, + "loss": 0.83, + "num_input_tokens_seen": 42955616, + "step": 73990 + }, + { + "epoch": 11.02100089365505, + "grad_norm": 0.0400390625, + "learning_rate": 0.014945409840060254, + "loss": 0.7853, + "num_input_tokens_seen": 42958560, + "step": 73995 + }, + { + "epoch": 11.021745606196008, + "grad_norm": 0.0302734375, + "learning_rate": 0.014943460200565218, + "loss": 0.7856, + "num_input_tokens_seen": 42961344, + "step": 74000 + }, + { + "epoch": 11.022490318736967, + "grad_norm": 0.031494140625, + "learning_rate": 0.014941510562025365, + "loss": 0.8045, + "num_input_tokens_seen": 42964192, + "step": 74005 + }, + { + "epoch": 11.023235031277927, + "grad_norm": 0.033203125, + "learning_rate": 0.01493956092447363, + "loss": 0.8054, + "num_input_tokens_seen": 42967040, + "step": 74010 + }, + { + "epoch": 11.023979743818886, + "grad_norm": 0.042236328125, + "learning_rate": 0.014937611287942958, + "loss": 0.7795, + "num_input_tokens_seen": 42970208, + "step": 74015 + }, + { + "epoch": 11.024724456359845, + "grad_norm": 0.0390625, + "learning_rate": 0.01493566165246628, + "loss": 0.8187, + "num_input_tokens_seen": 42973024, + "step": 74020 + }, + { + "epoch": 11.025469168900804, + "grad_norm": 0.0322265625, + "learning_rate": 0.014933712018076523, + "loss": 0.7955, + "num_input_tokens_seen": 42976000, + "step": 74025 + }, + { + "epoch": 11.026213881441764, + "grad_norm": 0.03662109375, + "learning_rate": 0.014931762384806644, + "loss": 0.819, + "num_input_tokens_seen": 42979040, + "step": 74030 + }, + { + "epoch": 11.026958593982723, + "grad_norm": 0.049072265625, + "learning_rate": 0.014929812752689566, + "loss": 0.7991, + "num_input_tokens_seen": 42981888, + "step": 74035 + }, + { + "epoch": 11.027703306523682, + "grad_norm": 0.03076171875, + "learning_rate": 0.014927863121758228, + "loss": 0.7971, + "num_input_tokens_seen": 42984672, + "step": 74040 + }, + { + "epoch": 11.02844801906464, + "grad_norm": 0.036865234375, + "learning_rate": 0.014925913492045572, + "loss": 0.8239, + "num_input_tokens_seen": 42987616, + "step": 74045 + }, + { + "epoch": 11.029192731605601, + "grad_norm": 0.03955078125, + "learning_rate": 0.014923963863584534, + "loss": 0.7809, + "num_input_tokens_seen": 42990496, + "step": 74050 + }, + { + "epoch": 11.02993744414656, + "grad_norm": 0.0419921875, + "learning_rate": 0.014922014236408043, + "loss": 0.7829, + "num_input_tokens_seen": 42993280, + "step": 74055 + }, + { + "epoch": 11.030682156687519, + "grad_norm": 0.03759765625, + "learning_rate": 0.014920064610549049, + "loss": 0.8041, + "num_input_tokens_seen": 42996352, + "step": 74060 + }, + { + "epoch": 11.031426869228477, + "grad_norm": 0.045166015625, + "learning_rate": 0.014918114986040483, + "loss": 0.7945, + "num_input_tokens_seen": 42999072, + "step": 74065 + }, + { + "epoch": 11.032171581769436, + "grad_norm": 0.0341796875, + "learning_rate": 0.014916165362915276, + "loss": 0.8068, + "num_input_tokens_seen": 43001824, + "step": 74070 + }, + { + "epoch": 11.032916294310397, + "grad_norm": 0.035400390625, + "learning_rate": 0.014914215741206376, + "loss": 0.7863, + "num_input_tokens_seen": 43004544, + "step": 74075 + }, + { + "epoch": 11.033661006851355, + "grad_norm": 0.0233154296875, + "learning_rate": 0.014912266120946704, + "loss": 0.8095, + "num_input_tokens_seen": 43007584, + "step": 74080 + }, + { + "epoch": 11.034405719392314, + "grad_norm": 0.03857421875, + "learning_rate": 0.014910316502169216, + "loss": 0.785, + "num_input_tokens_seen": 43010208, + "step": 74085 + }, + { + "epoch": 11.035150431933273, + "grad_norm": 0.035400390625, + "learning_rate": 0.014908366884906831, + "loss": 0.8176, + "num_input_tokens_seen": 43013216, + "step": 74090 + }, + { + "epoch": 11.035895144474233, + "grad_norm": 0.0400390625, + "learning_rate": 0.014906417269192502, + "loss": 0.7998, + "num_input_tokens_seen": 43016096, + "step": 74095 + }, + { + "epoch": 11.036639857015192, + "grad_norm": 0.03369140625, + "learning_rate": 0.014904467655059157, + "loss": 0.8031, + "num_input_tokens_seen": 43018528, + "step": 74100 + }, + { + "epoch": 11.037384569556151, + "grad_norm": 0.02978515625, + "learning_rate": 0.014902518042539732, + "loss": 0.7853, + "num_input_tokens_seen": 43021408, + "step": 74105 + }, + { + "epoch": 11.03812928209711, + "grad_norm": 0.0301513671875, + "learning_rate": 0.014900568431667168, + "loss": 0.8338, + "num_input_tokens_seen": 43024032, + "step": 74110 + }, + { + "epoch": 11.03887399463807, + "grad_norm": 0.058349609375, + "learning_rate": 0.014898618822474393, + "loss": 0.8111, + "num_input_tokens_seen": 43026880, + "step": 74115 + }, + { + "epoch": 11.039618707179029, + "grad_norm": 0.056640625, + "learning_rate": 0.014896669214994357, + "loss": 0.8019, + "num_input_tokens_seen": 43029952, + "step": 74120 + }, + { + "epoch": 11.040363419719988, + "grad_norm": 0.052001953125, + "learning_rate": 0.014894719609259983, + "loss": 0.8546, + "num_input_tokens_seen": 43032800, + "step": 74125 + }, + { + "epoch": 11.041108132260947, + "grad_norm": 0.04345703125, + "learning_rate": 0.01489277000530422, + "loss": 0.7957, + "num_input_tokens_seen": 43035648, + "step": 74130 + }, + { + "epoch": 11.041852844801907, + "grad_norm": 0.033935546875, + "learning_rate": 0.014890820403159997, + "loss": 0.8182, + "num_input_tokens_seen": 43038368, + "step": 74135 + }, + { + "epoch": 11.042597557342866, + "grad_norm": 0.03662109375, + "learning_rate": 0.014888870802860254, + "loss": 0.7925, + "num_input_tokens_seen": 43041056, + "step": 74140 + }, + { + "epoch": 11.043342269883825, + "grad_norm": 0.03564453125, + "learning_rate": 0.014886921204437926, + "loss": 0.8006, + "num_input_tokens_seen": 43043872, + "step": 74145 + }, + { + "epoch": 11.044086982424783, + "grad_norm": 0.037841796875, + "learning_rate": 0.014884971607925943, + "loss": 0.8064, + "num_input_tokens_seen": 43046912, + "step": 74150 + }, + { + "epoch": 11.044831694965744, + "grad_norm": 0.0322265625, + "learning_rate": 0.014883022013357254, + "loss": 0.795, + "num_input_tokens_seen": 43049600, + "step": 74155 + }, + { + "epoch": 11.045576407506703, + "grad_norm": 0.04443359375, + "learning_rate": 0.014881072420764783, + "loss": 0.799, + "num_input_tokens_seen": 43052512, + "step": 74160 + }, + { + "epoch": 11.046321120047661, + "grad_norm": 0.046875, + "learning_rate": 0.014879122830181479, + "loss": 0.8123, + "num_input_tokens_seen": 43055744, + "step": 74165 + }, + { + "epoch": 11.04706583258862, + "grad_norm": 0.04052734375, + "learning_rate": 0.014877173241640269, + "loss": 0.796, + "num_input_tokens_seen": 43058976, + "step": 74170 + }, + { + "epoch": 11.04781054512958, + "grad_norm": 0.048583984375, + "learning_rate": 0.014875223655174095, + "loss": 0.7973, + "num_input_tokens_seen": 43062112, + "step": 74175 + }, + { + "epoch": 11.04855525767054, + "grad_norm": 0.04541015625, + "learning_rate": 0.01487327407081589, + "loss": 0.805, + "num_input_tokens_seen": 43064992, + "step": 74180 + }, + { + "epoch": 11.049299970211498, + "grad_norm": 0.035400390625, + "learning_rate": 0.014871324488598587, + "loss": 0.7835, + "num_input_tokens_seen": 43067872, + "step": 74185 + }, + { + "epoch": 11.050044682752457, + "grad_norm": 0.036376953125, + "learning_rate": 0.01486937490855513, + "loss": 0.801, + "num_input_tokens_seen": 43070752, + "step": 74190 + }, + { + "epoch": 11.050789395293418, + "grad_norm": 0.04345703125, + "learning_rate": 0.014867425330718444, + "loss": 0.8043, + "num_input_tokens_seen": 43073568, + "step": 74195 + }, + { + "epoch": 11.051534107834376, + "grad_norm": 0.045654296875, + "learning_rate": 0.01486547575512148, + "loss": 0.7859, + "num_input_tokens_seen": 43076320, + "step": 74200 + }, + { + "epoch": 11.052278820375335, + "grad_norm": 0.033203125, + "learning_rate": 0.014863526181797164, + "loss": 0.8017, + "num_input_tokens_seen": 43078976, + "step": 74205 + }, + { + "epoch": 11.053023532916294, + "grad_norm": 0.034912109375, + "learning_rate": 0.014861576610778438, + "loss": 0.7869, + "num_input_tokens_seen": 43081696, + "step": 74210 + }, + { + "epoch": 11.053768245457254, + "grad_norm": 0.043212890625, + "learning_rate": 0.014859627042098226, + "loss": 0.8036, + "num_input_tokens_seen": 43084768, + "step": 74215 + }, + { + "epoch": 11.054512957998213, + "grad_norm": 0.0302734375, + "learning_rate": 0.01485767747578948, + "loss": 0.8031, + "num_input_tokens_seen": 43087520, + "step": 74220 + }, + { + "epoch": 11.055257670539172, + "grad_norm": 0.0400390625, + "learning_rate": 0.014855727911885129, + "loss": 0.8086, + "num_input_tokens_seen": 43090080, + "step": 74225 + }, + { + "epoch": 11.05600238308013, + "grad_norm": 0.03564453125, + "learning_rate": 0.0148537783504181, + "loss": 0.7937, + "num_input_tokens_seen": 43092960, + "step": 74230 + }, + { + "epoch": 11.056747095621091, + "grad_norm": 0.039794921875, + "learning_rate": 0.014851828791421344, + "loss": 0.8177, + "num_input_tokens_seen": 43095584, + "step": 74235 + }, + { + "epoch": 11.05749180816205, + "grad_norm": 0.0693359375, + "learning_rate": 0.014849879234927789, + "loss": 0.7888, + "num_input_tokens_seen": 43098336, + "step": 74240 + }, + { + "epoch": 11.058236520703009, + "grad_norm": 0.040771484375, + "learning_rate": 0.014847929680970373, + "loss": 0.7802, + "num_input_tokens_seen": 43101088, + "step": 74245 + }, + { + "epoch": 11.058981233243967, + "grad_norm": 0.0294189453125, + "learning_rate": 0.014845980129582024, + "loss": 0.7888, + "num_input_tokens_seen": 43103968, + "step": 74250 + }, + { + "epoch": 11.059725945784926, + "grad_norm": 0.033447265625, + "learning_rate": 0.01484403058079569, + "loss": 0.7816, + "num_input_tokens_seen": 43106592, + "step": 74255 + }, + { + "epoch": 11.060470658325887, + "grad_norm": 0.051025390625, + "learning_rate": 0.014842081034644303, + "loss": 0.7858, + "num_input_tokens_seen": 43109280, + "step": 74260 + }, + { + "epoch": 11.061215370866845, + "grad_norm": 0.045166015625, + "learning_rate": 0.014840131491160789, + "loss": 0.8063, + "num_input_tokens_seen": 43112000, + "step": 74265 + }, + { + "epoch": 11.061960083407804, + "grad_norm": 0.035888671875, + "learning_rate": 0.014838181950378098, + "loss": 0.8144, + "num_input_tokens_seen": 43114848, + "step": 74270 + }, + { + "epoch": 11.062704795948763, + "grad_norm": 0.045654296875, + "learning_rate": 0.014836232412329155, + "loss": 0.8139, + "num_input_tokens_seen": 43117920, + "step": 74275 + }, + { + "epoch": 11.063449508489724, + "grad_norm": 0.045654296875, + "learning_rate": 0.014834282877046901, + "loss": 0.7859, + "num_input_tokens_seen": 43120544, + "step": 74280 + }, + { + "epoch": 11.064194221030682, + "grad_norm": 0.03173828125, + "learning_rate": 0.014832333344564262, + "loss": 0.8128, + "num_input_tokens_seen": 43123520, + "step": 74285 + }, + { + "epoch": 11.064938933571641, + "grad_norm": 0.051513671875, + "learning_rate": 0.014830383814914189, + "loss": 0.8086, + "num_input_tokens_seen": 43126592, + "step": 74290 + }, + { + "epoch": 11.0656836461126, + "grad_norm": 0.0250244140625, + "learning_rate": 0.014828434288129603, + "loss": 0.8021, + "num_input_tokens_seen": 43129600, + "step": 74295 + }, + { + "epoch": 11.06642835865356, + "grad_norm": 0.087890625, + "learning_rate": 0.014826484764243452, + "loss": 0.7913, + "num_input_tokens_seen": 43132384, + "step": 74300 + }, + { + "epoch": 11.067173071194519, + "grad_norm": 0.052978515625, + "learning_rate": 0.014824535243288663, + "loss": 0.7882, + "num_input_tokens_seen": 43134944, + "step": 74305 + }, + { + "epoch": 11.067917783735478, + "grad_norm": 0.040771484375, + "learning_rate": 0.014822585725298171, + "loss": 0.7896, + "num_input_tokens_seen": 43137856, + "step": 74310 + }, + { + "epoch": 11.068662496276437, + "grad_norm": 0.06298828125, + "learning_rate": 0.014820636210304914, + "loss": 0.7985, + "num_input_tokens_seen": 43140512, + "step": 74315 + }, + { + "epoch": 11.069407208817397, + "grad_norm": 0.046630859375, + "learning_rate": 0.014818686698341821, + "loss": 0.8082, + "num_input_tokens_seen": 43143200, + "step": 74320 + }, + { + "epoch": 11.070151921358356, + "grad_norm": 0.0322265625, + "learning_rate": 0.014816737189441839, + "loss": 0.7929, + "num_input_tokens_seen": 43146208, + "step": 74325 + }, + { + "epoch": 11.070896633899315, + "grad_norm": 0.060791015625, + "learning_rate": 0.014814787683637892, + "loss": 0.7955, + "num_input_tokens_seen": 43149056, + "step": 74330 + }, + { + "epoch": 11.071641346440273, + "grad_norm": 0.028564453125, + "learning_rate": 0.014812838180962918, + "loss": 0.8009, + "num_input_tokens_seen": 43151744, + "step": 74335 + }, + { + "epoch": 11.072386058981234, + "grad_norm": 0.03955078125, + "learning_rate": 0.014810888681449858, + "loss": 0.7763, + "num_input_tokens_seen": 43155040, + "step": 74340 + }, + { + "epoch": 11.073130771522193, + "grad_norm": 0.0732421875, + "learning_rate": 0.014808939185131636, + "loss": 0.7866, + "num_input_tokens_seen": 43158144, + "step": 74345 + }, + { + "epoch": 11.073875484063151, + "grad_norm": 0.035400390625, + "learning_rate": 0.014806989692041196, + "loss": 0.8134, + "num_input_tokens_seen": 43161056, + "step": 74350 + }, + { + "epoch": 11.07462019660411, + "grad_norm": 0.033203125, + "learning_rate": 0.014805040202211465, + "loss": 0.792, + "num_input_tokens_seen": 43164288, + "step": 74355 + }, + { + "epoch": 11.07536490914507, + "grad_norm": 0.03955078125, + "learning_rate": 0.014803090715675385, + "loss": 0.8011, + "num_input_tokens_seen": 43166848, + "step": 74360 + }, + { + "epoch": 11.07610962168603, + "grad_norm": 0.036865234375, + "learning_rate": 0.014801141232465885, + "loss": 0.769, + "num_input_tokens_seen": 43169600, + "step": 74365 + }, + { + "epoch": 11.076854334226988, + "grad_norm": 0.060302734375, + "learning_rate": 0.014799191752615902, + "loss": 0.8185, + "num_input_tokens_seen": 43172288, + "step": 74370 + }, + { + "epoch": 11.077599046767947, + "grad_norm": 0.03076171875, + "learning_rate": 0.01479724227615837, + "loss": 0.7958, + "num_input_tokens_seen": 43175168, + "step": 74375 + }, + { + "epoch": 11.078343759308908, + "grad_norm": 0.07470703125, + "learning_rate": 0.014795292803126227, + "loss": 0.805, + "num_input_tokens_seen": 43178304, + "step": 74380 + }, + { + "epoch": 11.079088471849866, + "grad_norm": 0.07568359375, + "learning_rate": 0.014793343333552403, + "loss": 0.7946, + "num_input_tokens_seen": 43181184, + "step": 74385 + }, + { + "epoch": 11.079833184390825, + "grad_norm": 0.04931640625, + "learning_rate": 0.014791393867469829, + "loss": 0.7889, + "num_input_tokens_seen": 43184064, + "step": 74390 + }, + { + "epoch": 11.080577896931784, + "grad_norm": 0.042236328125, + "learning_rate": 0.014789444404911449, + "loss": 0.7931, + "num_input_tokens_seen": 43186976, + "step": 74395 + }, + { + "epoch": 11.081322609472744, + "grad_norm": 0.03564453125, + "learning_rate": 0.014787494945910189, + "loss": 0.8035, + "num_input_tokens_seen": 43189888, + "step": 74400 + }, + { + "epoch": 11.082067322013703, + "grad_norm": 0.03076171875, + "learning_rate": 0.014785545490498988, + "loss": 0.7839, + "num_input_tokens_seen": 43192896, + "step": 74405 + }, + { + "epoch": 11.082812034554662, + "grad_norm": 0.0361328125, + "learning_rate": 0.014783596038710772, + "loss": 0.8367, + "num_input_tokens_seen": 43195936, + "step": 74410 + }, + { + "epoch": 11.08355674709562, + "grad_norm": 0.0267333984375, + "learning_rate": 0.014781646590578488, + "loss": 0.8122, + "num_input_tokens_seen": 43198496, + "step": 74415 + }, + { + "epoch": 11.08430145963658, + "grad_norm": 0.055908203125, + "learning_rate": 0.014779697146135062, + "loss": 0.8236, + "num_input_tokens_seen": 43201344, + "step": 74420 + }, + { + "epoch": 11.08504617217754, + "grad_norm": 0.038330078125, + "learning_rate": 0.014777747705413423, + "loss": 0.7974, + "num_input_tokens_seen": 43204192, + "step": 74425 + }, + { + "epoch": 11.085790884718499, + "grad_norm": 0.032958984375, + "learning_rate": 0.014775798268446516, + "loss": 0.8084, + "num_input_tokens_seen": 43207168, + "step": 74430 + }, + { + "epoch": 11.086535597259457, + "grad_norm": 0.058349609375, + "learning_rate": 0.014773848835267268, + "loss": 0.8024, + "num_input_tokens_seen": 43210464, + "step": 74435 + }, + { + "epoch": 11.087280309800416, + "grad_norm": 0.033447265625, + "learning_rate": 0.014771899405908616, + "loss": 0.7992, + "num_input_tokens_seen": 43213344, + "step": 74440 + }, + { + "epoch": 11.088025022341377, + "grad_norm": 0.04052734375, + "learning_rate": 0.014769949980403487, + "loss": 0.807, + "num_input_tokens_seen": 43216096, + "step": 74445 + }, + { + "epoch": 11.088769734882336, + "grad_norm": 0.035400390625, + "learning_rate": 0.014768000558784824, + "loss": 0.8019, + "num_input_tokens_seen": 43219136, + "step": 74450 + }, + { + "epoch": 11.089514447423294, + "grad_norm": 0.037353515625, + "learning_rate": 0.01476605114108555, + "loss": 0.7995, + "num_input_tokens_seen": 43222304, + "step": 74455 + }, + { + "epoch": 11.090259159964253, + "grad_norm": 0.032470703125, + "learning_rate": 0.01476410172733861, + "loss": 0.8046, + "num_input_tokens_seen": 43224960, + "step": 74460 + }, + { + "epoch": 11.091003872505214, + "grad_norm": 0.0341796875, + "learning_rate": 0.014762152317576932, + "loss": 0.8237, + "num_input_tokens_seen": 43227936, + "step": 74465 + }, + { + "epoch": 11.091748585046172, + "grad_norm": 0.025390625, + "learning_rate": 0.014760202911833447, + "loss": 0.8071, + "num_input_tokens_seen": 43230816, + "step": 74470 + }, + { + "epoch": 11.092493297587131, + "grad_norm": 0.04931640625, + "learning_rate": 0.01475825351014109, + "loss": 0.8079, + "num_input_tokens_seen": 43233504, + "step": 74475 + }, + { + "epoch": 11.09323801012809, + "grad_norm": 0.041259765625, + "learning_rate": 0.014756304112532792, + "loss": 0.8134, + "num_input_tokens_seen": 43236096, + "step": 74480 + }, + { + "epoch": 11.09398272266905, + "grad_norm": 0.03173828125, + "learning_rate": 0.014754354719041492, + "loss": 0.8211, + "num_input_tokens_seen": 43239104, + "step": 74485 + }, + { + "epoch": 11.09472743521001, + "grad_norm": 0.0341796875, + "learning_rate": 0.014752405329700112, + "loss": 0.8026, + "num_input_tokens_seen": 43242016, + "step": 74490 + }, + { + "epoch": 11.095472147750968, + "grad_norm": 0.03369140625, + "learning_rate": 0.0147504559445416, + "loss": 0.8048, + "num_input_tokens_seen": 43244864, + "step": 74495 + }, + { + "epoch": 11.096216860291927, + "grad_norm": 0.03076171875, + "learning_rate": 0.014748506563598877, + "loss": 0.7964, + "num_input_tokens_seen": 43247904, + "step": 74500 + }, + { + "epoch": 11.096961572832887, + "grad_norm": 0.040283203125, + "learning_rate": 0.014746557186904886, + "loss": 0.8108, + "num_input_tokens_seen": 43250848, + "step": 74505 + }, + { + "epoch": 11.097706285373846, + "grad_norm": 0.031494140625, + "learning_rate": 0.014744607814492551, + "loss": 0.7994, + "num_input_tokens_seen": 43253504, + "step": 74510 + }, + { + "epoch": 11.098450997914805, + "grad_norm": 0.039306640625, + "learning_rate": 0.014742658446394802, + "loss": 0.7985, + "num_input_tokens_seen": 43256384, + "step": 74515 + }, + { + "epoch": 11.099195710455763, + "grad_norm": 0.0703125, + "learning_rate": 0.014740709082644583, + "loss": 0.8125, + "num_input_tokens_seen": 43259040, + "step": 74520 + }, + { + "epoch": 11.099940422996724, + "grad_norm": 0.0291748046875, + "learning_rate": 0.014738759723274816, + "loss": 0.8111, + "num_input_tokens_seen": 43263104, + "step": 74525 + }, + { + "epoch": 11.100685135537683, + "grad_norm": 0.043212890625, + "learning_rate": 0.014736810368318443, + "loss": 0.815, + "num_input_tokens_seen": 43265920, + "step": 74530 + }, + { + "epoch": 11.101429848078642, + "grad_norm": 0.0211181640625, + "learning_rate": 0.014734861017808389, + "loss": 0.8144, + "num_input_tokens_seen": 43269024, + "step": 74535 + }, + { + "epoch": 11.1021745606196, + "grad_norm": 0.027587890625, + "learning_rate": 0.01473291167177759, + "loss": 0.7996, + "num_input_tokens_seen": 43271744, + "step": 74540 + }, + { + "epoch": 11.10291927316056, + "grad_norm": 0.0732421875, + "learning_rate": 0.014730962330258978, + "loss": 0.8007, + "num_input_tokens_seen": 43274624, + "step": 74545 + }, + { + "epoch": 11.10366398570152, + "grad_norm": 0.023681640625, + "learning_rate": 0.014729012993285476, + "loss": 0.8001, + "num_input_tokens_seen": 43277472, + "step": 74550 + }, + { + "epoch": 11.104408698242478, + "grad_norm": 0.039794921875, + "learning_rate": 0.014727063660890032, + "loss": 0.7821, + "num_input_tokens_seen": 43280704, + "step": 74555 + }, + { + "epoch": 11.105153410783437, + "grad_norm": 0.0185546875, + "learning_rate": 0.014725114333105565, + "loss": 0.7858, + "num_input_tokens_seen": 43283456, + "step": 74560 + }, + { + "epoch": 11.105898123324398, + "grad_norm": 0.04296875, + "learning_rate": 0.014723165009965017, + "loss": 0.7916, + "num_input_tokens_seen": 43286208, + "step": 74565 + }, + { + "epoch": 11.106642835865356, + "grad_norm": 0.0322265625, + "learning_rate": 0.01472121569150131, + "loss": 0.8014, + "num_input_tokens_seen": 43288992, + "step": 74570 + }, + { + "epoch": 11.107387548406315, + "grad_norm": 0.02099609375, + "learning_rate": 0.014719266377747385, + "loss": 0.8086, + "num_input_tokens_seen": 43292032, + "step": 74575 + }, + { + "epoch": 11.108132260947274, + "grad_norm": 0.0308837890625, + "learning_rate": 0.014717317068736163, + "loss": 0.8109, + "num_input_tokens_seen": 43295264, + "step": 74580 + }, + { + "epoch": 11.108876973488233, + "grad_norm": 0.0294189453125, + "learning_rate": 0.014715367764500588, + "loss": 0.7885, + "num_input_tokens_seen": 43297824, + "step": 74585 + }, + { + "epoch": 11.109621686029193, + "grad_norm": 0.029296875, + "learning_rate": 0.014713418465073588, + "loss": 0.8035, + "num_input_tokens_seen": 43300480, + "step": 74590 + }, + { + "epoch": 11.110366398570152, + "grad_norm": 0.037109375, + "learning_rate": 0.014711469170488082, + "loss": 0.7878, + "num_input_tokens_seen": 43303360, + "step": 74595 + }, + { + "epoch": 11.11111111111111, + "grad_norm": 0.03076171875, + "learning_rate": 0.014709519880777018, + "loss": 0.7807, + "num_input_tokens_seen": 43306176, + "step": 74600 + }, + { + "epoch": 11.11185582365207, + "grad_norm": 0.031494140625, + "learning_rate": 0.014707570595973317, + "loss": 0.7966, + "num_input_tokens_seen": 43308992, + "step": 74605 + }, + { + "epoch": 11.11260053619303, + "grad_norm": 0.0281982421875, + "learning_rate": 0.014705621316109918, + "loss": 0.8206, + "num_input_tokens_seen": 43311808, + "step": 74610 + }, + { + "epoch": 11.113345248733989, + "grad_norm": 0.0213623046875, + "learning_rate": 0.01470367204121974, + "loss": 0.8171, + "num_input_tokens_seen": 43314368, + "step": 74615 + }, + { + "epoch": 11.114089961274948, + "grad_norm": 0.04150390625, + "learning_rate": 0.014701722771335728, + "loss": 0.8017, + "num_input_tokens_seen": 43317568, + "step": 74620 + }, + { + "epoch": 11.114834673815906, + "grad_norm": 0.037353515625, + "learning_rate": 0.014699773506490809, + "loss": 0.8229, + "num_input_tokens_seen": 43320416, + "step": 74625 + }, + { + "epoch": 11.115579386356867, + "grad_norm": 0.039794921875, + "learning_rate": 0.014697824246717903, + "loss": 0.7924, + "num_input_tokens_seen": 43323296, + "step": 74630 + }, + { + "epoch": 11.116324098897826, + "grad_norm": 0.031982421875, + "learning_rate": 0.014695874992049957, + "loss": 0.8124, + "num_input_tokens_seen": 43326400, + "step": 74635 + }, + { + "epoch": 11.117068811438784, + "grad_norm": 0.0400390625, + "learning_rate": 0.014693925742519888, + "loss": 0.7979, + "num_input_tokens_seen": 43329248, + "step": 74640 + }, + { + "epoch": 11.117813523979743, + "grad_norm": 0.023681640625, + "learning_rate": 0.014691976498160638, + "loss": 0.7962, + "num_input_tokens_seen": 43332320, + "step": 74645 + }, + { + "epoch": 11.118558236520704, + "grad_norm": 0.03271484375, + "learning_rate": 0.014690027259005124, + "loss": 0.8042, + "num_input_tokens_seen": 43335392, + "step": 74650 + }, + { + "epoch": 11.119302949061662, + "grad_norm": 0.0262451171875, + "learning_rate": 0.014688078025086294, + "loss": 0.7841, + "num_input_tokens_seen": 43338464, + "step": 74655 + }, + { + "epoch": 11.120047661602621, + "grad_norm": 0.06103515625, + "learning_rate": 0.014686128796437064, + "loss": 0.8082, + "num_input_tokens_seen": 43341152, + "step": 74660 + }, + { + "epoch": 11.12079237414358, + "grad_norm": 0.0191650390625, + "learning_rate": 0.014684179573090367, + "loss": 0.8213, + "num_input_tokens_seen": 43343968, + "step": 74665 + }, + { + "epoch": 11.12153708668454, + "grad_norm": 0.0390625, + "learning_rate": 0.014682230355079139, + "loss": 0.8027, + "num_input_tokens_seen": 43346752, + "step": 74670 + }, + { + "epoch": 11.1222817992255, + "grad_norm": 0.0213623046875, + "learning_rate": 0.014680281142436304, + "loss": 0.8037, + "num_input_tokens_seen": 43349600, + "step": 74675 + }, + { + "epoch": 11.123026511766458, + "grad_norm": 0.017822265625, + "learning_rate": 0.014678331935194798, + "loss": 0.7858, + "num_input_tokens_seen": 43352576, + "step": 74680 + }, + { + "epoch": 11.123771224307417, + "grad_norm": 0.0263671875, + "learning_rate": 0.01467638273338754, + "loss": 0.8079, + "num_input_tokens_seen": 43355712, + "step": 74685 + }, + { + "epoch": 11.124515936848377, + "grad_norm": 0.02294921875, + "learning_rate": 0.014674433537047471, + "loss": 0.7989, + "num_input_tokens_seen": 43358368, + "step": 74690 + }, + { + "epoch": 11.125260649389336, + "grad_norm": 0.0498046875, + "learning_rate": 0.014672484346207517, + "loss": 0.7883, + "num_input_tokens_seen": 43361088, + "step": 74695 + }, + { + "epoch": 11.126005361930295, + "grad_norm": 0.062255859375, + "learning_rate": 0.014670535160900606, + "loss": 0.8195, + "num_input_tokens_seen": 43364352, + "step": 74700 + }, + { + "epoch": 11.126750074471254, + "grad_norm": 0.048828125, + "learning_rate": 0.01466858598115967, + "loss": 0.8031, + "num_input_tokens_seen": 43367264, + "step": 74705 + }, + { + "epoch": 11.127494787012214, + "grad_norm": 0.0322265625, + "learning_rate": 0.014666636807017635, + "loss": 0.7974, + "num_input_tokens_seen": 43370464, + "step": 74710 + }, + { + "epoch": 11.128239499553173, + "grad_norm": 0.02587890625, + "learning_rate": 0.014664687638507435, + "loss": 0.7883, + "num_input_tokens_seen": 43373344, + "step": 74715 + }, + { + "epoch": 11.128984212094132, + "grad_norm": 0.031005859375, + "learning_rate": 0.01466273847566199, + "loss": 0.782, + "num_input_tokens_seen": 43376224, + "step": 74720 + }, + { + "epoch": 11.12972892463509, + "grad_norm": 0.0269775390625, + "learning_rate": 0.014660789318514243, + "loss": 0.7901, + "num_input_tokens_seen": 43379232, + "step": 74725 + }, + { + "epoch": 11.13047363717605, + "grad_norm": 0.0301513671875, + "learning_rate": 0.014658840167097113, + "loss": 0.7965, + "num_input_tokens_seen": 43381984, + "step": 74730 + }, + { + "epoch": 11.13121834971701, + "grad_norm": 0.0235595703125, + "learning_rate": 0.014656891021443533, + "loss": 0.8097, + "num_input_tokens_seen": 43384928, + "step": 74735 + }, + { + "epoch": 11.131963062257968, + "grad_norm": 0.018798828125, + "learning_rate": 0.014654941881586427, + "loss": 0.8052, + "num_input_tokens_seen": 43387840, + "step": 74740 + }, + { + "epoch": 11.132707774798927, + "grad_norm": 0.0279541015625, + "learning_rate": 0.01465299274755873, + "loss": 0.8011, + "num_input_tokens_seen": 43390656, + "step": 74745 + }, + { + "epoch": 11.133452487339888, + "grad_norm": 0.053466796875, + "learning_rate": 0.01465104361939337, + "loss": 0.8098, + "num_input_tokens_seen": 43393856, + "step": 74750 + }, + { + "epoch": 11.134197199880846, + "grad_norm": 0.0233154296875, + "learning_rate": 0.014649094497123266, + "loss": 0.8095, + "num_input_tokens_seen": 43396800, + "step": 74755 + }, + { + "epoch": 11.134941912421805, + "grad_norm": 0.05029296875, + "learning_rate": 0.014647145380781362, + "loss": 0.8082, + "num_input_tokens_seen": 43399488, + "step": 74760 + }, + { + "epoch": 11.135686624962764, + "grad_norm": 0.0189208984375, + "learning_rate": 0.014645196270400574, + "loss": 0.799, + "num_input_tokens_seen": 43402464, + "step": 74765 + }, + { + "epoch": 11.136431337503723, + "grad_norm": 0.043212890625, + "learning_rate": 0.014643247166013836, + "loss": 0.7936, + "num_input_tokens_seen": 43405504, + "step": 74770 + }, + { + "epoch": 11.137176050044683, + "grad_norm": 0.029052734375, + "learning_rate": 0.014641298067654069, + "loss": 0.8022, + "num_input_tokens_seen": 43408384, + "step": 74775 + }, + { + "epoch": 11.137920762585642, + "grad_norm": 0.049560546875, + "learning_rate": 0.014639348975354216, + "loss": 0.8063, + "num_input_tokens_seen": 43411392, + "step": 74780 + }, + { + "epoch": 11.1386654751266, + "grad_norm": 0.0205078125, + "learning_rate": 0.014637399889147192, + "loss": 0.7915, + "num_input_tokens_seen": 43414336, + "step": 74785 + }, + { + "epoch": 11.13941018766756, + "grad_norm": 0.02783203125, + "learning_rate": 0.014635450809065921, + "loss": 0.8262, + "num_input_tokens_seen": 43417024, + "step": 74790 + }, + { + "epoch": 11.14015490020852, + "grad_norm": 0.042236328125, + "learning_rate": 0.014633501735143347, + "loss": 0.8018, + "num_input_tokens_seen": 43419648, + "step": 74795 + }, + { + "epoch": 11.140899612749479, + "grad_norm": 0.029541015625, + "learning_rate": 0.014631552667412387, + "loss": 0.7983, + "num_input_tokens_seen": 43422560, + "step": 74800 + }, + { + "epoch": 11.141644325290438, + "grad_norm": 0.06005859375, + "learning_rate": 0.01462960360590597, + "loss": 0.7892, + "num_input_tokens_seen": 43425760, + "step": 74805 + }, + { + "epoch": 11.142389037831396, + "grad_norm": 0.04541015625, + "learning_rate": 0.014627654550657019, + "loss": 0.8077, + "num_input_tokens_seen": 43428544, + "step": 74810 + }, + { + "epoch": 11.143133750372357, + "grad_norm": 0.03759765625, + "learning_rate": 0.014625705501698472, + "loss": 0.8009, + "num_input_tokens_seen": 43431392, + "step": 74815 + }, + { + "epoch": 11.143878462913316, + "grad_norm": 0.053955078125, + "learning_rate": 0.014623756459063244, + "loss": 0.814, + "num_input_tokens_seen": 43434496, + "step": 74820 + }, + { + "epoch": 11.144623175454274, + "grad_norm": 0.0274658203125, + "learning_rate": 0.014621807422784273, + "loss": 0.799, + "num_input_tokens_seen": 43437472, + "step": 74825 + }, + { + "epoch": 11.145367887995233, + "grad_norm": 0.038330078125, + "learning_rate": 0.014619858392894484, + "loss": 0.8093, + "num_input_tokens_seen": 43440288, + "step": 74830 + }, + { + "epoch": 11.146112600536194, + "grad_norm": 0.0184326171875, + "learning_rate": 0.014617909369426795, + "loss": 0.8102, + "num_input_tokens_seen": 43443008, + "step": 74835 + }, + { + "epoch": 11.146857313077152, + "grad_norm": 0.05078125, + "learning_rate": 0.014615960352414144, + "loss": 0.8028, + "num_input_tokens_seen": 43446112, + "step": 74840 + }, + { + "epoch": 11.147602025618111, + "grad_norm": 0.031494140625, + "learning_rate": 0.014614011341889445, + "loss": 0.7924, + "num_input_tokens_seen": 43448992, + "step": 74845 + }, + { + "epoch": 11.14834673815907, + "grad_norm": 0.03369140625, + "learning_rate": 0.014612062337885638, + "loss": 0.8135, + "num_input_tokens_seen": 43451936, + "step": 74850 + }, + { + "epoch": 11.14909145070003, + "grad_norm": 0.039306640625, + "learning_rate": 0.014610113340435639, + "loss": 0.7834, + "num_input_tokens_seen": 43454560, + "step": 74855 + }, + { + "epoch": 11.14983616324099, + "grad_norm": 0.0205078125, + "learning_rate": 0.014608164349572385, + "loss": 0.8139, + "num_input_tokens_seen": 43457760, + "step": 74860 + }, + { + "epoch": 11.150580875781948, + "grad_norm": 0.04833984375, + "learning_rate": 0.014606215365328796, + "loss": 0.7838, + "num_input_tokens_seen": 43460992, + "step": 74865 + }, + { + "epoch": 11.151325588322907, + "grad_norm": 0.050537109375, + "learning_rate": 0.014604266387737793, + "loss": 0.7917, + "num_input_tokens_seen": 43463904, + "step": 74870 + }, + { + "epoch": 11.152070300863867, + "grad_norm": 0.041015625, + "learning_rate": 0.014602317416832312, + "loss": 0.7942, + "num_input_tokens_seen": 43466816, + "step": 74875 + }, + { + "epoch": 11.152815013404826, + "grad_norm": 0.022216796875, + "learning_rate": 0.014600368452645267, + "loss": 0.7955, + "num_input_tokens_seen": 43469568, + "step": 74880 + }, + { + "epoch": 11.153559725945785, + "grad_norm": 0.031982421875, + "learning_rate": 0.014598419495209599, + "loss": 0.7871, + "num_input_tokens_seen": 43472576, + "step": 74885 + }, + { + "epoch": 11.154304438486744, + "grad_norm": 0.04345703125, + "learning_rate": 0.014596470544558217, + "loss": 0.8004, + "num_input_tokens_seen": 43475392, + "step": 74890 + }, + { + "epoch": 11.155049151027704, + "grad_norm": 0.0234375, + "learning_rate": 0.014594521600724062, + "loss": 0.8155, + "num_input_tokens_seen": 43478336, + "step": 74895 + }, + { + "epoch": 11.155793863568663, + "grad_norm": 0.0224609375, + "learning_rate": 0.014592572663740049, + "loss": 0.7987, + "num_input_tokens_seen": 43481216, + "step": 74900 + }, + { + "epoch": 11.156538576109622, + "grad_norm": 0.030029296875, + "learning_rate": 0.014590623733639112, + "loss": 0.778, + "num_input_tokens_seen": 43484416, + "step": 74905 + }, + { + "epoch": 11.15728328865058, + "grad_norm": 0.041015625, + "learning_rate": 0.014588674810454168, + "loss": 0.7961, + "num_input_tokens_seen": 43487040, + "step": 74910 + }, + { + "epoch": 11.158028001191541, + "grad_norm": 0.03515625, + "learning_rate": 0.014586725894218139, + "loss": 0.8254, + "num_input_tokens_seen": 43489952, + "step": 74915 + }, + { + "epoch": 11.1587727137325, + "grad_norm": 0.039306640625, + "learning_rate": 0.014584776984963964, + "loss": 0.8002, + "num_input_tokens_seen": 43492832, + "step": 74920 + }, + { + "epoch": 11.159517426273458, + "grad_norm": 0.0223388671875, + "learning_rate": 0.014582828082724554, + "loss": 0.7997, + "num_input_tokens_seen": 43495584, + "step": 74925 + }, + { + "epoch": 11.160262138814417, + "grad_norm": 0.028076171875, + "learning_rate": 0.014580879187532843, + "loss": 0.7865, + "num_input_tokens_seen": 43498496, + "step": 74930 + }, + { + "epoch": 11.161006851355376, + "grad_norm": 0.02880859375, + "learning_rate": 0.014578930299421752, + "loss": 0.8164, + "num_input_tokens_seen": 43501632, + "step": 74935 + }, + { + "epoch": 11.161751563896336, + "grad_norm": 0.033935546875, + "learning_rate": 0.014576981418424206, + "loss": 0.7938, + "num_input_tokens_seen": 43504544, + "step": 74940 + }, + { + "epoch": 11.162496276437295, + "grad_norm": 0.03173828125, + "learning_rate": 0.014575032544573131, + "loss": 0.8536, + "num_input_tokens_seen": 43507392, + "step": 74945 + }, + { + "epoch": 11.163240988978254, + "grad_norm": 0.0235595703125, + "learning_rate": 0.014573083677901442, + "loss": 0.783, + "num_input_tokens_seen": 43510176, + "step": 74950 + }, + { + "epoch": 11.163985701519213, + "grad_norm": 0.064453125, + "learning_rate": 0.014571134818442077, + "loss": 0.8138, + "num_input_tokens_seen": 43512960, + "step": 74955 + }, + { + "epoch": 11.164730414060173, + "grad_norm": 0.056640625, + "learning_rate": 0.014569185966227947, + "loss": 0.8171, + "num_input_tokens_seen": 43515712, + "step": 74960 + }, + { + "epoch": 11.165475126601132, + "grad_norm": 0.045654296875, + "learning_rate": 0.014567237121291989, + "loss": 0.7908, + "num_input_tokens_seen": 43518528, + "step": 74965 + }, + { + "epoch": 11.16621983914209, + "grad_norm": 0.030029296875, + "learning_rate": 0.014565288283667117, + "loss": 0.7988, + "num_input_tokens_seen": 43521344, + "step": 74970 + }, + { + "epoch": 11.16696455168305, + "grad_norm": 0.05224609375, + "learning_rate": 0.01456333945338626, + "loss": 0.8089, + "num_input_tokens_seen": 43524000, + "step": 74975 + }, + { + "epoch": 11.16770926422401, + "grad_norm": 0.032958984375, + "learning_rate": 0.014561390630482333, + "loss": 0.7888, + "num_input_tokens_seen": 43526752, + "step": 74980 + }, + { + "epoch": 11.168453976764969, + "grad_norm": 0.04833984375, + "learning_rate": 0.014559441814988272, + "loss": 0.8021, + "num_input_tokens_seen": 43529696, + "step": 74985 + }, + { + "epoch": 11.169198689305928, + "grad_norm": 0.031982421875, + "learning_rate": 0.014557493006936994, + "loss": 0.8042, + "num_input_tokens_seen": 43532352, + "step": 74990 + }, + { + "epoch": 11.169943401846886, + "grad_norm": 0.030029296875, + "learning_rate": 0.014555544206361414, + "loss": 0.7923, + "num_input_tokens_seen": 43535296, + "step": 74995 + }, + { + "epoch": 11.170688114387847, + "grad_norm": 0.0186767578125, + "learning_rate": 0.014553595413294472, + "loss": 0.8247, + "num_input_tokens_seen": 43538496, + "step": 75000 + }, + { + "epoch": 11.171432826928806, + "grad_norm": 0.02978515625, + "learning_rate": 0.014551646627769078, + "loss": 0.8245, + "num_input_tokens_seen": 43541408, + "step": 75005 + }, + { + "epoch": 11.172177539469764, + "grad_norm": 0.033447265625, + "learning_rate": 0.014549697849818159, + "loss": 0.8099, + "num_input_tokens_seen": 43544672, + "step": 75010 + }, + { + "epoch": 11.172922252010723, + "grad_norm": 0.036865234375, + "learning_rate": 0.014547749079474634, + "loss": 0.8075, + "num_input_tokens_seen": 43547680, + "step": 75015 + }, + { + "epoch": 11.173666964551684, + "grad_norm": 0.02880859375, + "learning_rate": 0.014545800316771433, + "loss": 0.8077, + "num_input_tokens_seen": 43550656, + "step": 75020 + }, + { + "epoch": 11.174411677092642, + "grad_norm": 0.047607421875, + "learning_rate": 0.014543851561741475, + "loss": 0.7993, + "num_input_tokens_seen": 43553664, + "step": 75025 + }, + { + "epoch": 11.175156389633601, + "grad_norm": 0.050537109375, + "learning_rate": 0.014541902814417675, + "loss": 0.8192, + "num_input_tokens_seen": 43556576, + "step": 75030 + }, + { + "epoch": 11.17590110217456, + "grad_norm": 0.027099609375, + "learning_rate": 0.014539954074832965, + "loss": 0.8105, + "num_input_tokens_seen": 43559168, + "step": 75035 + }, + { + "epoch": 11.17664581471552, + "grad_norm": 0.03466796875, + "learning_rate": 0.014538005343020263, + "loss": 0.7984, + "num_input_tokens_seen": 43561792, + "step": 75040 + }, + { + "epoch": 11.17739052725648, + "grad_norm": 0.0301513671875, + "learning_rate": 0.014536056619012496, + "loss": 0.786, + "num_input_tokens_seen": 43564768, + "step": 75045 + }, + { + "epoch": 11.178135239797438, + "grad_norm": 0.04052734375, + "learning_rate": 0.014534107902842573, + "loss": 0.7825, + "num_input_tokens_seen": 43567648, + "step": 75050 + }, + { + "epoch": 11.178879952338397, + "grad_norm": 0.0380859375, + "learning_rate": 0.014532159194543428, + "loss": 0.7855, + "num_input_tokens_seen": 43570528, + "step": 75055 + }, + { + "epoch": 11.179624664879357, + "grad_norm": 0.025390625, + "learning_rate": 0.014530210494147974, + "loss": 0.7997, + "num_input_tokens_seen": 43573440, + "step": 75060 + }, + { + "epoch": 11.180369377420316, + "grad_norm": 0.033935546875, + "learning_rate": 0.014528261801689143, + "loss": 0.8353, + "num_input_tokens_seen": 43576448, + "step": 75065 + }, + { + "epoch": 11.181114089961275, + "grad_norm": 0.03466796875, + "learning_rate": 0.014526313117199843, + "loss": 0.8076, + "num_input_tokens_seen": 43579264, + "step": 75070 + }, + { + "epoch": 11.181858802502234, + "grad_norm": 0.0341796875, + "learning_rate": 0.014524364440713004, + "loss": 0.7876, + "num_input_tokens_seen": 43582176, + "step": 75075 + }, + { + "epoch": 11.182603515043194, + "grad_norm": 0.0302734375, + "learning_rate": 0.014522415772261547, + "loss": 0.8088, + "num_input_tokens_seen": 43585088, + "step": 75080 + }, + { + "epoch": 11.183348227584153, + "grad_norm": 0.0303955078125, + "learning_rate": 0.014520467111878384, + "loss": 0.7899, + "num_input_tokens_seen": 43587744, + "step": 75085 + }, + { + "epoch": 11.184092940125112, + "grad_norm": 0.0281982421875, + "learning_rate": 0.014518518459596447, + "loss": 0.792, + "num_input_tokens_seen": 43590528, + "step": 75090 + }, + { + "epoch": 11.18483765266607, + "grad_norm": 0.0291748046875, + "learning_rate": 0.014516569815448648, + "loss": 0.8091, + "num_input_tokens_seen": 43593248, + "step": 75095 + }, + { + "epoch": 11.18558236520703, + "grad_norm": 0.031494140625, + "learning_rate": 0.014514621179467915, + "loss": 0.8096, + "num_input_tokens_seen": 43596160, + "step": 75100 + }, + { + "epoch": 11.18632707774799, + "grad_norm": 0.020751953125, + "learning_rate": 0.014512672551687155, + "loss": 0.7951, + "num_input_tokens_seen": 43598880, + "step": 75105 + }, + { + "epoch": 11.187071790288948, + "grad_norm": 0.0400390625, + "learning_rate": 0.014510723932139306, + "loss": 0.7931, + "num_input_tokens_seen": 43601696, + "step": 75110 + }, + { + "epoch": 11.187816502829907, + "grad_norm": 0.03173828125, + "learning_rate": 0.014508775320857278, + "loss": 0.79, + "num_input_tokens_seen": 43604672, + "step": 75115 + }, + { + "epoch": 11.188561215370866, + "grad_norm": 0.034423828125, + "learning_rate": 0.014506826717873986, + "loss": 0.7816, + "num_input_tokens_seen": 43607424, + "step": 75120 + }, + { + "epoch": 11.189305927911827, + "grad_norm": 0.055419921875, + "learning_rate": 0.01450487812322236, + "loss": 0.8301, + "num_input_tokens_seen": 43610528, + "step": 75125 + }, + { + "epoch": 11.190050640452785, + "grad_norm": 0.03076171875, + "learning_rate": 0.014502929536935313, + "loss": 0.79, + "num_input_tokens_seen": 43613376, + "step": 75130 + }, + { + "epoch": 11.190795352993744, + "grad_norm": 0.027099609375, + "learning_rate": 0.01450098095904577, + "loss": 0.7869, + "num_input_tokens_seen": 43616384, + "step": 75135 + }, + { + "epoch": 11.191540065534703, + "grad_norm": 0.04296875, + "learning_rate": 0.01449903238958664, + "loss": 0.8021, + "num_input_tokens_seen": 43619104, + "step": 75140 + }, + { + "epoch": 11.192284778075663, + "grad_norm": 0.03857421875, + "learning_rate": 0.014497083828590853, + "loss": 0.7964, + "num_input_tokens_seen": 43622080, + "step": 75145 + }, + { + "epoch": 11.193029490616622, + "grad_norm": 0.022705078125, + "learning_rate": 0.014495135276091326, + "loss": 0.8167, + "num_input_tokens_seen": 43625056, + "step": 75150 + }, + { + "epoch": 11.19377420315758, + "grad_norm": 0.03515625, + "learning_rate": 0.014493186732120968, + "loss": 0.7992, + "num_input_tokens_seen": 43628096, + "step": 75155 + }, + { + "epoch": 11.19451891569854, + "grad_norm": 0.043701171875, + "learning_rate": 0.014491238196712712, + "loss": 0.774, + "num_input_tokens_seen": 43630912, + "step": 75160 + }, + { + "epoch": 11.1952636282395, + "grad_norm": 0.0194091796875, + "learning_rate": 0.014489289669899465, + "loss": 0.811, + "num_input_tokens_seen": 43633664, + "step": 75165 + }, + { + "epoch": 11.196008340780459, + "grad_norm": 0.02978515625, + "learning_rate": 0.014487341151714154, + "loss": 0.7914, + "num_input_tokens_seen": 43636512, + "step": 75170 + }, + { + "epoch": 11.196753053321418, + "grad_norm": 0.0439453125, + "learning_rate": 0.014485392642189687, + "loss": 0.7744, + "num_input_tokens_seen": 43639488, + "step": 75175 + }, + { + "epoch": 11.197497765862376, + "grad_norm": 0.0322265625, + "learning_rate": 0.014483444141358994, + "loss": 0.8096, + "num_input_tokens_seen": 43642240, + "step": 75180 + }, + { + "epoch": 11.198242478403337, + "grad_norm": 0.03759765625, + "learning_rate": 0.01448149564925498, + "loss": 0.8142, + "num_input_tokens_seen": 43644992, + "step": 75185 + }, + { + "epoch": 11.198987190944296, + "grad_norm": 0.0291748046875, + "learning_rate": 0.01447954716591058, + "loss": 0.7903, + "num_input_tokens_seen": 43647872, + "step": 75190 + }, + { + "epoch": 11.199731903485254, + "grad_norm": 0.0322265625, + "learning_rate": 0.014477598691358699, + "loss": 0.8146, + "num_input_tokens_seen": 43650720, + "step": 75195 + }, + { + "epoch": 11.200476616026213, + "grad_norm": 0.02587890625, + "learning_rate": 0.014475650225632254, + "loss": 0.8254, + "num_input_tokens_seen": 43653312, + "step": 75200 + }, + { + "epoch": 11.201221328567174, + "grad_norm": 0.0289306640625, + "learning_rate": 0.014473701768764169, + "loss": 0.8034, + "num_input_tokens_seen": 43656000, + "step": 75205 + }, + { + "epoch": 11.201966041108133, + "grad_norm": 0.0250244140625, + "learning_rate": 0.014471753320787352, + "loss": 0.8118, + "num_input_tokens_seen": 43658784, + "step": 75210 + }, + { + "epoch": 11.202710753649091, + "grad_norm": 0.038330078125, + "learning_rate": 0.014469804881734731, + "loss": 0.7905, + "num_input_tokens_seen": 43661664, + "step": 75215 + }, + { + "epoch": 11.20345546619005, + "grad_norm": 0.033203125, + "learning_rate": 0.014467856451639212, + "loss": 0.8139, + "num_input_tokens_seen": 43664544, + "step": 75220 + }, + { + "epoch": 11.20420017873101, + "grad_norm": 0.036865234375, + "learning_rate": 0.014465908030533723, + "loss": 0.8162, + "num_input_tokens_seen": 43667776, + "step": 75225 + }, + { + "epoch": 11.20494489127197, + "grad_norm": 0.035888671875, + "learning_rate": 0.014463959618451176, + "loss": 0.7933, + "num_input_tokens_seen": 43670688, + "step": 75230 + }, + { + "epoch": 11.205689603812928, + "grad_norm": 0.0238037109375, + "learning_rate": 0.014462011215424482, + "loss": 0.7993, + "num_input_tokens_seen": 43673792, + "step": 75235 + }, + { + "epoch": 11.206434316353887, + "grad_norm": 0.032958984375, + "learning_rate": 0.014460062821486567, + "loss": 0.8068, + "num_input_tokens_seen": 43676800, + "step": 75240 + }, + { + "epoch": 11.207179028894847, + "grad_norm": 0.045166015625, + "learning_rate": 0.014458114436670333, + "loss": 0.8072, + "num_input_tokens_seen": 43679392, + "step": 75245 + }, + { + "epoch": 11.207923741435806, + "grad_norm": 0.03271484375, + "learning_rate": 0.014456166061008713, + "loss": 0.8018, + "num_input_tokens_seen": 43681984, + "step": 75250 + }, + { + "epoch": 11.208668453976765, + "grad_norm": 0.02392578125, + "learning_rate": 0.014454217694534607, + "loss": 0.8181, + "num_input_tokens_seen": 43684864, + "step": 75255 + }, + { + "epoch": 11.209413166517724, + "grad_norm": 0.0201416015625, + "learning_rate": 0.014452269337280946, + "loss": 0.7872, + "num_input_tokens_seen": 43687776, + "step": 75260 + }, + { + "epoch": 11.210157879058684, + "grad_norm": 0.0311279296875, + "learning_rate": 0.014450320989280634, + "loss": 0.7936, + "num_input_tokens_seen": 43690720, + "step": 75265 + }, + { + "epoch": 11.210902591599643, + "grad_norm": 0.0380859375, + "learning_rate": 0.014448372650566593, + "loss": 0.8153, + "num_input_tokens_seen": 43693696, + "step": 75270 + }, + { + "epoch": 11.211647304140602, + "grad_norm": 0.027587890625, + "learning_rate": 0.014446424321171736, + "loss": 0.7986, + "num_input_tokens_seen": 43696512, + "step": 75275 + }, + { + "epoch": 11.21239201668156, + "grad_norm": 0.029541015625, + "learning_rate": 0.014444476001128971, + "loss": 0.803, + "num_input_tokens_seen": 43699424, + "step": 75280 + }, + { + "epoch": 11.21313672922252, + "grad_norm": 0.041259765625, + "learning_rate": 0.014442527690471227, + "loss": 0.8075, + "num_input_tokens_seen": 43702176, + "step": 75285 + }, + { + "epoch": 11.21388144176348, + "grad_norm": 0.034423828125, + "learning_rate": 0.014440579389231403, + "loss": 0.7824, + "num_input_tokens_seen": 43705216, + "step": 75290 + }, + { + "epoch": 11.214626154304439, + "grad_norm": 0.03369140625, + "learning_rate": 0.014438631097442428, + "loss": 0.8111, + "num_input_tokens_seen": 43708256, + "step": 75295 + }, + { + "epoch": 11.215370866845397, + "grad_norm": 0.0308837890625, + "learning_rate": 0.014436682815137207, + "loss": 0.812, + "num_input_tokens_seen": 43711520, + "step": 75300 + }, + { + "epoch": 11.216115579386356, + "grad_norm": 0.0205078125, + "learning_rate": 0.014434734542348662, + "loss": 0.8134, + "num_input_tokens_seen": 43714368, + "step": 75305 + }, + { + "epoch": 11.216860291927317, + "grad_norm": 0.03564453125, + "learning_rate": 0.014432786279109702, + "loss": 0.789, + "num_input_tokens_seen": 43717216, + "step": 75310 + }, + { + "epoch": 11.217605004468275, + "grad_norm": 0.0274658203125, + "learning_rate": 0.014430838025453233, + "loss": 0.7994, + "num_input_tokens_seen": 43719936, + "step": 75315 + }, + { + "epoch": 11.218349717009234, + "grad_norm": 0.0235595703125, + "learning_rate": 0.014428889781412184, + "loss": 0.7996, + "num_input_tokens_seen": 43722784, + "step": 75320 + }, + { + "epoch": 11.219094429550193, + "grad_norm": 0.033203125, + "learning_rate": 0.014426941547019456, + "loss": 0.7792, + "num_input_tokens_seen": 43725792, + "step": 75325 + }, + { + "epoch": 11.219839142091153, + "grad_norm": 0.0308837890625, + "learning_rate": 0.014424993322307976, + "loss": 0.805, + "num_input_tokens_seen": 43728832, + "step": 75330 + }, + { + "epoch": 11.220583854632112, + "grad_norm": 0.0299072265625, + "learning_rate": 0.014423045107310644, + "loss": 0.7969, + "num_input_tokens_seen": 43731776, + "step": 75335 + }, + { + "epoch": 11.221328567173071, + "grad_norm": 0.035400390625, + "learning_rate": 0.014421096902060383, + "loss": 0.8199, + "num_input_tokens_seen": 43734656, + "step": 75340 + }, + { + "epoch": 11.22207327971403, + "grad_norm": 0.023193359375, + "learning_rate": 0.014419148706590093, + "loss": 0.8082, + "num_input_tokens_seen": 43737664, + "step": 75345 + }, + { + "epoch": 11.22281799225499, + "grad_norm": 0.044677734375, + "learning_rate": 0.014417200520932703, + "loss": 0.7878, + "num_input_tokens_seen": 43740544, + "step": 75350 + }, + { + "epoch": 11.223562704795949, + "grad_norm": 0.0380859375, + "learning_rate": 0.014415252345121119, + "loss": 0.7999, + "num_input_tokens_seen": 43743232, + "step": 75355 + }, + { + "epoch": 11.224307417336908, + "grad_norm": 0.034423828125, + "learning_rate": 0.014413304179188244, + "loss": 0.8211, + "num_input_tokens_seen": 43746272, + "step": 75360 + }, + { + "epoch": 11.225052129877866, + "grad_norm": 0.04541015625, + "learning_rate": 0.014411356023167007, + "loss": 0.7918, + "num_input_tokens_seen": 43749184, + "step": 75365 + }, + { + "epoch": 11.225796842418827, + "grad_norm": 0.03076171875, + "learning_rate": 0.014409407877090307, + "loss": 0.8219, + "num_input_tokens_seen": 43752192, + "step": 75370 + }, + { + "epoch": 11.226541554959786, + "grad_norm": 0.031005859375, + "learning_rate": 0.014407459740991065, + "loss": 0.7883, + "num_input_tokens_seen": 43754944, + "step": 75375 + }, + { + "epoch": 11.227286267500745, + "grad_norm": 0.035400390625, + "learning_rate": 0.014405511614902182, + "loss": 0.7967, + "num_input_tokens_seen": 43758016, + "step": 75380 + }, + { + "epoch": 11.228030980041703, + "grad_norm": 0.02490234375, + "learning_rate": 0.014403563498856582, + "loss": 0.7942, + "num_input_tokens_seen": 43760992, + "step": 75385 + }, + { + "epoch": 11.228775692582664, + "grad_norm": 0.03125, + "learning_rate": 0.014401615392887169, + "loss": 0.7939, + "num_input_tokens_seen": 43764192, + "step": 75390 + }, + { + "epoch": 11.229520405123623, + "grad_norm": 0.04931640625, + "learning_rate": 0.014399667297026854, + "loss": 0.8317, + "num_input_tokens_seen": 43766880, + "step": 75395 + }, + { + "epoch": 11.230265117664581, + "grad_norm": 0.0296630859375, + "learning_rate": 0.01439771921130855, + "loss": 0.8053, + "num_input_tokens_seen": 43769696, + "step": 75400 + }, + { + "epoch": 11.23100983020554, + "grad_norm": 0.06640625, + "learning_rate": 0.014395771135765168, + "loss": 0.7953, + "num_input_tokens_seen": 43772640, + "step": 75405 + }, + { + "epoch": 11.2317545427465, + "grad_norm": 0.046875, + "learning_rate": 0.014393823070429622, + "loss": 0.7947, + "num_input_tokens_seen": 43775712, + "step": 75410 + }, + { + "epoch": 11.23249925528746, + "grad_norm": 0.06005859375, + "learning_rate": 0.01439187501533481, + "loss": 0.8015, + "num_input_tokens_seen": 43778592, + "step": 75415 + }, + { + "epoch": 11.233243967828418, + "grad_norm": 0.036376953125, + "learning_rate": 0.01438992697051366, + "loss": 0.8, + "num_input_tokens_seen": 43781472, + "step": 75420 + }, + { + "epoch": 11.233988680369377, + "grad_norm": 0.040771484375, + "learning_rate": 0.014387978935999073, + "loss": 0.7835, + "num_input_tokens_seen": 43784512, + "step": 75425 + }, + { + "epoch": 11.234733392910337, + "grad_norm": 0.0299072265625, + "learning_rate": 0.01438603091182396, + "loss": 0.799, + "num_input_tokens_seen": 43787264, + "step": 75430 + }, + { + "epoch": 11.235478105451296, + "grad_norm": 0.0322265625, + "learning_rate": 0.01438408289802123, + "loss": 0.8038, + "num_input_tokens_seen": 43789952, + "step": 75435 + }, + { + "epoch": 11.236222817992255, + "grad_norm": 0.03759765625, + "learning_rate": 0.01438213489462379, + "loss": 0.8045, + "num_input_tokens_seen": 43792864, + "step": 75440 + }, + { + "epoch": 11.236967530533214, + "grad_norm": 0.037109375, + "learning_rate": 0.014380186901664557, + "loss": 0.8156, + "num_input_tokens_seen": 43795616, + "step": 75445 + }, + { + "epoch": 11.237712243074174, + "grad_norm": 0.027099609375, + "learning_rate": 0.014378238919176431, + "loss": 0.798, + "num_input_tokens_seen": 43798528, + "step": 75450 + }, + { + "epoch": 11.238456955615133, + "grad_norm": 0.031982421875, + "learning_rate": 0.014376290947192331, + "loss": 0.8025, + "num_input_tokens_seen": 43801440, + "step": 75455 + }, + { + "epoch": 11.239201668156092, + "grad_norm": 0.05029296875, + "learning_rate": 0.01437434298574516, + "loss": 0.8067, + "num_input_tokens_seen": 43804320, + "step": 75460 + }, + { + "epoch": 11.23994638069705, + "grad_norm": 0.041748046875, + "learning_rate": 0.014372395034867832, + "loss": 0.7914, + "num_input_tokens_seen": 43806944, + "step": 75465 + }, + { + "epoch": 11.24069109323801, + "grad_norm": 0.02685546875, + "learning_rate": 0.014370447094593251, + "loss": 0.8247, + "num_input_tokens_seen": 43809632, + "step": 75470 + }, + { + "epoch": 11.24143580577897, + "grad_norm": 0.05078125, + "learning_rate": 0.014368499164954321, + "loss": 0.8128, + "num_input_tokens_seen": 43812096, + "step": 75475 + }, + { + "epoch": 11.242180518319929, + "grad_norm": 0.037353515625, + "learning_rate": 0.014366551245983962, + "loss": 0.8176, + "num_input_tokens_seen": 43814688, + "step": 75480 + }, + { + "epoch": 11.242925230860887, + "grad_norm": 0.064453125, + "learning_rate": 0.01436460333771507, + "loss": 0.7904, + "num_input_tokens_seen": 43817632, + "step": 75485 + }, + { + "epoch": 11.243669943401846, + "grad_norm": 0.049560546875, + "learning_rate": 0.014362655440180566, + "loss": 0.8086, + "num_input_tokens_seen": 43820288, + "step": 75490 + }, + { + "epoch": 11.244414655942807, + "grad_norm": 0.0517578125, + "learning_rate": 0.014360707553413345, + "loss": 0.7876, + "num_input_tokens_seen": 43823008, + "step": 75495 + }, + { + "epoch": 11.245159368483765, + "grad_norm": 0.02880859375, + "learning_rate": 0.014358759677446323, + "loss": 0.7923, + "num_input_tokens_seen": 43826336, + "step": 75500 + }, + { + "epoch": 11.245904081024724, + "grad_norm": 0.0498046875, + "learning_rate": 0.0143568118123124, + "loss": 0.8103, + "num_input_tokens_seen": 43829472, + "step": 75505 + }, + { + "epoch": 11.246648793565683, + "grad_norm": 0.043701171875, + "learning_rate": 0.014354863958044493, + "loss": 0.8105, + "num_input_tokens_seen": 43832448, + "step": 75510 + }, + { + "epoch": 11.247393506106643, + "grad_norm": 0.038330078125, + "learning_rate": 0.014352916114675504, + "loss": 0.7891, + "num_input_tokens_seen": 43835488, + "step": 75515 + }, + { + "epoch": 11.248138218647602, + "grad_norm": 0.031005859375, + "learning_rate": 0.014350968282238334, + "loss": 0.8075, + "num_input_tokens_seen": 43838368, + "step": 75520 + }, + { + "epoch": 11.248882931188561, + "grad_norm": 0.03759765625, + "learning_rate": 0.014349020460765902, + "loss": 0.8005, + "num_input_tokens_seen": 43841152, + "step": 75525 + }, + { + "epoch": 11.24962764372952, + "grad_norm": 0.030517578125, + "learning_rate": 0.014347072650291103, + "loss": 0.7974, + "num_input_tokens_seen": 43843936, + "step": 75530 + }, + { + "epoch": 11.25037235627048, + "grad_norm": 0.030029296875, + "learning_rate": 0.01434512485084685, + "loss": 0.8037, + "num_input_tokens_seen": 43846720, + "step": 75535 + }, + { + "epoch": 11.251117068811439, + "grad_norm": 0.0308837890625, + "learning_rate": 0.014343177062466045, + "loss": 0.7887, + "num_input_tokens_seen": 43849728, + "step": 75540 + }, + { + "epoch": 11.251861781352398, + "grad_norm": 0.08203125, + "learning_rate": 0.014341229285181598, + "loss": 0.8315, + "num_input_tokens_seen": 43852672, + "step": 75545 + }, + { + "epoch": 11.252606493893357, + "grad_norm": 0.0186767578125, + "learning_rate": 0.014339281519026414, + "loss": 0.7971, + "num_input_tokens_seen": 43855360, + "step": 75550 + }, + { + "epoch": 11.253351206434317, + "grad_norm": 0.035888671875, + "learning_rate": 0.014337333764033393, + "loss": 0.7823, + "num_input_tokens_seen": 43858144, + "step": 75555 + }, + { + "epoch": 11.254095918975276, + "grad_norm": 0.0194091796875, + "learning_rate": 0.014335386020235447, + "loss": 0.7837, + "num_input_tokens_seen": 43860992, + "step": 75560 + }, + { + "epoch": 11.254840631516235, + "grad_norm": 0.0322265625, + "learning_rate": 0.014333438287665479, + "loss": 0.7866, + "num_input_tokens_seen": 43863936, + "step": 75565 + }, + { + "epoch": 11.255585344057193, + "grad_norm": 0.03076171875, + "learning_rate": 0.014331490566356395, + "loss": 0.8051, + "num_input_tokens_seen": 43866912, + "step": 75570 + }, + { + "epoch": 11.256330056598154, + "grad_norm": 0.032958984375, + "learning_rate": 0.014329542856341092, + "loss": 0.8066, + "num_input_tokens_seen": 43869696, + "step": 75575 + }, + { + "epoch": 11.257074769139113, + "grad_norm": 0.030517578125, + "learning_rate": 0.01432759515765249, + "loss": 0.7811, + "num_input_tokens_seen": 43873056, + "step": 75580 + }, + { + "epoch": 11.257819481680071, + "grad_norm": 0.0322265625, + "learning_rate": 0.014325647470323475, + "loss": 0.7989, + "num_input_tokens_seen": 43875872, + "step": 75585 + }, + { + "epoch": 11.25856419422103, + "grad_norm": 0.019287109375, + "learning_rate": 0.014323699794386968, + "loss": 0.8178, + "num_input_tokens_seen": 43878784, + "step": 75590 + }, + { + "epoch": 11.25930890676199, + "grad_norm": 0.0289306640625, + "learning_rate": 0.014321752129875867, + "loss": 0.821, + "num_input_tokens_seen": 43881760, + "step": 75595 + }, + { + "epoch": 11.26005361930295, + "grad_norm": 0.05615234375, + "learning_rate": 0.014319804476823069, + "loss": 0.7889, + "num_input_tokens_seen": 43884544, + "step": 75600 + }, + { + "epoch": 11.260798331843908, + "grad_norm": 0.0283203125, + "learning_rate": 0.014317856835261487, + "loss": 0.7918, + "num_input_tokens_seen": 43887392, + "step": 75605 + }, + { + "epoch": 11.261543044384867, + "grad_norm": 0.047607421875, + "learning_rate": 0.014315909205224013, + "loss": 0.8009, + "num_input_tokens_seen": 43890144, + "step": 75610 + }, + { + "epoch": 11.262287756925826, + "grad_norm": 0.03515625, + "learning_rate": 0.014313961586743565, + "loss": 0.8018, + "num_input_tokens_seen": 43892832, + "step": 75615 + }, + { + "epoch": 11.263032469466786, + "grad_norm": 0.0498046875, + "learning_rate": 0.014312013979853034, + "loss": 0.8134, + "num_input_tokens_seen": 43895680, + "step": 75620 + }, + { + "epoch": 11.263777182007745, + "grad_norm": 0.039794921875, + "learning_rate": 0.014310066384585332, + "loss": 0.7946, + "num_input_tokens_seen": 43898720, + "step": 75625 + }, + { + "epoch": 11.264521894548704, + "grad_norm": 0.0213623046875, + "learning_rate": 0.014308118800973359, + "loss": 0.8009, + "num_input_tokens_seen": 43901504, + "step": 75630 + }, + { + "epoch": 11.265266607089663, + "grad_norm": 0.04150390625, + "learning_rate": 0.01430617122905001, + "loss": 0.7966, + "num_input_tokens_seen": 43904704, + "step": 75635 + }, + { + "epoch": 11.266011319630623, + "grad_norm": 0.03271484375, + "learning_rate": 0.014304223668848199, + "loss": 0.795, + "num_input_tokens_seen": 43907392, + "step": 75640 + }, + { + "epoch": 11.266756032171582, + "grad_norm": 0.0306396484375, + "learning_rate": 0.014302276120400812, + "loss": 0.8145, + "num_input_tokens_seen": 43910176, + "step": 75645 + }, + { + "epoch": 11.26750074471254, + "grad_norm": 0.039306640625, + "learning_rate": 0.01430032858374077, + "loss": 0.7782, + "num_input_tokens_seen": 43912960, + "step": 75650 + }, + { + "epoch": 11.2682454572535, + "grad_norm": 0.03125, + "learning_rate": 0.01429838105890096, + "loss": 0.7875, + "num_input_tokens_seen": 43915552, + "step": 75655 + }, + { + "epoch": 11.26899016979446, + "grad_norm": 0.0277099609375, + "learning_rate": 0.01429643354591429, + "loss": 0.7887, + "num_input_tokens_seen": 43918464, + "step": 75660 + }, + { + "epoch": 11.269734882335419, + "grad_norm": 0.0361328125, + "learning_rate": 0.014294486044813662, + "loss": 0.7917, + "num_input_tokens_seen": 43921792, + "step": 75665 + }, + { + "epoch": 11.270479594876377, + "grad_norm": 0.057861328125, + "learning_rate": 0.014292538555631978, + "loss": 0.8157, + "num_input_tokens_seen": 43924448, + "step": 75670 + }, + { + "epoch": 11.271224307417336, + "grad_norm": 0.03271484375, + "learning_rate": 0.014290591078402134, + "loss": 0.8176, + "num_input_tokens_seen": 43927456, + "step": 75675 + }, + { + "epoch": 11.271969019958297, + "grad_norm": 0.042236328125, + "learning_rate": 0.014288643613157027, + "loss": 0.7951, + "num_input_tokens_seen": 43930592, + "step": 75680 + }, + { + "epoch": 11.272713732499255, + "grad_norm": 0.04638671875, + "learning_rate": 0.014286696159929571, + "loss": 0.802, + "num_input_tokens_seen": 43933728, + "step": 75685 + }, + { + "epoch": 11.273458445040214, + "grad_norm": 0.041259765625, + "learning_rate": 0.01428474871875265, + "loss": 0.8039, + "num_input_tokens_seen": 43936736, + "step": 75690 + }, + { + "epoch": 11.274203157581173, + "grad_norm": 0.09375, + "learning_rate": 0.01428280128965918, + "loss": 0.8165, + "num_input_tokens_seen": 43939616, + "step": 75695 + }, + { + "epoch": 11.274947870122134, + "grad_norm": 0.048828125, + "learning_rate": 0.01428085387268205, + "loss": 0.8071, + "num_input_tokens_seen": 43942720, + "step": 75700 + }, + { + "epoch": 11.275692582663092, + "grad_norm": 0.044921875, + "learning_rate": 0.014278906467854166, + "loss": 0.7854, + "num_input_tokens_seen": 43945536, + "step": 75705 + }, + { + "epoch": 11.276437295204051, + "grad_norm": 0.04150390625, + "learning_rate": 0.014276959075208416, + "loss": 0.7966, + "num_input_tokens_seen": 43948672, + "step": 75710 + }, + { + "epoch": 11.27718200774501, + "grad_norm": 0.032470703125, + "learning_rate": 0.014275011694777717, + "loss": 0.7891, + "num_input_tokens_seen": 43951488, + "step": 75715 + }, + { + "epoch": 11.27792672028597, + "grad_norm": 0.041259765625, + "learning_rate": 0.014273064326594956, + "loss": 0.796, + "num_input_tokens_seen": 43954432, + "step": 75720 + }, + { + "epoch": 11.278671432826929, + "grad_norm": 0.048828125, + "learning_rate": 0.014271116970693035, + "loss": 0.7882, + "num_input_tokens_seen": 43957280, + "step": 75725 + }, + { + "epoch": 11.279416145367888, + "grad_norm": 0.029541015625, + "learning_rate": 0.014269169627104849, + "loss": 0.8171, + "num_input_tokens_seen": 43959872, + "step": 75730 + }, + { + "epoch": 11.280160857908847, + "grad_norm": 0.037841796875, + "learning_rate": 0.014267222295863299, + "loss": 0.7774, + "num_input_tokens_seen": 43962560, + "step": 75735 + }, + { + "epoch": 11.280905570449807, + "grad_norm": 0.04931640625, + "learning_rate": 0.014265274977001287, + "loss": 0.8112, + "num_input_tokens_seen": 43965600, + "step": 75740 + }, + { + "epoch": 11.281650282990766, + "grad_norm": 0.0322265625, + "learning_rate": 0.014263327670551702, + "loss": 0.7927, + "num_input_tokens_seen": 43968512, + "step": 75745 + }, + { + "epoch": 11.282394995531725, + "grad_norm": 0.041748046875, + "learning_rate": 0.014261380376547454, + "loss": 0.8141, + "num_input_tokens_seen": 43971808, + "step": 75750 + }, + { + "epoch": 11.283139708072683, + "grad_norm": 0.040771484375, + "learning_rate": 0.014259433095021434, + "loss": 0.7789, + "num_input_tokens_seen": 43974816, + "step": 75755 + }, + { + "epoch": 11.283884420613644, + "grad_norm": 0.04248046875, + "learning_rate": 0.014257485826006536, + "loss": 0.8, + "num_input_tokens_seen": 43978016, + "step": 75760 + }, + { + "epoch": 11.284629133154603, + "grad_norm": 0.033447265625, + "learning_rate": 0.014255538569535662, + "loss": 0.8084, + "num_input_tokens_seen": 43981184, + "step": 75765 + }, + { + "epoch": 11.285373845695561, + "grad_norm": 0.020751953125, + "learning_rate": 0.0142535913256417, + "loss": 0.7962, + "num_input_tokens_seen": 43984480, + "step": 75770 + }, + { + "epoch": 11.28611855823652, + "grad_norm": 0.036376953125, + "learning_rate": 0.014251644094357563, + "loss": 0.8069, + "num_input_tokens_seen": 43987424, + "step": 75775 + }, + { + "epoch": 11.28686327077748, + "grad_norm": 0.050537109375, + "learning_rate": 0.014249696875716132, + "loss": 0.805, + "num_input_tokens_seen": 43990144, + "step": 75780 + }, + { + "epoch": 11.28760798331844, + "grad_norm": 0.043212890625, + "learning_rate": 0.014247749669750315, + "loss": 0.8059, + "num_input_tokens_seen": 43993248, + "step": 75785 + }, + { + "epoch": 11.288352695859398, + "grad_norm": 0.031982421875, + "learning_rate": 0.014245802476492998, + "loss": 0.796, + "num_input_tokens_seen": 43997184, + "step": 75790 + }, + { + "epoch": 11.289097408400357, + "grad_norm": 0.03076171875, + "learning_rate": 0.014243855295977086, + "loss": 0.7873, + "num_input_tokens_seen": 44000192, + "step": 75795 + }, + { + "epoch": 11.289842120941316, + "grad_norm": 0.0286865234375, + "learning_rate": 0.014241908128235472, + "loss": 0.7772, + "num_input_tokens_seen": 44003232, + "step": 75800 + }, + { + "epoch": 11.290586833482276, + "grad_norm": 0.0279541015625, + "learning_rate": 0.014239960973301042, + "loss": 0.8148, + "num_input_tokens_seen": 44006208, + "step": 75805 + }, + { + "epoch": 11.291331546023235, + "grad_norm": 0.059326171875, + "learning_rate": 0.014238013831206707, + "loss": 0.7836, + "num_input_tokens_seen": 44008992, + "step": 75810 + }, + { + "epoch": 11.292076258564194, + "grad_norm": 0.0439453125, + "learning_rate": 0.014236066701985345, + "loss": 0.7895, + "num_input_tokens_seen": 44011904, + "step": 75815 + }, + { + "epoch": 11.292820971105153, + "grad_norm": 0.03564453125, + "learning_rate": 0.014234119585669867, + "loss": 0.782, + "num_input_tokens_seen": 44014656, + "step": 75820 + }, + { + "epoch": 11.293565683646113, + "grad_norm": 0.0301513671875, + "learning_rate": 0.014232172482293156, + "loss": 0.8189, + "num_input_tokens_seen": 44017472, + "step": 75825 + }, + { + "epoch": 11.294310396187072, + "grad_norm": 0.07373046875, + "learning_rate": 0.014230225391888115, + "loss": 0.8134, + "num_input_tokens_seen": 44020640, + "step": 75830 + }, + { + "epoch": 11.29505510872803, + "grad_norm": 0.0341796875, + "learning_rate": 0.014228278314487634, + "loss": 0.8135, + "num_input_tokens_seen": 44023648, + "step": 75835 + }, + { + "epoch": 11.29579982126899, + "grad_norm": 0.040283203125, + "learning_rate": 0.014226331250124598, + "loss": 0.8007, + "num_input_tokens_seen": 44026592, + "step": 75840 + }, + { + "epoch": 11.29654453380995, + "grad_norm": 0.048828125, + "learning_rate": 0.014224384198831918, + "loss": 0.7811, + "num_input_tokens_seen": 44029440, + "step": 75845 + }, + { + "epoch": 11.297289246350909, + "grad_norm": 0.0306396484375, + "learning_rate": 0.01422243716064247, + "loss": 0.7911, + "num_input_tokens_seen": 44032256, + "step": 75850 + }, + { + "epoch": 11.298033958891867, + "grad_norm": 0.032470703125, + "learning_rate": 0.014220490135589164, + "loss": 0.8087, + "num_input_tokens_seen": 44035456, + "step": 75855 + }, + { + "epoch": 11.298778671432826, + "grad_norm": 0.0380859375, + "learning_rate": 0.014218543123704882, + "loss": 0.7741, + "num_input_tokens_seen": 44038368, + "step": 75860 + }, + { + "epoch": 11.299523383973787, + "grad_norm": 0.03515625, + "learning_rate": 0.014216596125022522, + "loss": 0.8123, + "num_input_tokens_seen": 44041600, + "step": 75865 + }, + { + "epoch": 11.300268096514746, + "grad_norm": 0.03564453125, + "learning_rate": 0.014214649139574967, + "loss": 0.7942, + "num_input_tokens_seen": 44044736, + "step": 75870 + }, + { + "epoch": 11.301012809055704, + "grad_norm": 0.0361328125, + "learning_rate": 0.014212702167395124, + "loss": 0.8315, + "num_input_tokens_seen": 44047744, + "step": 75875 + }, + { + "epoch": 11.301757521596663, + "grad_norm": 0.048828125, + "learning_rate": 0.014210755208515878, + "loss": 0.7922, + "num_input_tokens_seen": 44050560, + "step": 75880 + }, + { + "epoch": 11.302502234137624, + "grad_norm": 0.039794921875, + "learning_rate": 0.014208808262970114, + "loss": 0.7891, + "num_input_tokens_seen": 44053344, + "step": 75885 + }, + { + "epoch": 11.303246946678582, + "grad_norm": 0.032958984375, + "learning_rate": 0.014206861330790736, + "loss": 0.7936, + "num_input_tokens_seen": 44056000, + "step": 75890 + }, + { + "epoch": 11.303991659219541, + "grad_norm": 0.051025390625, + "learning_rate": 0.014204914412010628, + "loss": 0.8088, + "num_input_tokens_seen": 44058976, + "step": 75895 + }, + { + "epoch": 11.3047363717605, + "grad_norm": 0.031982421875, + "learning_rate": 0.014202967506662684, + "loss": 0.8023, + "num_input_tokens_seen": 44062048, + "step": 75900 + }, + { + "epoch": 11.30548108430146, + "grad_norm": 0.042724609375, + "learning_rate": 0.01420102061477979, + "loss": 0.7964, + "num_input_tokens_seen": 44065120, + "step": 75905 + }, + { + "epoch": 11.30622579684242, + "grad_norm": 0.03466796875, + "learning_rate": 0.014199073736394844, + "loss": 0.7893, + "num_input_tokens_seen": 44067904, + "step": 75910 + }, + { + "epoch": 11.306970509383378, + "grad_norm": 0.039306640625, + "learning_rate": 0.014197126871540736, + "loss": 0.7947, + "num_input_tokens_seen": 44071008, + "step": 75915 + }, + { + "epoch": 11.307715221924337, + "grad_norm": 0.022216796875, + "learning_rate": 0.014195180020250346, + "loss": 0.798, + "num_input_tokens_seen": 44073888, + "step": 75920 + }, + { + "epoch": 11.308459934465297, + "grad_norm": 0.045654296875, + "learning_rate": 0.014193233182556577, + "loss": 0.8045, + "num_input_tokens_seen": 44076544, + "step": 75925 + }, + { + "epoch": 11.309204647006256, + "grad_norm": 0.03173828125, + "learning_rate": 0.014191286358492311, + "loss": 0.8022, + "num_input_tokens_seen": 44079456, + "step": 75930 + }, + { + "epoch": 11.309949359547215, + "grad_norm": 0.036376953125, + "learning_rate": 0.014189339548090445, + "loss": 0.7943, + "num_input_tokens_seen": 44082272, + "step": 75935 + }, + { + "epoch": 11.310694072088173, + "grad_norm": 0.0291748046875, + "learning_rate": 0.014187392751383855, + "loss": 0.7735, + "num_input_tokens_seen": 44085088, + "step": 75940 + }, + { + "epoch": 11.311438784629134, + "grad_norm": 0.032958984375, + "learning_rate": 0.014185445968405446, + "loss": 0.7924, + "num_input_tokens_seen": 44088064, + "step": 75945 + }, + { + "epoch": 11.312183497170093, + "grad_norm": 0.040283203125, + "learning_rate": 0.014183499199188092, + "loss": 0.7946, + "num_input_tokens_seen": 44090816, + "step": 75950 + }, + { + "epoch": 11.312928209711052, + "grad_norm": 0.0390625, + "learning_rate": 0.014181552443764695, + "loss": 0.7788, + "num_input_tokens_seen": 44093696, + "step": 75955 + }, + { + "epoch": 11.31367292225201, + "grad_norm": 0.049072265625, + "learning_rate": 0.01417960570216814, + "loss": 0.7983, + "num_input_tokens_seen": 44096512, + "step": 75960 + }, + { + "epoch": 11.31441763479297, + "grad_norm": 0.054443359375, + "learning_rate": 0.014177658974431307, + "loss": 0.7766, + "num_input_tokens_seen": 44099712, + "step": 75965 + }, + { + "epoch": 11.31516234733393, + "grad_norm": 0.04833984375, + "learning_rate": 0.014175712260587095, + "loss": 0.8157, + "num_input_tokens_seen": 44102560, + "step": 75970 + }, + { + "epoch": 11.315907059874888, + "grad_norm": 0.05029296875, + "learning_rate": 0.01417376556066838, + "loss": 0.775, + "num_input_tokens_seen": 44105472, + "step": 75975 + }, + { + "epoch": 11.316651772415847, + "grad_norm": 0.046142578125, + "learning_rate": 0.014171818874708062, + "loss": 0.825, + "num_input_tokens_seen": 44108384, + "step": 75980 + }, + { + "epoch": 11.317396484956806, + "grad_norm": 0.039306640625, + "learning_rate": 0.014169872202739015, + "loss": 0.8147, + "num_input_tokens_seen": 44111424, + "step": 75985 + }, + { + "epoch": 11.318141197497766, + "grad_norm": 0.03662109375, + "learning_rate": 0.014167925544794142, + "loss": 0.7884, + "num_input_tokens_seen": 44114496, + "step": 75990 + }, + { + "epoch": 11.318885910038725, + "grad_norm": 0.032958984375, + "learning_rate": 0.014165978900906321, + "loss": 0.8158, + "num_input_tokens_seen": 44117696, + "step": 75995 + }, + { + "epoch": 11.319630622579684, + "grad_norm": 0.031494140625, + "learning_rate": 0.014164032271108434, + "loss": 0.7991, + "num_input_tokens_seen": 44120608, + "step": 76000 + }, + { + "epoch": 11.320375335120643, + "grad_norm": 0.050048828125, + "learning_rate": 0.014162085655433376, + "loss": 0.7906, + "num_input_tokens_seen": 44123392, + "step": 76005 + }, + { + "epoch": 11.321120047661603, + "grad_norm": 0.0400390625, + "learning_rate": 0.014160139053914023, + "loss": 0.7974, + "num_input_tokens_seen": 44126624, + "step": 76010 + }, + { + "epoch": 11.321864760202562, + "grad_norm": 0.05126953125, + "learning_rate": 0.014158192466583274, + "loss": 0.8082, + "num_input_tokens_seen": 44129600, + "step": 76015 + }, + { + "epoch": 11.32260947274352, + "grad_norm": 0.055908203125, + "learning_rate": 0.014156245893474, + "loss": 0.7971, + "num_input_tokens_seen": 44132384, + "step": 76020 + }, + { + "epoch": 11.32335418528448, + "grad_norm": 0.04296875, + "learning_rate": 0.0141542993346191, + "loss": 0.8005, + "num_input_tokens_seen": 44135264, + "step": 76025 + }, + { + "epoch": 11.32409889782544, + "grad_norm": 0.042724609375, + "learning_rate": 0.014152352790051453, + "loss": 0.7847, + "num_input_tokens_seen": 44138176, + "step": 76030 + }, + { + "epoch": 11.324843610366399, + "grad_norm": 0.033203125, + "learning_rate": 0.014150406259803947, + "loss": 0.7848, + "num_input_tokens_seen": 44141280, + "step": 76035 + }, + { + "epoch": 11.325588322907358, + "grad_norm": 0.052978515625, + "learning_rate": 0.01414845974390946, + "loss": 0.8013, + "num_input_tokens_seen": 44143904, + "step": 76040 + }, + { + "epoch": 11.326333035448316, + "grad_norm": 0.045654296875, + "learning_rate": 0.014146513242400877, + "loss": 0.7947, + "num_input_tokens_seen": 44147136, + "step": 76045 + }, + { + "epoch": 11.327077747989277, + "grad_norm": 0.03125, + "learning_rate": 0.01414456675531109, + "loss": 0.7946, + "num_input_tokens_seen": 44149888, + "step": 76050 + }, + { + "epoch": 11.327822460530236, + "grad_norm": 0.048828125, + "learning_rate": 0.014142620282672975, + "loss": 0.7984, + "num_input_tokens_seen": 44152544, + "step": 76055 + }, + { + "epoch": 11.328567173071194, + "grad_norm": 0.039306640625, + "learning_rate": 0.014140673824519421, + "loss": 0.7858, + "num_input_tokens_seen": 44155744, + "step": 76060 + }, + { + "epoch": 11.329311885612153, + "grad_norm": 0.039306640625, + "learning_rate": 0.014138727380883308, + "loss": 0.8045, + "num_input_tokens_seen": 44158656, + "step": 76065 + }, + { + "epoch": 11.330056598153114, + "grad_norm": 0.037353515625, + "learning_rate": 0.014136780951797524, + "loss": 0.8038, + "num_input_tokens_seen": 44161376, + "step": 76070 + }, + { + "epoch": 11.330801310694072, + "grad_norm": 0.036865234375, + "learning_rate": 0.014134834537294947, + "loss": 0.792, + "num_input_tokens_seen": 44164096, + "step": 76075 + }, + { + "epoch": 11.331546023235031, + "grad_norm": 0.052001953125, + "learning_rate": 0.014132888137408458, + "loss": 0.8031, + "num_input_tokens_seen": 44166816, + "step": 76080 + }, + { + "epoch": 11.33229073577599, + "grad_norm": 0.050048828125, + "learning_rate": 0.014130941752170946, + "loss": 0.8138, + "num_input_tokens_seen": 44169632, + "step": 76085 + }, + { + "epoch": 11.33303544831695, + "grad_norm": 0.03271484375, + "learning_rate": 0.014128995381615288, + "loss": 0.7815, + "num_input_tokens_seen": 44172352, + "step": 76090 + }, + { + "epoch": 11.33378016085791, + "grad_norm": 0.06884765625, + "learning_rate": 0.014127049025774371, + "loss": 0.79, + "num_input_tokens_seen": 44175520, + "step": 76095 + }, + { + "epoch": 11.334524873398868, + "grad_norm": 0.038818359375, + "learning_rate": 0.014125102684681068, + "loss": 0.7991, + "num_input_tokens_seen": 44178304, + "step": 76100 + }, + { + "epoch": 11.335269585939827, + "grad_norm": 0.0849609375, + "learning_rate": 0.014123156358368269, + "loss": 0.7827, + "num_input_tokens_seen": 44181024, + "step": 76105 + }, + { + "epoch": 11.336014298480787, + "grad_norm": 0.03955078125, + "learning_rate": 0.014121210046868848, + "loss": 0.7988, + "num_input_tokens_seen": 44183968, + "step": 76110 + }, + { + "epoch": 11.336759011021746, + "grad_norm": 0.031494140625, + "learning_rate": 0.014119263750215697, + "loss": 0.8305, + "num_input_tokens_seen": 44186752, + "step": 76115 + }, + { + "epoch": 11.337503723562705, + "grad_norm": 0.04541015625, + "learning_rate": 0.014117317468441689, + "loss": 0.7763, + "num_input_tokens_seen": 44189728, + "step": 76120 + }, + { + "epoch": 11.338248436103664, + "grad_norm": 0.033203125, + "learning_rate": 0.014115371201579702, + "loss": 0.7888, + "num_input_tokens_seen": 44192576, + "step": 76125 + }, + { + "epoch": 11.338993148644622, + "grad_norm": 0.037353515625, + "learning_rate": 0.014113424949662624, + "loss": 0.7983, + "num_input_tokens_seen": 44195136, + "step": 76130 + }, + { + "epoch": 11.339737861185583, + "grad_norm": 0.03125, + "learning_rate": 0.014111478712723323, + "loss": 0.8205, + "num_input_tokens_seen": 44198048, + "step": 76135 + }, + { + "epoch": 11.340482573726542, + "grad_norm": 0.0255126953125, + "learning_rate": 0.014109532490794692, + "loss": 0.8111, + "num_input_tokens_seen": 44201024, + "step": 76140 + }, + { + "epoch": 11.3412272862675, + "grad_norm": 0.032470703125, + "learning_rate": 0.014107586283909599, + "loss": 0.7996, + "num_input_tokens_seen": 44204224, + "step": 76145 + }, + { + "epoch": 11.341971998808459, + "grad_norm": 0.0419921875, + "learning_rate": 0.014105640092100934, + "loss": 0.7935, + "num_input_tokens_seen": 44207072, + "step": 76150 + }, + { + "epoch": 11.34271671134942, + "grad_norm": 0.0634765625, + "learning_rate": 0.014103693915401571, + "loss": 0.792, + "num_input_tokens_seen": 44209984, + "step": 76155 + }, + { + "epoch": 11.343461423890378, + "grad_norm": 0.0341796875, + "learning_rate": 0.014101747753844386, + "loss": 0.7994, + "num_input_tokens_seen": 44212800, + "step": 76160 + }, + { + "epoch": 11.344206136431337, + "grad_norm": 0.052001953125, + "learning_rate": 0.014099801607462263, + "loss": 0.8092, + "num_input_tokens_seen": 44215776, + "step": 76165 + }, + { + "epoch": 11.344950848972296, + "grad_norm": 0.0478515625, + "learning_rate": 0.01409785547628807, + "loss": 0.7828, + "num_input_tokens_seen": 44218624, + "step": 76170 + }, + { + "epoch": 11.345695561513256, + "grad_norm": 0.035400390625, + "learning_rate": 0.014095909360354696, + "loss": 0.8047, + "num_input_tokens_seen": 44221568, + "step": 76175 + }, + { + "epoch": 11.346440274054215, + "grad_norm": 0.031982421875, + "learning_rate": 0.014093963259695011, + "loss": 0.804, + "num_input_tokens_seen": 44224704, + "step": 76180 + }, + { + "epoch": 11.347184986595174, + "grad_norm": 0.0390625, + "learning_rate": 0.014092017174341902, + "loss": 0.7952, + "num_input_tokens_seen": 44227584, + "step": 76185 + }, + { + "epoch": 11.347929699136133, + "grad_norm": 0.038818359375, + "learning_rate": 0.014090071104328234, + "loss": 0.8043, + "num_input_tokens_seen": 44230944, + "step": 76190 + }, + { + "epoch": 11.348674411677093, + "grad_norm": 0.048583984375, + "learning_rate": 0.014088125049686897, + "loss": 0.8065, + "num_input_tokens_seen": 44233792, + "step": 76195 + }, + { + "epoch": 11.349419124218052, + "grad_norm": 0.041015625, + "learning_rate": 0.014086179010450759, + "loss": 0.7767, + "num_input_tokens_seen": 44236800, + "step": 76200 + }, + { + "epoch": 11.35016383675901, + "grad_norm": 0.052001953125, + "learning_rate": 0.01408423298665269, + "loss": 0.7765, + "num_input_tokens_seen": 44239776, + "step": 76205 + }, + { + "epoch": 11.35090854929997, + "grad_norm": 0.04345703125, + "learning_rate": 0.014082286978325581, + "loss": 0.8164, + "num_input_tokens_seen": 44242912, + "step": 76210 + }, + { + "epoch": 11.35165326184093, + "grad_norm": 0.043212890625, + "learning_rate": 0.014080340985502296, + "loss": 0.7676, + "num_input_tokens_seen": 44245856, + "step": 76215 + }, + { + "epoch": 11.352397974381889, + "grad_norm": 0.03857421875, + "learning_rate": 0.01407839500821572, + "loss": 0.7924, + "num_input_tokens_seen": 44248928, + "step": 76220 + }, + { + "epoch": 11.353142686922848, + "grad_norm": 0.0966796875, + "learning_rate": 0.014076449046498723, + "loss": 0.7626, + "num_input_tokens_seen": 44251904, + "step": 76225 + }, + { + "epoch": 11.353887399463806, + "grad_norm": 0.03173828125, + "learning_rate": 0.014074503100384182, + "loss": 0.8048, + "num_input_tokens_seen": 44254592, + "step": 76230 + }, + { + "epoch": 11.354632112004767, + "grad_norm": 0.044921875, + "learning_rate": 0.014072557169904971, + "loss": 0.8674, + "num_input_tokens_seen": 44257760, + "step": 76235 + }, + { + "epoch": 11.355376824545726, + "grad_norm": 0.0302734375, + "learning_rate": 0.014070611255093958, + "loss": 0.796, + "num_input_tokens_seen": 44260544, + "step": 76240 + }, + { + "epoch": 11.356121537086684, + "grad_norm": 0.03466796875, + "learning_rate": 0.014068665355984028, + "loss": 0.8098, + "num_input_tokens_seen": 44263168, + "step": 76245 + }, + { + "epoch": 11.356866249627643, + "grad_norm": 0.036376953125, + "learning_rate": 0.014066719472608045, + "loss": 0.8158, + "num_input_tokens_seen": 44265824, + "step": 76250 + }, + { + "epoch": 11.357610962168604, + "grad_norm": 0.033935546875, + "learning_rate": 0.014064773604998894, + "loss": 0.7971, + "num_input_tokens_seen": 44268928, + "step": 76255 + }, + { + "epoch": 11.358355674709562, + "grad_norm": 0.031982421875, + "learning_rate": 0.01406282775318944, + "loss": 0.8379, + "num_input_tokens_seen": 44271712, + "step": 76260 + }, + { + "epoch": 11.359100387250521, + "grad_norm": 0.041259765625, + "learning_rate": 0.01406088191721256, + "loss": 0.8012, + "num_input_tokens_seen": 44274592, + "step": 76265 + }, + { + "epoch": 11.35984509979148, + "grad_norm": 0.037841796875, + "learning_rate": 0.014058936097101122, + "loss": 0.7918, + "num_input_tokens_seen": 44277696, + "step": 76270 + }, + { + "epoch": 11.36058981233244, + "grad_norm": 0.0277099609375, + "learning_rate": 0.014056990292888006, + "loss": 0.8012, + "num_input_tokens_seen": 44280832, + "step": 76275 + }, + { + "epoch": 11.3613345248734, + "grad_norm": 0.044677734375, + "learning_rate": 0.01405504450460608, + "loss": 0.8153, + "num_input_tokens_seen": 44283680, + "step": 76280 + }, + { + "epoch": 11.362079237414358, + "grad_norm": 0.03271484375, + "learning_rate": 0.014053098732288213, + "loss": 0.7976, + "num_input_tokens_seen": 44286784, + "step": 76285 + }, + { + "epoch": 11.362823949955317, + "grad_norm": 0.06494140625, + "learning_rate": 0.014051152975967285, + "loss": 0.8215, + "num_input_tokens_seen": 44289440, + "step": 76290 + }, + { + "epoch": 11.363568662496277, + "grad_norm": 0.026611328125, + "learning_rate": 0.01404920723567616, + "loss": 0.7978, + "num_input_tokens_seen": 44292352, + "step": 76295 + }, + { + "epoch": 11.364313375037236, + "grad_norm": 0.04052734375, + "learning_rate": 0.014047261511447718, + "loss": 0.7924, + "num_input_tokens_seen": 44295360, + "step": 76300 + }, + { + "epoch": 11.365058087578195, + "grad_norm": 0.03857421875, + "learning_rate": 0.014045315803314815, + "loss": 0.7867, + "num_input_tokens_seen": 44298272, + "step": 76305 + }, + { + "epoch": 11.365802800119154, + "grad_norm": 0.038818359375, + "learning_rate": 0.014043370111310339, + "loss": 0.816, + "num_input_tokens_seen": 44301472, + "step": 76310 + }, + { + "epoch": 11.366547512660112, + "grad_norm": 0.03271484375, + "learning_rate": 0.014041424435467145, + "loss": 0.8344, + "num_input_tokens_seen": 44304800, + "step": 76315 + }, + { + "epoch": 11.367292225201073, + "grad_norm": 0.029541015625, + "learning_rate": 0.014039478775818121, + "loss": 0.7913, + "num_input_tokens_seen": 44307520, + "step": 76320 + }, + { + "epoch": 11.368036937742032, + "grad_norm": 0.0439453125, + "learning_rate": 0.014037533132396121, + "loss": 0.781, + "num_input_tokens_seen": 44310528, + "step": 76325 + }, + { + "epoch": 11.36878165028299, + "grad_norm": 0.021240234375, + "learning_rate": 0.014035587505234023, + "loss": 0.8025, + "num_input_tokens_seen": 44313344, + "step": 76330 + }, + { + "epoch": 11.36952636282395, + "grad_norm": 0.03955078125, + "learning_rate": 0.014033641894364695, + "loss": 0.807, + "num_input_tokens_seen": 44316096, + "step": 76335 + }, + { + "epoch": 11.37027107536491, + "grad_norm": 0.0311279296875, + "learning_rate": 0.014031696299820998, + "loss": 0.8215, + "num_input_tokens_seen": 44318816, + "step": 76340 + }, + { + "epoch": 11.371015787905868, + "grad_norm": 0.03173828125, + "learning_rate": 0.014029750721635814, + "loss": 0.7751, + "num_input_tokens_seen": 44321440, + "step": 76345 + }, + { + "epoch": 11.371760500446827, + "grad_norm": 0.0400390625, + "learning_rate": 0.014027805159842, + "loss": 0.8065, + "num_input_tokens_seen": 44324160, + "step": 76350 + }, + { + "epoch": 11.372505212987786, + "grad_norm": 0.022705078125, + "learning_rate": 0.014025859614472436, + "loss": 0.8, + "num_input_tokens_seen": 44326944, + "step": 76355 + }, + { + "epoch": 11.373249925528746, + "grad_norm": 0.04248046875, + "learning_rate": 0.014023914085559983, + "loss": 0.7944, + "num_input_tokens_seen": 44329824, + "step": 76360 + }, + { + "epoch": 11.373994638069705, + "grad_norm": 0.045654296875, + "learning_rate": 0.014021968573137507, + "loss": 0.7875, + "num_input_tokens_seen": 44332544, + "step": 76365 + }, + { + "epoch": 11.374739350610664, + "grad_norm": 0.02294921875, + "learning_rate": 0.014020023077237882, + "loss": 0.8291, + "num_input_tokens_seen": 44335424, + "step": 76370 + }, + { + "epoch": 11.375484063151623, + "grad_norm": 0.0693359375, + "learning_rate": 0.014018077597893962, + "loss": 0.817, + "num_input_tokens_seen": 44338272, + "step": 76375 + }, + { + "epoch": 11.376228775692583, + "grad_norm": 0.03466796875, + "learning_rate": 0.014016132135138632, + "loss": 0.8186, + "num_input_tokens_seen": 44341152, + "step": 76380 + }, + { + "epoch": 11.376973488233542, + "grad_norm": 0.0303955078125, + "learning_rate": 0.014014186689004745, + "loss": 0.797, + "num_input_tokens_seen": 44344192, + "step": 76385 + }, + { + "epoch": 11.3777182007745, + "grad_norm": 0.0306396484375, + "learning_rate": 0.014012241259525172, + "loss": 0.8171, + "num_input_tokens_seen": 44347264, + "step": 76390 + }, + { + "epoch": 11.37846291331546, + "grad_norm": 0.0286865234375, + "learning_rate": 0.01401029584673278, + "loss": 0.7789, + "num_input_tokens_seen": 44350304, + "step": 76395 + }, + { + "epoch": 11.37920762585642, + "grad_norm": 0.031982421875, + "learning_rate": 0.014008350450660437, + "loss": 0.8084, + "num_input_tokens_seen": 44353344, + "step": 76400 + }, + { + "epoch": 11.379952338397379, + "grad_norm": 0.037109375, + "learning_rate": 0.014006405071341004, + "loss": 0.81, + "num_input_tokens_seen": 44356064, + "step": 76405 + }, + { + "epoch": 11.380697050938338, + "grad_norm": 0.03955078125, + "learning_rate": 0.014004459708807341, + "loss": 0.7923, + "num_input_tokens_seen": 44358720, + "step": 76410 + }, + { + "epoch": 11.381441763479296, + "grad_norm": 0.0228271484375, + "learning_rate": 0.014002514363092326, + "loss": 0.7837, + "num_input_tokens_seen": 44361696, + "step": 76415 + }, + { + "epoch": 11.382186476020257, + "grad_norm": 0.031494140625, + "learning_rate": 0.014000569034228815, + "loss": 0.7961, + "num_input_tokens_seen": 44364480, + "step": 76420 + }, + { + "epoch": 11.382931188561216, + "grad_norm": 0.03466796875, + "learning_rate": 0.013998623722249677, + "loss": 0.8071, + "num_input_tokens_seen": 44367264, + "step": 76425 + }, + { + "epoch": 11.383675901102174, + "grad_norm": 0.056396484375, + "learning_rate": 0.013996678427187768, + "loss": 0.7879, + "num_input_tokens_seen": 44370304, + "step": 76430 + }, + { + "epoch": 11.384420613643133, + "grad_norm": 0.04833984375, + "learning_rate": 0.013994733149075962, + "loss": 0.7895, + "num_input_tokens_seen": 44373088, + "step": 76435 + }, + { + "epoch": 11.385165326184094, + "grad_norm": 0.042724609375, + "learning_rate": 0.013992787887947118, + "loss": 0.8181, + "num_input_tokens_seen": 44375936, + "step": 76440 + }, + { + "epoch": 11.385910038725052, + "grad_norm": 0.031494140625, + "learning_rate": 0.013990842643834094, + "loss": 0.7998, + "num_input_tokens_seen": 44378848, + "step": 76445 + }, + { + "epoch": 11.386654751266011, + "grad_norm": 0.037841796875, + "learning_rate": 0.013988897416769763, + "loss": 0.8412, + "num_input_tokens_seen": 44381568, + "step": 76450 + }, + { + "epoch": 11.38739946380697, + "grad_norm": 0.05224609375, + "learning_rate": 0.013986952206786979, + "loss": 0.7881, + "num_input_tokens_seen": 44384448, + "step": 76455 + }, + { + "epoch": 11.38814417634793, + "grad_norm": 0.0311279296875, + "learning_rate": 0.013985007013918612, + "loss": 0.7944, + "num_input_tokens_seen": 44387360, + "step": 76460 + }, + { + "epoch": 11.38888888888889, + "grad_norm": 0.035888671875, + "learning_rate": 0.013983061838197513, + "loss": 0.8036, + "num_input_tokens_seen": 44390112, + "step": 76465 + }, + { + "epoch": 11.389633601429848, + "grad_norm": 0.0361328125, + "learning_rate": 0.013981116679656557, + "loss": 0.781, + "num_input_tokens_seen": 44393408, + "step": 76470 + }, + { + "epoch": 11.390378313970807, + "grad_norm": 0.041748046875, + "learning_rate": 0.013979171538328592, + "loss": 0.8153, + "num_input_tokens_seen": 44396768, + "step": 76475 + }, + { + "epoch": 11.391123026511767, + "grad_norm": 0.051025390625, + "learning_rate": 0.013977226414246494, + "loss": 0.7902, + "num_input_tokens_seen": 44399584, + "step": 76480 + }, + { + "epoch": 11.391867739052726, + "grad_norm": 0.050048828125, + "learning_rate": 0.013975281307443117, + "loss": 0.8184, + "num_input_tokens_seen": 44402528, + "step": 76485 + }, + { + "epoch": 11.392612451593685, + "grad_norm": 0.044921875, + "learning_rate": 0.013973336217951316, + "loss": 0.8187, + "num_input_tokens_seen": 44405568, + "step": 76490 + }, + { + "epoch": 11.393357164134644, + "grad_norm": 0.1728515625, + "learning_rate": 0.013971391145803959, + "loss": 0.8221, + "num_input_tokens_seen": 44408448, + "step": 76495 + }, + { + "epoch": 11.394101876675602, + "grad_norm": 0.03857421875, + "learning_rate": 0.013969446091033899, + "loss": 0.8066, + "num_input_tokens_seen": 44411552, + "step": 76500 + }, + { + "epoch": 11.394846589216563, + "grad_norm": 0.03759765625, + "learning_rate": 0.013967501053674005, + "loss": 0.8041, + "num_input_tokens_seen": 44414496, + "step": 76505 + }, + { + "epoch": 11.395591301757522, + "grad_norm": 0.041748046875, + "learning_rate": 0.013965556033757124, + "loss": 0.7951, + "num_input_tokens_seen": 44417280, + "step": 76510 + }, + { + "epoch": 11.39633601429848, + "grad_norm": 0.0218505859375, + "learning_rate": 0.01396361103131613, + "loss": 0.7959, + "num_input_tokens_seen": 44420160, + "step": 76515 + }, + { + "epoch": 11.39708072683944, + "grad_norm": 0.03955078125, + "learning_rate": 0.013961666046383873, + "loss": 0.805, + "num_input_tokens_seen": 44423136, + "step": 76520 + }, + { + "epoch": 11.3978254393804, + "grad_norm": 0.026123046875, + "learning_rate": 0.01395972107899321, + "loss": 0.8057, + "num_input_tokens_seen": 44426112, + "step": 76525 + }, + { + "epoch": 11.398570151921358, + "grad_norm": 0.02880859375, + "learning_rate": 0.013957776129177007, + "loss": 0.7951, + "num_input_tokens_seen": 44429056, + "step": 76530 + }, + { + "epoch": 11.399314864462317, + "grad_norm": 0.049072265625, + "learning_rate": 0.013955831196968109, + "loss": 0.7928, + "num_input_tokens_seen": 44432032, + "step": 76535 + }, + { + "epoch": 11.400059577003276, + "grad_norm": 0.05419921875, + "learning_rate": 0.013953886282399389, + "loss": 0.7876, + "num_input_tokens_seen": 44435008, + "step": 76540 + }, + { + "epoch": 11.400804289544237, + "grad_norm": 0.04150390625, + "learning_rate": 0.01395194138550369, + "loss": 0.8037, + "num_input_tokens_seen": 44437632, + "step": 76545 + }, + { + "epoch": 11.401549002085195, + "grad_norm": 0.0341796875, + "learning_rate": 0.013949996506313882, + "loss": 0.7889, + "num_input_tokens_seen": 44440352, + "step": 76550 + }, + { + "epoch": 11.402293714626154, + "grad_norm": 0.0284423828125, + "learning_rate": 0.013948051644862815, + "loss": 0.7864, + "num_input_tokens_seen": 44443232, + "step": 76555 + }, + { + "epoch": 11.403038427167113, + "grad_norm": 0.0712890625, + "learning_rate": 0.013946106801183349, + "loss": 0.8128, + "num_input_tokens_seen": 44445856, + "step": 76560 + }, + { + "epoch": 11.403783139708073, + "grad_norm": 0.054443359375, + "learning_rate": 0.013944161975308336, + "loss": 0.7964, + "num_input_tokens_seen": 44448672, + "step": 76565 + }, + { + "epoch": 11.404527852249032, + "grad_norm": 0.04736328125, + "learning_rate": 0.013942217167270626, + "loss": 0.7915, + "num_input_tokens_seen": 44451360, + "step": 76570 + }, + { + "epoch": 11.40527256478999, + "grad_norm": 0.03955078125, + "learning_rate": 0.013940272377103088, + "loss": 0.8162, + "num_input_tokens_seen": 44454112, + "step": 76575 + }, + { + "epoch": 11.40601727733095, + "grad_norm": 0.046630859375, + "learning_rate": 0.013938327604838567, + "loss": 0.8131, + "num_input_tokens_seen": 44456832, + "step": 76580 + }, + { + "epoch": 11.40676198987191, + "grad_norm": 0.19140625, + "learning_rate": 0.013936382850509929, + "loss": 0.8283, + "num_input_tokens_seen": 44459680, + "step": 76585 + }, + { + "epoch": 11.407506702412869, + "grad_norm": 0.0439453125, + "learning_rate": 0.013934438114150016, + "loss": 0.7991, + "num_input_tokens_seen": 44462784, + "step": 76590 + }, + { + "epoch": 11.408251414953828, + "grad_norm": 0.0712890625, + "learning_rate": 0.013932493395791693, + "loss": 0.7955, + "num_input_tokens_seen": 44465664, + "step": 76595 + }, + { + "epoch": 11.408996127494786, + "grad_norm": 0.050537109375, + "learning_rate": 0.01393054869546781, + "loss": 0.8046, + "num_input_tokens_seen": 44468416, + "step": 76600 + }, + { + "epoch": 11.409740840035747, + "grad_norm": 0.04638671875, + "learning_rate": 0.013928604013211212, + "loss": 0.8157, + "num_input_tokens_seen": 44471008, + "step": 76605 + }, + { + "epoch": 11.410485552576706, + "grad_norm": 0.034912109375, + "learning_rate": 0.013926659349054765, + "loss": 0.8142, + "num_input_tokens_seen": 44474016, + "step": 76610 + }, + { + "epoch": 11.411230265117664, + "grad_norm": 0.042724609375, + "learning_rate": 0.013924714703031313, + "loss": 0.787, + "num_input_tokens_seen": 44476832, + "step": 76615 + }, + { + "epoch": 11.411974977658623, + "grad_norm": 0.03759765625, + "learning_rate": 0.01392277007517372, + "loss": 0.8037, + "num_input_tokens_seen": 44479840, + "step": 76620 + }, + { + "epoch": 11.412719690199584, + "grad_norm": 0.034912109375, + "learning_rate": 0.013920825465514828, + "loss": 0.7975, + "num_input_tokens_seen": 44482848, + "step": 76625 + }, + { + "epoch": 11.413464402740543, + "grad_norm": 0.022705078125, + "learning_rate": 0.013918880874087497, + "loss": 0.7848, + "num_input_tokens_seen": 44485696, + "step": 76630 + }, + { + "epoch": 11.414209115281501, + "grad_norm": 0.043701171875, + "learning_rate": 0.013916936300924566, + "loss": 0.7832, + "num_input_tokens_seen": 44488608, + "step": 76635 + }, + { + "epoch": 11.41495382782246, + "grad_norm": 0.0302734375, + "learning_rate": 0.013914991746058904, + "loss": 0.8175, + "num_input_tokens_seen": 44491424, + "step": 76640 + }, + { + "epoch": 11.41569854036342, + "grad_norm": 0.059326171875, + "learning_rate": 0.013913047209523353, + "loss": 0.7965, + "num_input_tokens_seen": 44494144, + "step": 76645 + }, + { + "epoch": 11.41644325290438, + "grad_norm": 0.041259765625, + "learning_rate": 0.013911102691350758, + "loss": 0.7777, + "num_input_tokens_seen": 44496992, + "step": 76650 + }, + { + "epoch": 11.417187965445338, + "grad_norm": 0.030029296875, + "learning_rate": 0.013909158191573984, + "loss": 0.8069, + "num_input_tokens_seen": 44499872, + "step": 76655 + }, + { + "epoch": 11.417932677986297, + "grad_norm": 0.037353515625, + "learning_rate": 0.013907213710225868, + "loss": 0.8372, + "num_input_tokens_seen": 44502816, + "step": 76660 + }, + { + "epoch": 11.418677390527257, + "grad_norm": 0.03125, + "learning_rate": 0.01390526924733927, + "loss": 0.7973, + "num_input_tokens_seen": 44505536, + "step": 76665 + }, + { + "epoch": 11.419422103068216, + "grad_norm": 0.03857421875, + "learning_rate": 0.01390332480294703, + "loss": 0.7968, + "num_input_tokens_seen": 44508992, + "step": 76670 + }, + { + "epoch": 11.420166815609175, + "grad_norm": 0.02978515625, + "learning_rate": 0.01390138037708201, + "loss": 0.8066, + "num_input_tokens_seen": 44511776, + "step": 76675 + }, + { + "epoch": 11.420911528150134, + "grad_norm": 0.0308837890625, + "learning_rate": 0.013899435969777049, + "loss": 0.7816, + "num_input_tokens_seen": 44514528, + "step": 76680 + }, + { + "epoch": 11.421656240691092, + "grad_norm": 0.034912109375, + "learning_rate": 0.013897491581064994, + "loss": 0.7954, + "num_input_tokens_seen": 44517600, + "step": 76685 + }, + { + "epoch": 11.422400953232053, + "grad_norm": 0.045654296875, + "learning_rate": 0.013895547210978704, + "loss": 0.834, + "num_input_tokens_seen": 44520576, + "step": 76690 + }, + { + "epoch": 11.423145665773012, + "grad_norm": 0.040283203125, + "learning_rate": 0.01389360285955102, + "loss": 0.8024, + "num_input_tokens_seen": 44523360, + "step": 76695 + }, + { + "epoch": 11.42389037831397, + "grad_norm": 0.04345703125, + "learning_rate": 0.013891658526814793, + "loss": 0.7986, + "num_input_tokens_seen": 44526144, + "step": 76700 + }, + { + "epoch": 11.42463509085493, + "grad_norm": 0.03369140625, + "learning_rate": 0.013889714212802861, + "loss": 0.7973, + "num_input_tokens_seen": 44528960, + "step": 76705 + }, + { + "epoch": 11.42537980339589, + "grad_norm": 0.045654296875, + "learning_rate": 0.013887769917548087, + "loss": 0.8055, + "num_input_tokens_seen": 44532128, + "step": 76710 + }, + { + "epoch": 11.426124515936849, + "grad_norm": 0.0308837890625, + "learning_rate": 0.013885825641083307, + "loss": 0.787, + "num_input_tokens_seen": 44535136, + "step": 76715 + }, + { + "epoch": 11.426869228477807, + "grad_norm": 0.0194091796875, + "learning_rate": 0.01388388138344137, + "loss": 0.7905, + "num_input_tokens_seen": 44538176, + "step": 76720 + }, + { + "epoch": 11.427613941018766, + "grad_norm": 0.040771484375, + "learning_rate": 0.013881937144655125, + "loss": 0.7855, + "num_input_tokens_seen": 44541024, + "step": 76725 + }, + { + "epoch": 11.428358653559727, + "grad_norm": 0.04638671875, + "learning_rate": 0.013879992924757412, + "loss": 0.8005, + "num_input_tokens_seen": 44543840, + "step": 76730 + }, + { + "epoch": 11.429103366100685, + "grad_norm": 0.036376953125, + "learning_rate": 0.013878048723781086, + "loss": 0.8011, + "num_input_tokens_seen": 44547008, + "step": 76735 + }, + { + "epoch": 11.429848078641644, + "grad_norm": 0.059814453125, + "learning_rate": 0.013876104541758978, + "loss": 0.793, + "num_input_tokens_seen": 44549632, + "step": 76740 + }, + { + "epoch": 11.430592791182603, + "grad_norm": 0.11474609375, + "learning_rate": 0.013874160378723948, + "loss": 0.8185, + "num_input_tokens_seen": 44552608, + "step": 76745 + }, + { + "epoch": 11.431337503723563, + "grad_norm": 0.031982421875, + "learning_rate": 0.013872216234708832, + "loss": 0.8216, + "num_input_tokens_seen": 44555392, + "step": 76750 + }, + { + "epoch": 11.432082216264522, + "grad_norm": 0.055419921875, + "learning_rate": 0.013870272109746479, + "loss": 0.7903, + "num_input_tokens_seen": 44558400, + "step": 76755 + }, + { + "epoch": 11.432826928805481, + "grad_norm": 0.0302734375, + "learning_rate": 0.013868328003869728, + "loss": 0.7744, + "num_input_tokens_seen": 44561216, + "step": 76760 + }, + { + "epoch": 11.43357164134644, + "grad_norm": 0.0286865234375, + "learning_rate": 0.013866383917111423, + "loss": 0.8139, + "num_input_tokens_seen": 44563936, + "step": 76765 + }, + { + "epoch": 11.4343163538874, + "grad_norm": 0.040283203125, + "learning_rate": 0.013864439849504416, + "loss": 0.8089, + "num_input_tokens_seen": 44566816, + "step": 76770 + }, + { + "epoch": 11.435061066428359, + "grad_norm": 0.039794921875, + "learning_rate": 0.013862495801081535, + "loss": 0.7728, + "num_input_tokens_seen": 44569536, + "step": 76775 + }, + { + "epoch": 11.435805778969318, + "grad_norm": 0.0228271484375, + "learning_rate": 0.013860551771875639, + "loss": 0.7907, + "num_input_tokens_seen": 44572448, + "step": 76780 + }, + { + "epoch": 11.436550491510276, + "grad_norm": 0.0299072265625, + "learning_rate": 0.013858607761919559, + "loss": 0.7979, + "num_input_tokens_seen": 44575264, + "step": 76785 + }, + { + "epoch": 11.437295204051237, + "grad_norm": 0.04736328125, + "learning_rate": 0.013856663771246142, + "loss": 0.8, + "num_input_tokens_seen": 44578112, + "step": 76790 + }, + { + "epoch": 11.438039916592196, + "grad_norm": 0.035888671875, + "learning_rate": 0.013854719799888225, + "loss": 0.7793, + "num_input_tokens_seen": 44580896, + "step": 76795 + }, + { + "epoch": 11.438784629133155, + "grad_norm": 0.0361328125, + "learning_rate": 0.013852775847878659, + "loss": 0.7821, + "num_input_tokens_seen": 44583936, + "step": 76800 + }, + { + "epoch": 11.439529341674113, + "grad_norm": 0.018798828125, + "learning_rate": 0.013850831915250278, + "loss": 0.8029, + "num_input_tokens_seen": 44586944, + "step": 76805 + }, + { + "epoch": 11.440274054215074, + "grad_norm": 0.03076171875, + "learning_rate": 0.013848888002035917, + "loss": 0.7888, + "num_input_tokens_seen": 44589920, + "step": 76810 + }, + { + "epoch": 11.441018766756033, + "grad_norm": 0.027587890625, + "learning_rate": 0.01384694410826843, + "loss": 0.8109, + "num_input_tokens_seen": 44592992, + "step": 76815 + }, + { + "epoch": 11.441763479296991, + "grad_norm": 0.0299072265625, + "learning_rate": 0.013845000233980649, + "loss": 0.782, + "num_input_tokens_seen": 44596000, + "step": 76820 + }, + { + "epoch": 11.44250819183795, + "grad_norm": 0.047119140625, + "learning_rate": 0.013843056379205416, + "loss": 0.7894, + "num_input_tokens_seen": 44598720, + "step": 76825 + }, + { + "epoch": 11.443252904378909, + "grad_norm": 0.046142578125, + "learning_rate": 0.013841112543975565, + "loss": 0.7875, + "num_input_tokens_seen": 44601440, + "step": 76830 + }, + { + "epoch": 11.44399761691987, + "grad_norm": 0.0252685546875, + "learning_rate": 0.013839168728323943, + "loss": 0.8038, + "num_input_tokens_seen": 44604608, + "step": 76835 + }, + { + "epoch": 11.444742329460828, + "grad_norm": 0.0306396484375, + "learning_rate": 0.013837224932283388, + "loss": 0.8102, + "num_input_tokens_seen": 44607744, + "step": 76840 + }, + { + "epoch": 11.445487042001787, + "grad_norm": 0.0301513671875, + "learning_rate": 0.01383528115588673, + "loss": 0.7999, + "num_input_tokens_seen": 44610528, + "step": 76845 + }, + { + "epoch": 11.446231754542746, + "grad_norm": 0.0303955078125, + "learning_rate": 0.01383333739916682, + "loss": 0.8104, + "num_input_tokens_seen": 44613184, + "step": 76850 + }, + { + "epoch": 11.446976467083706, + "grad_norm": 0.0274658203125, + "learning_rate": 0.013831393662156485, + "loss": 0.7917, + "num_input_tokens_seen": 44616288, + "step": 76855 + }, + { + "epoch": 11.447721179624665, + "grad_norm": 0.031494140625, + "learning_rate": 0.013829449944888567, + "loss": 0.7794, + "num_input_tokens_seen": 44619104, + "step": 76860 + }, + { + "epoch": 11.448465892165624, + "grad_norm": 0.05859375, + "learning_rate": 0.013827506247395899, + "loss": 0.8103, + "num_input_tokens_seen": 44621792, + "step": 76865 + }, + { + "epoch": 11.449210604706582, + "grad_norm": 0.03173828125, + "learning_rate": 0.013825562569711328, + "loss": 0.7891, + "num_input_tokens_seen": 44624832, + "step": 76870 + }, + { + "epoch": 11.449955317247543, + "grad_norm": 0.0244140625, + "learning_rate": 0.013823618911867674, + "loss": 0.7873, + "num_input_tokens_seen": 44627552, + "step": 76875 + }, + { + "epoch": 11.450700029788502, + "grad_norm": 0.05224609375, + "learning_rate": 0.013821675273897794, + "loss": 0.7948, + "num_input_tokens_seen": 44630592, + "step": 76880 + }, + { + "epoch": 11.45144474232946, + "grad_norm": 0.0341796875, + "learning_rate": 0.01381973165583451, + "loss": 0.7932, + "num_input_tokens_seen": 44633472, + "step": 76885 + }, + { + "epoch": 11.45218945487042, + "grad_norm": 0.06298828125, + "learning_rate": 0.013817788057710657, + "loss": 0.7982, + "num_input_tokens_seen": 44636224, + "step": 76890 + }, + { + "epoch": 11.45293416741138, + "grad_norm": 0.026611328125, + "learning_rate": 0.013815844479559075, + "loss": 0.7779, + "num_input_tokens_seen": 44639104, + "step": 76895 + }, + { + "epoch": 11.453678879952339, + "grad_norm": 0.0306396484375, + "learning_rate": 0.013813900921412593, + "loss": 0.7677, + "num_input_tokens_seen": 44642112, + "step": 76900 + }, + { + "epoch": 11.454423592493297, + "grad_norm": 0.042236328125, + "learning_rate": 0.013811957383304055, + "loss": 0.7873, + "num_input_tokens_seen": 44644896, + "step": 76905 + }, + { + "epoch": 11.455168305034256, + "grad_norm": 0.02392578125, + "learning_rate": 0.013810013865266282, + "loss": 0.7784, + "num_input_tokens_seen": 44647776, + "step": 76910 + }, + { + "epoch": 11.455913017575217, + "grad_norm": 0.05029296875, + "learning_rate": 0.013808070367332123, + "loss": 0.774, + "num_input_tokens_seen": 44650912, + "step": 76915 + }, + { + "epoch": 11.456657730116175, + "grad_norm": 0.09814453125, + "learning_rate": 0.013806126889534401, + "loss": 0.8082, + "num_input_tokens_seen": 44653728, + "step": 76920 + }, + { + "epoch": 11.457402442657134, + "grad_norm": 0.046875, + "learning_rate": 0.013804183431905955, + "loss": 0.7927, + "num_input_tokens_seen": 44656832, + "step": 76925 + }, + { + "epoch": 11.458147155198093, + "grad_norm": 0.072265625, + "learning_rate": 0.013802239994479614, + "loss": 0.7869, + "num_input_tokens_seen": 44659936, + "step": 76930 + }, + { + "epoch": 11.458891867739053, + "grad_norm": 0.04443359375, + "learning_rate": 0.013800296577288203, + "loss": 0.8048, + "num_input_tokens_seen": 44662656, + "step": 76935 + }, + { + "epoch": 11.459636580280012, + "grad_norm": 0.04736328125, + "learning_rate": 0.013798353180364572, + "loss": 0.8232, + "num_input_tokens_seen": 44665248, + "step": 76940 + }, + { + "epoch": 11.460381292820971, + "grad_norm": 0.046142578125, + "learning_rate": 0.013796409803741534, + "loss": 0.8117, + "num_input_tokens_seen": 44668128, + "step": 76945 + }, + { + "epoch": 11.46112600536193, + "grad_norm": 0.051513671875, + "learning_rate": 0.013794466447451935, + "loss": 0.806, + "num_input_tokens_seen": 44671072, + "step": 76950 + }, + { + "epoch": 11.46187071790289, + "grad_norm": 0.029296875, + "learning_rate": 0.013792523111528596, + "loss": 0.7931, + "num_input_tokens_seen": 44674240, + "step": 76955 + }, + { + "epoch": 11.462615430443849, + "grad_norm": 0.05859375, + "learning_rate": 0.013790579796004357, + "loss": 0.7995, + "num_input_tokens_seen": 44677280, + "step": 76960 + }, + { + "epoch": 11.463360142984808, + "grad_norm": 0.0419921875, + "learning_rate": 0.013788636500912043, + "loss": 0.8111, + "num_input_tokens_seen": 44680288, + "step": 76965 + }, + { + "epoch": 11.464104855525767, + "grad_norm": 0.06103515625, + "learning_rate": 0.013786693226284478, + "loss": 0.8037, + "num_input_tokens_seen": 44683040, + "step": 76970 + }, + { + "epoch": 11.464849568066727, + "grad_norm": 0.038330078125, + "learning_rate": 0.013784749972154503, + "loss": 0.7995, + "num_input_tokens_seen": 44686080, + "step": 76975 + }, + { + "epoch": 11.465594280607686, + "grad_norm": 0.0283203125, + "learning_rate": 0.013782806738554936, + "loss": 0.7899, + "num_input_tokens_seen": 44689088, + "step": 76980 + }, + { + "epoch": 11.466338993148645, + "grad_norm": 0.029296875, + "learning_rate": 0.013780863525518619, + "loss": 0.8062, + "num_input_tokens_seen": 44691840, + "step": 76985 + }, + { + "epoch": 11.467083705689603, + "grad_norm": 0.05517578125, + "learning_rate": 0.01377892033307837, + "loss": 0.8012, + "num_input_tokens_seen": 44694944, + "step": 76990 + }, + { + "epoch": 11.467828418230564, + "grad_norm": 0.052001953125, + "learning_rate": 0.013776977161267025, + "loss": 0.7825, + "num_input_tokens_seen": 44697664, + "step": 76995 + }, + { + "epoch": 11.468573130771523, + "grad_norm": 0.026611328125, + "learning_rate": 0.0137750340101174, + "loss": 0.8027, + "num_input_tokens_seen": 44700672, + "step": 77000 + }, + { + "epoch": 11.469317843312481, + "grad_norm": 0.04345703125, + "learning_rate": 0.01377309087966234, + "loss": 0.8041, + "num_input_tokens_seen": 44703488, + "step": 77005 + }, + { + "epoch": 11.47006255585344, + "grad_norm": 0.05419921875, + "learning_rate": 0.013771147769934659, + "loss": 0.7725, + "num_input_tokens_seen": 44706624, + "step": 77010 + }, + { + "epoch": 11.470807268394399, + "grad_norm": 0.07861328125, + "learning_rate": 0.013769204680967182, + "loss": 0.7772, + "num_input_tokens_seen": 44709440, + "step": 77015 + }, + { + "epoch": 11.47155198093536, + "grad_norm": 0.0634765625, + "learning_rate": 0.013767261612792748, + "loss": 0.7775, + "num_input_tokens_seen": 44712192, + "step": 77020 + }, + { + "epoch": 11.472296693476318, + "grad_norm": 0.062255859375, + "learning_rate": 0.013765318565444173, + "loss": 0.7717, + "num_input_tokens_seen": 44715424, + "step": 77025 + }, + { + "epoch": 11.473041406017277, + "grad_norm": 0.1064453125, + "learning_rate": 0.013763375538954287, + "loss": 0.7709, + "num_input_tokens_seen": 44718400, + "step": 77030 + }, + { + "epoch": 11.473786118558236, + "grad_norm": 0.1494140625, + "learning_rate": 0.013761432533355912, + "loss": 0.7991, + "num_input_tokens_seen": 44721504, + "step": 77035 + }, + { + "epoch": 11.474530831099196, + "grad_norm": 0.03515625, + "learning_rate": 0.01375948954868188, + "loss": 0.7795, + "num_input_tokens_seen": 44724352, + "step": 77040 + }, + { + "epoch": 11.475275543640155, + "grad_norm": 0.1298828125, + "learning_rate": 0.013757546584965011, + "loss": 0.7838, + "num_input_tokens_seen": 44727296, + "step": 77045 + }, + { + "epoch": 11.476020256181114, + "grad_norm": 0.2470703125, + "learning_rate": 0.013755603642238122, + "loss": 0.8436, + "num_input_tokens_seen": 44730176, + "step": 77050 + }, + { + "epoch": 11.476764968722073, + "grad_norm": 0.06787109375, + "learning_rate": 0.013753660720534052, + "loss": 0.8155, + "num_input_tokens_seen": 44733184, + "step": 77055 + }, + { + "epoch": 11.477509681263033, + "grad_norm": 0.078125, + "learning_rate": 0.013751717819885616, + "loss": 0.7943, + "num_input_tokens_seen": 44735872, + "step": 77060 + }, + { + "epoch": 11.478254393803992, + "grad_norm": 0.035400390625, + "learning_rate": 0.01374977494032564, + "loss": 0.798, + "num_input_tokens_seen": 44738784, + "step": 77065 + }, + { + "epoch": 11.47899910634495, + "grad_norm": 0.038330078125, + "learning_rate": 0.01374783208188694, + "loss": 0.8018, + "num_input_tokens_seen": 44741888, + "step": 77070 + }, + { + "epoch": 11.47974381888591, + "grad_norm": 0.07666015625, + "learning_rate": 0.01374588924460235, + "loss": 0.7801, + "num_input_tokens_seen": 44745024, + "step": 77075 + }, + { + "epoch": 11.48048853142687, + "grad_norm": 0.03515625, + "learning_rate": 0.013743946428504685, + "loss": 0.8315, + "num_input_tokens_seen": 44748288, + "step": 77080 + }, + { + "epoch": 11.481233243967829, + "grad_norm": 0.054931640625, + "learning_rate": 0.01374200363362677, + "loss": 0.7817, + "num_input_tokens_seen": 44751168, + "step": 77085 + }, + { + "epoch": 11.481977956508787, + "grad_norm": 0.068359375, + "learning_rate": 0.013740060860001423, + "loss": 0.7568, + "num_input_tokens_seen": 44754240, + "step": 77090 + }, + { + "epoch": 11.482722669049746, + "grad_norm": 0.045166015625, + "learning_rate": 0.013738118107661465, + "loss": 0.7832, + "num_input_tokens_seen": 44756736, + "step": 77095 + }, + { + "epoch": 11.483467381590707, + "grad_norm": 0.087890625, + "learning_rate": 0.013736175376639724, + "loss": 0.7988, + "num_input_tokens_seen": 44759744, + "step": 77100 + }, + { + "epoch": 11.484212094131665, + "grad_norm": 0.046875, + "learning_rate": 0.01373423266696901, + "loss": 0.7313, + "num_input_tokens_seen": 44762688, + "step": 77105 + }, + { + "epoch": 11.484956806672624, + "grad_norm": 0.04736328125, + "learning_rate": 0.013732289978682154, + "loss": 0.8334, + "num_input_tokens_seen": 44765440, + "step": 77110 + }, + { + "epoch": 11.485701519213583, + "grad_norm": 0.052978515625, + "learning_rate": 0.013730347311811967, + "loss": 0.7753, + "num_input_tokens_seen": 44768256, + "step": 77115 + }, + { + "epoch": 11.486446231754543, + "grad_norm": 0.038330078125, + "learning_rate": 0.013728404666391275, + "loss": 0.8086, + "num_input_tokens_seen": 44771136, + "step": 77120 + }, + { + "epoch": 11.487190944295502, + "grad_norm": 0.0498046875, + "learning_rate": 0.013726462042452894, + "loss": 0.8238, + "num_input_tokens_seen": 44774240, + "step": 77125 + }, + { + "epoch": 11.487935656836461, + "grad_norm": 0.06982421875, + "learning_rate": 0.013724519440029637, + "loss": 0.8103, + "num_input_tokens_seen": 44776832, + "step": 77130 + }, + { + "epoch": 11.48868036937742, + "grad_norm": 0.044921875, + "learning_rate": 0.013722576859154333, + "loss": 0.7913, + "num_input_tokens_seen": 44779424, + "step": 77135 + }, + { + "epoch": 11.48942508191838, + "grad_norm": 0.0208740234375, + "learning_rate": 0.013720634299859789, + "loss": 0.8009, + "num_input_tokens_seen": 44782272, + "step": 77140 + }, + { + "epoch": 11.490169794459339, + "grad_norm": 0.0361328125, + "learning_rate": 0.013718691762178834, + "loss": 0.8054, + "num_input_tokens_seen": 44785280, + "step": 77145 + }, + { + "epoch": 11.490914507000298, + "grad_norm": 0.051513671875, + "learning_rate": 0.013716749246144275, + "loss": 0.8209, + "num_input_tokens_seen": 44788160, + "step": 77150 + }, + { + "epoch": 11.491659219541257, + "grad_norm": 0.030517578125, + "learning_rate": 0.013714806751788938, + "loss": 0.7918, + "num_input_tokens_seen": 44790848, + "step": 77155 + }, + { + "epoch": 11.492403932082217, + "grad_norm": 0.039794921875, + "learning_rate": 0.013712864279145628, + "loss": 0.8047, + "num_input_tokens_seen": 44793952, + "step": 77160 + }, + { + "epoch": 11.493148644623176, + "grad_norm": 0.0546875, + "learning_rate": 0.013710921828247175, + "loss": 0.7929, + "num_input_tokens_seen": 44796864, + "step": 77165 + }, + { + "epoch": 11.493893357164135, + "grad_norm": 0.043701171875, + "learning_rate": 0.013708979399126386, + "loss": 0.8008, + "num_input_tokens_seen": 44799872, + "step": 77170 + }, + { + "epoch": 11.494638069705093, + "grad_norm": 0.051513671875, + "learning_rate": 0.013707036991816073, + "loss": 0.8202, + "num_input_tokens_seen": 44802688, + "step": 77175 + }, + { + "epoch": 11.495382782246054, + "grad_norm": 0.043212890625, + "learning_rate": 0.01370509460634906, + "loss": 0.8334, + "num_input_tokens_seen": 44805440, + "step": 77180 + }, + { + "epoch": 11.496127494787013, + "grad_norm": 0.05078125, + "learning_rate": 0.013703152242758157, + "loss": 0.8087, + "num_input_tokens_seen": 44808352, + "step": 77185 + }, + { + "epoch": 11.496872207327971, + "grad_norm": 0.042236328125, + "learning_rate": 0.013701209901076181, + "loss": 0.7697, + "num_input_tokens_seen": 44811136, + "step": 77190 + }, + { + "epoch": 11.49761691986893, + "grad_norm": 0.019775390625, + "learning_rate": 0.013699267581335937, + "loss": 0.7962, + "num_input_tokens_seen": 44813984, + "step": 77195 + }, + { + "epoch": 11.498361632409889, + "grad_norm": 0.029541015625, + "learning_rate": 0.013697325283570252, + "loss": 0.8146, + "num_input_tokens_seen": 44816544, + "step": 77200 + }, + { + "epoch": 11.49910634495085, + "grad_norm": 0.0263671875, + "learning_rate": 0.013695383007811932, + "loss": 0.8195, + "num_input_tokens_seen": 44819488, + "step": 77205 + }, + { + "epoch": 11.499851057491808, + "grad_norm": 0.027099609375, + "learning_rate": 0.013693440754093783, + "loss": 0.8417, + "num_input_tokens_seen": 44822432, + "step": 77210 + }, + { + "epoch": 11.500595770032767, + "grad_norm": 0.043212890625, + "learning_rate": 0.013691498522448633, + "loss": 0.7886, + "num_input_tokens_seen": 44825408, + "step": 77215 + }, + { + "epoch": 11.501340482573726, + "grad_norm": 0.03662109375, + "learning_rate": 0.013689556312909282, + "loss": 0.8027, + "num_input_tokens_seen": 44828256, + "step": 77220 + }, + { + "epoch": 11.502085195114686, + "grad_norm": 0.033447265625, + "learning_rate": 0.013687614125508547, + "loss": 0.8123, + "num_input_tokens_seen": 44831360, + "step": 77225 + }, + { + "epoch": 11.502829907655645, + "grad_norm": 0.0390625, + "learning_rate": 0.013685671960279233, + "loss": 0.8061, + "num_input_tokens_seen": 44834176, + "step": 77230 + }, + { + "epoch": 11.503574620196604, + "grad_norm": 0.04541015625, + "learning_rate": 0.013683729817254159, + "loss": 0.8049, + "num_input_tokens_seen": 44836864, + "step": 77235 + }, + { + "epoch": 11.504319332737563, + "grad_norm": 0.051513671875, + "learning_rate": 0.013681787696466127, + "loss": 0.7941, + "num_input_tokens_seen": 44839680, + "step": 77240 + }, + { + "epoch": 11.505064045278523, + "grad_norm": 0.056640625, + "learning_rate": 0.01367984559794796, + "loss": 0.7933, + "num_input_tokens_seen": 44842912, + "step": 77245 + }, + { + "epoch": 11.505808757819482, + "grad_norm": 0.036865234375, + "learning_rate": 0.013677903521732458, + "loss": 0.8177, + "num_input_tokens_seen": 44846016, + "step": 77250 + }, + { + "epoch": 11.50655347036044, + "grad_norm": 0.03369140625, + "learning_rate": 0.013675961467852429, + "loss": 0.7988, + "num_input_tokens_seen": 44849120, + "step": 77255 + }, + { + "epoch": 11.5072981829014, + "grad_norm": 0.045654296875, + "learning_rate": 0.01367401943634069, + "loss": 0.7953, + "num_input_tokens_seen": 44852000, + "step": 77260 + }, + { + "epoch": 11.50804289544236, + "grad_norm": 0.044921875, + "learning_rate": 0.013672077427230037, + "loss": 0.819, + "num_input_tokens_seen": 44854912, + "step": 77265 + }, + { + "epoch": 11.508787607983319, + "grad_norm": 0.055419921875, + "learning_rate": 0.013670135440553292, + "loss": 0.7809, + "num_input_tokens_seen": 44857760, + "step": 77270 + }, + { + "epoch": 11.509532320524277, + "grad_norm": 0.0419921875, + "learning_rate": 0.013668193476343253, + "loss": 0.807, + "num_input_tokens_seen": 44860512, + "step": 77275 + }, + { + "epoch": 11.510277033065236, + "grad_norm": 0.0703125, + "learning_rate": 0.013666251534632738, + "loss": 0.8254, + "num_input_tokens_seen": 44863680, + "step": 77280 + }, + { + "epoch": 11.511021745606197, + "grad_norm": 0.048095703125, + "learning_rate": 0.013664309615454544, + "loss": 0.7958, + "num_input_tokens_seen": 44866528, + "step": 77285 + }, + { + "epoch": 11.511766458147155, + "grad_norm": 0.06787109375, + "learning_rate": 0.013662367718841483, + "loss": 0.7988, + "num_input_tokens_seen": 44869568, + "step": 77290 + }, + { + "epoch": 11.512511170688114, + "grad_norm": 0.05029296875, + "learning_rate": 0.01366042584482636, + "loss": 0.8013, + "num_input_tokens_seen": 44872608, + "step": 77295 + }, + { + "epoch": 11.513255883229073, + "grad_norm": 0.06689453125, + "learning_rate": 0.013658483993441977, + "loss": 0.7943, + "num_input_tokens_seen": 44875680, + "step": 77300 + }, + { + "epoch": 11.514000595770034, + "grad_norm": 0.031982421875, + "learning_rate": 0.013656542164721147, + "loss": 0.8062, + "num_input_tokens_seen": 44878400, + "step": 77305 + }, + { + "epoch": 11.514745308310992, + "grad_norm": 0.02978515625, + "learning_rate": 0.013654600358696667, + "loss": 0.7806, + "num_input_tokens_seen": 44881184, + "step": 77310 + }, + { + "epoch": 11.515490020851951, + "grad_norm": 0.031982421875, + "learning_rate": 0.013652658575401353, + "loss": 0.8193, + "num_input_tokens_seen": 44884384, + "step": 77315 + }, + { + "epoch": 11.51623473339291, + "grad_norm": 0.03125, + "learning_rate": 0.013650716814868, + "loss": 0.8002, + "num_input_tokens_seen": 44887264, + "step": 77320 + }, + { + "epoch": 11.51697944593387, + "grad_norm": 0.03271484375, + "learning_rate": 0.013648775077129418, + "loss": 0.7743, + "num_input_tokens_seen": 44889952, + "step": 77325 + }, + { + "epoch": 11.517724158474829, + "grad_norm": 0.056396484375, + "learning_rate": 0.013646833362218407, + "loss": 0.8143, + "num_input_tokens_seen": 44893152, + "step": 77330 + }, + { + "epoch": 11.518468871015788, + "grad_norm": 0.032958984375, + "learning_rate": 0.013644891670167766, + "loss": 0.7922, + "num_input_tokens_seen": 44896128, + "step": 77335 + }, + { + "epoch": 11.519213583556747, + "grad_norm": 0.04443359375, + "learning_rate": 0.013642950001010307, + "loss": 0.7963, + "num_input_tokens_seen": 44898976, + "step": 77340 + }, + { + "epoch": 11.519958296097705, + "grad_norm": 0.033447265625, + "learning_rate": 0.013641008354778824, + "loss": 0.7991, + "num_input_tokens_seen": 44901536, + "step": 77345 + }, + { + "epoch": 11.520703008638666, + "grad_norm": 0.05712890625, + "learning_rate": 0.01363906673150613, + "loss": 0.8039, + "num_input_tokens_seen": 44904256, + "step": 77350 + }, + { + "epoch": 11.521447721179625, + "grad_norm": 0.030029296875, + "learning_rate": 0.013637125131225017, + "loss": 0.8029, + "num_input_tokens_seen": 44907104, + "step": 77355 + }, + { + "epoch": 11.522192433720583, + "grad_norm": 0.048095703125, + "learning_rate": 0.013635183553968293, + "loss": 0.814, + "num_input_tokens_seen": 44909856, + "step": 77360 + }, + { + "epoch": 11.522937146261544, + "grad_norm": 0.0439453125, + "learning_rate": 0.013633241999768755, + "loss": 0.8168, + "num_input_tokens_seen": 44912928, + "step": 77365 + }, + { + "epoch": 11.523681858802503, + "grad_norm": 0.036865234375, + "learning_rate": 0.013631300468659199, + "loss": 0.8023, + "num_input_tokens_seen": 44915648, + "step": 77370 + }, + { + "epoch": 11.524426571343461, + "grad_norm": 0.036865234375, + "learning_rate": 0.013629358960672435, + "loss": 0.7954, + "num_input_tokens_seen": 44918496, + "step": 77375 + }, + { + "epoch": 11.52517128388442, + "grad_norm": 0.09765625, + "learning_rate": 0.013627417475841255, + "loss": 0.8089, + "num_input_tokens_seen": 44921312, + "step": 77380 + }, + { + "epoch": 11.525915996425379, + "grad_norm": 0.03515625, + "learning_rate": 0.013625476014198466, + "loss": 0.7726, + "num_input_tokens_seen": 44924160, + "step": 77385 + }, + { + "epoch": 11.52666070896634, + "grad_norm": 0.041259765625, + "learning_rate": 0.01362353457577686, + "loss": 0.8057, + "num_input_tokens_seen": 44926944, + "step": 77390 + }, + { + "epoch": 11.527405421507298, + "grad_norm": 0.037109375, + "learning_rate": 0.01362159316060924, + "loss": 0.8038, + "num_input_tokens_seen": 44929856, + "step": 77395 + }, + { + "epoch": 11.528150134048257, + "grad_norm": 0.033935546875, + "learning_rate": 0.013619651768728399, + "loss": 0.8013, + "num_input_tokens_seen": 44932832, + "step": 77400 + }, + { + "epoch": 11.528894846589216, + "grad_norm": 0.0341796875, + "learning_rate": 0.013617710400167142, + "loss": 0.8093, + "num_input_tokens_seen": 44936064, + "step": 77405 + }, + { + "epoch": 11.529639559130176, + "grad_norm": 0.029052734375, + "learning_rate": 0.013615769054958265, + "loss": 0.8067, + "num_input_tokens_seen": 44938944, + "step": 77410 + }, + { + "epoch": 11.530384271671135, + "grad_norm": 0.044921875, + "learning_rate": 0.01361382773313456, + "loss": 0.8162, + "num_input_tokens_seen": 44941504, + "step": 77415 + }, + { + "epoch": 11.531128984212094, + "grad_norm": 0.033935546875, + "learning_rate": 0.013611886434728826, + "loss": 0.7948, + "num_input_tokens_seen": 44944352, + "step": 77420 + }, + { + "epoch": 11.531873696753053, + "grad_norm": 0.0224609375, + "learning_rate": 0.013609945159773859, + "loss": 0.8009, + "num_input_tokens_seen": 44947072, + "step": 77425 + }, + { + "epoch": 11.532618409294013, + "grad_norm": 0.03564453125, + "learning_rate": 0.01360800390830246, + "loss": 0.7899, + "num_input_tokens_seen": 44950208, + "step": 77430 + }, + { + "epoch": 11.533363121834972, + "grad_norm": 0.037841796875, + "learning_rate": 0.013606062680347412, + "loss": 0.7903, + "num_input_tokens_seen": 44953024, + "step": 77435 + }, + { + "epoch": 11.53410783437593, + "grad_norm": 0.0213623046875, + "learning_rate": 0.013604121475941526, + "loss": 0.8096, + "num_input_tokens_seen": 44955872, + "step": 77440 + }, + { + "epoch": 11.53485254691689, + "grad_norm": 0.1123046875, + "learning_rate": 0.013602180295117585, + "loss": 0.7806, + "num_input_tokens_seen": 44958720, + "step": 77445 + }, + { + "epoch": 11.53559725945785, + "grad_norm": 0.029296875, + "learning_rate": 0.013600239137908393, + "loss": 0.8182, + "num_input_tokens_seen": 44961408, + "step": 77450 + }, + { + "epoch": 11.536341971998809, + "grad_norm": 0.033447265625, + "learning_rate": 0.013598298004346735, + "loss": 0.7954, + "num_input_tokens_seen": 44964288, + "step": 77455 + }, + { + "epoch": 11.537086684539767, + "grad_norm": 0.0341796875, + "learning_rate": 0.013596356894465403, + "loss": 0.7734, + "num_input_tokens_seen": 44967040, + "step": 77460 + }, + { + "epoch": 11.537831397080726, + "grad_norm": 0.05615234375, + "learning_rate": 0.0135944158082972, + "loss": 0.7872, + "num_input_tokens_seen": 44969952, + "step": 77465 + }, + { + "epoch": 11.538576109621687, + "grad_norm": 0.0341796875, + "learning_rate": 0.013592474745874909, + "loss": 0.8199, + "num_input_tokens_seen": 44972896, + "step": 77470 + }, + { + "epoch": 11.539320822162646, + "grad_norm": 0.033203125, + "learning_rate": 0.013590533707231331, + "loss": 0.7851, + "num_input_tokens_seen": 44975840, + "step": 77475 + }, + { + "epoch": 11.540065534703604, + "grad_norm": 0.031494140625, + "learning_rate": 0.01358859269239925, + "loss": 0.7837, + "num_input_tokens_seen": 44978688, + "step": 77480 + }, + { + "epoch": 11.540810247244563, + "grad_norm": 0.025634765625, + "learning_rate": 0.013586651701411465, + "loss": 0.8027, + "num_input_tokens_seen": 44981600, + "step": 77485 + }, + { + "epoch": 11.541554959785524, + "grad_norm": 0.028076171875, + "learning_rate": 0.013584710734300764, + "loss": 0.7834, + "num_input_tokens_seen": 44984544, + "step": 77490 + }, + { + "epoch": 11.542299672326482, + "grad_norm": 0.07568359375, + "learning_rate": 0.01358276979109993, + "loss": 0.7921, + "num_input_tokens_seen": 44987232, + "step": 77495 + }, + { + "epoch": 11.543044384867441, + "grad_norm": 0.0361328125, + "learning_rate": 0.013580828871841765, + "loss": 0.8055, + "num_input_tokens_seen": 44990272, + "step": 77500 + }, + { + "epoch": 11.5437890974084, + "grad_norm": 0.03662109375, + "learning_rate": 0.013578887976559048, + "loss": 0.781, + "num_input_tokens_seen": 44993472, + "step": 77505 + }, + { + "epoch": 11.54453380994936, + "grad_norm": 0.0263671875, + "learning_rate": 0.01357694710528458, + "loss": 0.7948, + "num_input_tokens_seen": 44996256, + "step": 77510 + }, + { + "epoch": 11.54527852249032, + "grad_norm": 0.02783203125, + "learning_rate": 0.013575006258051142, + "loss": 0.7672, + "num_input_tokens_seen": 44999232, + "step": 77515 + }, + { + "epoch": 11.546023235031278, + "grad_norm": 0.1416015625, + "learning_rate": 0.01357306543489153, + "loss": 0.8476, + "num_input_tokens_seen": 45001952, + "step": 77520 + }, + { + "epoch": 11.546767947572237, + "grad_norm": 0.0478515625, + "learning_rate": 0.013571124635838518, + "loss": 0.8025, + "num_input_tokens_seen": 45004864, + "step": 77525 + }, + { + "epoch": 11.547512660113195, + "grad_norm": 0.0283203125, + "learning_rate": 0.01356918386092491, + "loss": 0.8201, + "num_input_tokens_seen": 45007584, + "step": 77530 + }, + { + "epoch": 11.548257372654156, + "grad_norm": 0.040771484375, + "learning_rate": 0.013567243110183487, + "loss": 0.8313, + "num_input_tokens_seen": 45010464, + "step": 77535 + }, + { + "epoch": 11.549002085195115, + "grad_norm": 0.0308837890625, + "learning_rate": 0.013565302383647029, + "loss": 0.811, + "num_input_tokens_seen": 45013504, + "step": 77540 + }, + { + "epoch": 11.549746797736073, + "grad_norm": 0.0228271484375, + "learning_rate": 0.013563361681348335, + "loss": 0.8172, + "num_input_tokens_seen": 45016544, + "step": 77545 + }, + { + "epoch": 11.550491510277032, + "grad_norm": 0.025146484375, + "learning_rate": 0.013561421003320182, + "loss": 0.8292, + "num_input_tokens_seen": 45019456, + "step": 77550 + }, + { + "epoch": 11.551236222817993, + "grad_norm": 0.023681640625, + "learning_rate": 0.013559480349595364, + "loss": 0.7876, + "num_input_tokens_seen": 45022464, + "step": 77555 + }, + { + "epoch": 11.551980935358952, + "grad_norm": 0.0302734375, + "learning_rate": 0.013557539720206652, + "loss": 0.7802, + "num_input_tokens_seen": 45025664, + "step": 77560 + }, + { + "epoch": 11.55272564789991, + "grad_norm": 0.0308837890625, + "learning_rate": 0.01355559911518685, + "loss": 0.7941, + "num_input_tokens_seen": 45028512, + "step": 77565 + }, + { + "epoch": 11.553470360440869, + "grad_norm": 0.0283203125, + "learning_rate": 0.01355365853456873, + "loss": 0.8078, + "num_input_tokens_seen": 45031584, + "step": 77570 + }, + { + "epoch": 11.55421507298183, + "grad_norm": 0.043701171875, + "learning_rate": 0.013551717978385074, + "loss": 0.8016, + "num_input_tokens_seen": 45034560, + "step": 77575 + }, + { + "epoch": 11.554959785522788, + "grad_norm": 0.0299072265625, + "learning_rate": 0.013549777446668677, + "loss": 0.795, + "num_input_tokens_seen": 45037504, + "step": 77580 + }, + { + "epoch": 11.555704498063747, + "grad_norm": 0.03955078125, + "learning_rate": 0.013547836939452313, + "loss": 0.7797, + "num_input_tokens_seen": 45040352, + "step": 77585 + }, + { + "epoch": 11.556449210604706, + "grad_norm": 0.0194091796875, + "learning_rate": 0.013545896456768772, + "loss": 0.789, + "num_input_tokens_seen": 45043328, + "step": 77590 + }, + { + "epoch": 11.557193923145666, + "grad_norm": 0.0269775390625, + "learning_rate": 0.013543955998650825, + "loss": 0.8163, + "num_input_tokens_seen": 45047488, + "step": 77595 + }, + { + "epoch": 11.557938635686625, + "grad_norm": 0.032958984375, + "learning_rate": 0.01354201556513127, + "loss": 0.7829, + "num_input_tokens_seen": 45050336, + "step": 77600 + }, + { + "epoch": 11.558683348227584, + "grad_norm": 0.045654296875, + "learning_rate": 0.013540075156242871, + "loss": 0.8147, + "num_input_tokens_seen": 45053472, + "step": 77605 + }, + { + "epoch": 11.559428060768543, + "grad_norm": 0.0390625, + "learning_rate": 0.013538134772018425, + "loss": 0.8067, + "num_input_tokens_seen": 45056192, + "step": 77610 + }, + { + "epoch": 11.560172773309503, + "grad_norm": 0.020751953125, + "learning_rate": 0.013536194412490708, + "loss": 0.8291, + "num_input_tokens_seen": 45058880, + "step": 77615 + }, + { + "epoch": 11.560917485850462, + "grad_norm": 0.042724609375, + "learning_rate": 0.013534254077692496, + "loss": 0.8019, + "num_input_tokens_seen": 45061664, + "step": 77620 + }, + { + "epoch": 11.56166219839142, + "grad_norm": 0.0194091796875, + "learning_rate": 0.013532313767656573, + "loss": 0.7745, + "num_input_tokens_seen": 45064416, + "step": 77625 + }, + { + "epoch": 11.56240691093238, + "grad_norm": 0.027099609375, + "learning_rate": 0.013530373482415712, + "loss": 0.7802, + "num_input_tokens_seen": 45067232, + "step": 77630 + }, + { + "epoch": 11.56315162347334, + "grad_norm": 0.02978515625, + "learning_rate": 0.013528433222002702, + "loss": 0.7994, + "num_input_tokens_seen": 45070112, + "step": 77635 + }, + { + "epoch": 11.563896336014299, + "grad_norm": 0.037841796875, + "learning_rate": 0.013526492986450311, + "loss": 0.7904, + "num_input_tokens_seen": 45073440, + "step": 77640 + }, + { + "epoch": 11.564641048555258, + "grad_norm": 0.0498046875, + "learning_rate": 0.013524552775791331, + "loss": 0.7975, + "num_input_tokens_seen": 45076416, + "step": 77645 + }, + { + "epoch": 11.565385761096216, + "grad_norm": 0.04833984375, + "learning_rate": 0.013522612590058531, + "loss": 0.8313, + "num_input_tokens_seen": 45079424, + "step": 77650 + }, + { + "epoch": 11.566130473637177, + "grad_norm": 0.0208740234375, + "learning_rate": 0.013520672429284689, + "loss": 0.8096, + "num_input_tokens_seen": 45082368, + "step": 77655 + }, + { + "epoch": 11.566875186178136, + "grad_norm": 0.03662109375, + "learning_rate": 0.013518732293502584, + "loss": 0.787, + "num_input_tokens_seen": 45085280, + "step": 77660 + }, + { + "epoch": 11.567619898719094, + "grad_norm": 0.0267333984375, + "learning_rate": 0.013516792182744986, + "loss": 0.791, + "num_input_tokens_seen": 45088416, + "step": 77665 + }, + { + "epoch": 11.568364611260053, + "grad_norm": 0.0302734375, + "learning_rate": 0.013514852097044682, + "loss": 0.788, + "num_input_tokens_seen": 45091520, + "step": 77670 + }, + { + "epoch": 11.569109323801012, + "grad_norm": 0.046142578125, + "learning_rate": 0.013512912036434436, + "loss": 0.8124, + "num_input_tokens_seen": 45094368, + "step": 77675 + }, + { + "epoch": 11.569854036341972, + "grad_norm": 0.0272216796875, + "learning_rate": 0.013510972000947035, + "loss": 0.8074, + "num_input_tokens_seen": 45097184, + "step": 77680 + }, + { + "epoch": 11.570598748882931, + "grad_norm": 0.05078125, + "learning_rate": 0.013509031990615247, + "loss": 0.8177, + "num_input_tokens_seen": 45100000, + "step": 77685 + }, + { + "epoch": 11.57134346142389, + "grad_norm": 0.0299072265625, + "learning_rate": 0.013507092005471852, + "loss": 0.8191, + "num_input_tokens_seen": 45102848, + "step": 77690 + }, + { + "epoch": 11.57208817396485, + "grad_norm": 0.0264892578125, + "learning_rate": 0.01350515204554962, + "loss": 0.8085, + "num_input_tokens_seen": 45105472, + "step": 77695 + }, + { + "epoch": 11.57283288650581, + "grad_norm": 0.10205078125, + "learning_rate": 0.013503212110881319, + "loss": 0.8305, + "num_input_tokens_seen": 45108384, + "step": 77700 + }, + { + "epoch": 11.573577599046768, + "grad_norm": 0.035888671875, + "learning_rate": 0.013501272201499733, + "loss": 0.7909, + "num_input_tokens_seen": 45111680, + "step": 77705 + }, + { + "epoch": 11.574322311587727, + "grad_norm": 0.036376953125, + "learning_rate": 0.013499332317437622, + "loss": 0.8307, + "num_input_tokens_seen": 45114304, + "step": 77710 + }, + { + "epoch": 11.575067024128685, + "grad_norm": 0.040283203125, + "learning_rate": 0.013497392458727777, + "loss": 0.7944, + "num_input_tokens_seen": 45117088, + "step": 77715 + }, + { + "epoch": 11.575811736669646, + "grad_norm": 0.0279541015625, + "learning_rate": 0.013495452625402953, + "loss": 0.7959, + "num_input_tokens_seen": 45120000, + "step": 77720 + }, + { + "epoch": 11.576556449210605, + "grad_norm": 0.03466796875, + "learning_rate": 0.013493512817495931, + "loss": 0.7954, + "num_input_tokens_seen": 45122976, + "step": 77725 + }, + { + "epoch": 11.577301161751564, + "grad_norm": 0.04296875, + "learning_rate": 0.01349157303503948, + "loss": 0.7854, + "num_input_tokens_seen": 45125952, + "step": 77730 + }, + { + "epoch": 11.578045874292522, + "grad_norm": 0.0223388671875, + "learning_rate": 0.013489633278066362, + "loss": 0.7889, + "num_input_tokens_seen": 45128736, + "step": 77735 + }, + { + "epoch": 11.578790586833483, + "grad_norm": 0.029052734375, + "learning_rate": 0.013487693546609362, + "loss": 0.8166, + "num_input_tokens_seen": 45131840, + "step": 77740 + }, + { + "epoch": 11.579535299374442, + "grad_norm": 0.037353515625, + "learning_rate": 0.013485753840701241, + "loss": 0.8179, + "num_input_tokens_seen": 45135296, + "step": 77745 + }, + { + "epoch": 11.5802800119154, + "grad_norm": 0.035400390625, + "learning_rate": 0.013483814160374766, + "loss": 0.8009, + "num_input_tokens_seen": 45138240, + "step": 77750 + }, + { + "epoch": 11.581024724456359, + "grad_norm": 0.0223388671875, + "learning_rate": 0.013481874505662713, + "loss": 0.7825, + "num_input_tokens_seen": 45141088, + "step": 77755 + }, + { + "epoch": 11.58176943699732, + "grad_norm": 0.03369140625, + "learning_rate": 0.013479934876597849, + "loss": 0.8003, + "num_input_tokens_seen": 45144288, + "step": 77760 + }, + { + "epoch": 11.582514149538278, + "grad_norm": 0.0257568359375, + "learning_rate": 0.013477995273212934, + "loss": 0.8016, + "num_input_tokens_seen": 45147264, + "step": 77765 + }, + { + "epoch": 11.583258862079237, + "grad_norm": 0.03466796875, + "learning_rate": 0.013476055695540749, + "loss": 0.7975, + "num_input_tokens_seen": 45150176, + "step": 77770 + }, + { + "epoch": 11.584003574620196, + "grad_norm": 0.036376953125, + "learning_rate": 0.013474116143614052, + "loss": 0.803, + "num_input_tokens_seen": 45152992, + "step": 77775 + }, + { + "epoch": 11.584748287161156, + "grad_norm": 0.044189453125, + "learning_rate": 0.01347217661746561, + "loss": 0.7883, + "num_input_tokens_seen": 45155808, + "step": 77780 + }, + { + "epoch": 11.585492999702115, + "grad_norm": 0.047119140625, + "learning_rate": 0.013470237117128196, + "loss": 0.8383, + "num_input_tokens_seen": 45158528, + "step": 77785 + }, + { + "epoch": 11.586237712243074, + "grad_norm": 0.033935546875, + "learning_rate": 0.013468297642634564, + "loss": 0.8074, + "num_input_tokens_seen": 45161280, + "step": 77790 + }, + { + "epoch": 11.586982424784033, + "grad_norm": 0.0341796875, + "learning_rate": 0.013466358194017492, + "loss": 0.7814, + "num_input_tokens_seen": 45164160, + "step": 77795 + }, + { + "epoch": 11.587727137324993, + "grad_norm": 0.031494140625, + "learning_rate": 0.013464418771309734, + "loss": 0.8017, + "num_input_tokens_seen": 45167200, + "step": 77800 + }, + { + "epoch": 11.588471849865952, + "grad_norm": 0.040283203125, + "learning_rate": 0.013462479374544065, + "loss": 0.7949, + "num_input_tokens_seen": 45169856, + "step": 77805 + }, + { + "epoch": 11.58921656240691, + "grad_norm": 0.031005859375, + "learning_rate": 0.013460540003753246, + "loss": 0.8018, + "num_input_tokens_seen": 45172544, + "step": 77810 + }, + { + "epoch": 11.58996127494787, + "grad_norm": 0.027099609375, + "learning_rate": 0.013458600658970036, + "loss": 0.7868, + "num_input_tokens_seen": 45175616, + "step": 77815 + }, + { + "epoch": 11.59070598748883, + "grad_norm": 0.042236328125, + "learning_rate": 0.013456661340227202, + "loss": 0.7893, + "num_input_tokens_seen": 45178560, + "step": 77820 + }, + { + "epoch": 11.591450700029789, + "grad_norm": 0.03857421875, + "learning_rate": 0.013454722047557502, + "loss": 0.791, + "num_input_tokens_seen": 45181376, + "step": 77825 + }, + { + "epoch": 11.592195412570748, + "grad_norm": 0.054931640625, + "learning_rate": 0.013452782780993709, + "loss": 0.8014, + "num_input_tokens_seen": 45184160, + "step": 77830 + }, + { + "epoch": 11.592940125111706, + "grad_norm": 0.042724609375, + "learning_rate": 0.013450843540568571, + "loss": 0.7805, + "num_input_tokens_seen": 45187104, + "step": 77835 + }, + { + "epoch": 11.593684837652667, + "grad_norm": 0.036865234375, + "learning_rate": 0.013448904326314865, + "loss": 0.796, + "num_input_tokens_seen": 45189984, + "step": 77840 + }, + { + "epoch": 11.594429550193626, + "grad_norm": 0.035400390625, + "learning_rate": 0.013446965138265338, + "loss": 0.7733, + "num_input_tokens_seen": 45192832, + "step": 77845 + }, + { + "epoch": 11.595174262734584, + "grad_norm": 0.029296875, + "learning_rate": 0.013445025976452762, + "loss": 0.7717, + "num_input_tokens_seen": 45195680, + "step": 77850 + }, + { + "epoch": 11.595918975275543, + "grad_norm": 0.030029296875, + "learning_rate": 0.01344308684090989, + "loss": 0.8062, + "num_input_tokens_seen": 45198432, + "step": 77855 + }, + { + "epoch": 11.596663687816502, + "grad_norm": 0.04736328125, + "learning_rate": 0.013441147731669477, + "loss": 0.8216, + "num_input_tokens_seen": 45201152, + "step": 77860 + }, + { + "epoch": 11.597408400357462, + "grad_norm": 0.04052734375, + "learning_rate": 0.013439208648764297, + "loss": 0.8002, + "num_input_tokens_seen": 45204288, + "step": 77865 + }, + { + "epoch": 11.598153112898421, + "grad_norm": 0.03173828125, + "learning_rate": 0.013437269592227092, + "loss": 0.8368, + "num_input_tokens_seen": 45207360, + "step": 77870 + }, + { + "epoch": 11.59889782543938, + "grad_norm": 0.03466796875, + "learning_rate": 0.013435330562090637, + "loss": 0.7866, + "num_input_tokens_seen": 45210528, + "step": 77875 + }, + { + "epoch": 11.59964253798034, + "grad_norm": 0.047607421875, + "learning_rate": 0.01343339155838768, + "loss": 0.7894, + "num_input_tokens_seen": 45213024, + "step": 77880 + }, + { + "epoch": 11.6003872505213, + "grad_norm": 0.03173828125, + "learning_rate": 0.01343145258115098, + "loss": 0.7985, + "num_input_tokens_seen": 45215776, + "step": 77885 + }, + { + "epoch": 11.601131963062258, + "grad_norm": 0.0322265625, + "learning_rate": 0.013429513630413297, + "loss": 0.8237, + "num_input_tokens_seen": 45218880, + "step": 77890 + }, + { + "epoch": 11.601876675603217, + "grad_norm": 0.02685546875, + "learning_rate": 0.013427574706207377, + "loss": 0.8283, + "num_input_tokens_seen": 45222112, + "step": 77895 + }, + { + "epoch": 11.602621388144176, + "grad_norm": 0.0269775390625, + "learning_rate": 0.013425635808565992, + "loss": 0.7754, + "num_input_tokens_seen": 45225024, + "step": 77900 + }, + { + "epoch": 11.603366100685136, + "grad_norm": 0.032958984375, + "learning_rate": 0.013423696937521884, + "loss": 0.7953, + "num_input_tokens_seen": 45227776, + "step": 77905 + }, + { + "epoch": 11.604110813226095, + "grad_norm": 0.0478515625, + "learning_rate": 0.013421758093107819, + "loss": 0.8326, + "num_input_tokens_seen": 45230656, + "step": 77910 + }, + { + "epoch": 11.604855525767054, + "grad_norm": 0.0390625, + "learning_rate": 0.013419819275356543, + "loss": 0.7831, + "num_input_tokens_seen": 45233440, + "step": 77915 + }, + { + "epoch": 11.605600238308012, + "grad_norm": 0.045166015625, + "learning_rate": 0.01341788048430082, + "loss": 0.7827, + "num_input_tokens_seen": 45236416, + "step": 77920 + }, + { + "epoch": 11.606344950848973, + "grad_norm": 0.033447265625, + "learning_rate": 0.013415941719973389, + "loss": 0.806, + "num_input_tokens_seen": 45239424, + "step": 77925 + }, + { + "epoch": 11.607089663389932, + "grad_norm": 0.0272216796875, + "learning_rate": 0.013414002982407022, + "loss": 0.8031, + "num_input_tokens_seen": 45242336, + "step": 77930 + }, + { + "epoch": 11.60783437593089, + "grad_norm": 0.03173828125, + "learning_rate": 0.013412064271634461, + "loss": 0.8024, + "num_input_tokens_seen": 45245824, + "step": 77935 + }, + { + "epoch": 11.60857908847185, + "grad_norm": 0.04248046875, + "learning_rate": 0.013410125587688454, + "loss": 0.7902, + "num_input_tokens_seen": 45248864, + "step": 77940 + }, + { + "epoch": 11.60932380101281, + "grad_norm": 0.02734375, + "learning_rate": 0.013408186930601767, + "loss": 0.7907, + "num_input_tokens_seen": 45251584, + "step": 77945 + }, + { + "epoch": 11.610068513553768, + "grad_norm": 0.036865234375, + "learning_rate": 0.01340624830040714, + "loss": 0.7759, + "num_input_tokens_seen": 45254624, + "step": 77950 + }, + { + "epoch": 11.610813226094727, + "grad_norm": 0.046875, + "learning_rate": 0.01340430969713733, + "loss": 0.7925, + "num_input_tokens_seen": 45257600, + "step": 77955 + }, + { + "epoch": 11.611557938635686, + "grad_norm": 0.0361328125, + "learning_rate": 0.01340237112082508, + "loss": 0.7965, + "num_input_tokens_seen": 45260320, + "step": 77960 + }, + { + "epoch": 11.612302651176647, + "grad_norm": 0.08349609375, + "learning_rate": 0.013400432571503152, + "loss": 0.8293, + "num_input_tokens_seen": 45263136, + "step": 77965 + }, + { + "epoch": 11.613047363717605, + "grad_norm": 0.0205078125, + "learning_rate": 0.013398494049204294, + "loss": 0.7929, + "num_input_tokens_seen": 45266112, + "step": 77970 + }, + { + "epoch": 11.613792076258564, + "grad_norm": 0.032958984375, + "learning_rate": 0.01339655555396124, + "loss": 0.7871, + "num_input_tokens_seen": 45269248, + "step": 77975 + }, + { + "epoch": 11.614536788799523, + "grad_norm": 0.038330078125, + "learning_rate": 0.013394617085806759, + "loss": 0.7932, + "num_input_tokens_seen": 45272224, + "step": 77980 + }, + { + "epoch": 11.615281501340483, + "grad_norm": 0.030517578125, + "learning_rate": 0.013392678644773587, + "loss": 0.7842, + "num_input_tokens_seen": 45275040, + "step": 77985 + }, + { + "epoch": 11.616026213881442, + "grad_norm": 0.040771484375, + "learning_rate": 0.013390740230894479, + "loss": 0.7883, + "num_input_tokens_seen": 45277920, + "step": 77990 + }, + { + "epoch": 11.6167709264224, + "grad_norm": 0.039794921875, + "learning_rate": 0.013388801844202174, + "loss": 0.7823, + "num_input_tokens_seen": 45280960, + "step": 77995 + }, + { + "epoch": 11.61751563896336, + "grad_norm": 0.057861328125, + "learning_rate": 0.013386863484729427, + "loss": 0.8119, + "num_input_tokens_seen": 45283936, + "step": 78000 + }, + { + "epoch": 11.61826035150432, + "grad_norm": 0.042724609375, + "learning_rate": 0.013384925152508978, + "loss": 0.8119, + "num_input_tokens_seen": 45286752, + "step": 78005 + }, + { + "epoch": 11.619005064045279, + "grad_norm": 0.0242919921875, + "learning_rate": 0.013382986847573585, + "loss": 0.7945, + "num_input_tokens_seen": 45289856, + "step": 78010 + }, + { + "epoch": 11.619749776586238, + "grad_norm": 0.038330078125, + "learning_rate": 0.013381048569955983, + "loss": 0.7869, + "num_input_tokens_seen": 45292704, + "step": 78015 + }, + { + "epoch": 11.620494489127196, + "grad_norm": 0.03662109375, + "learning_rate": 0.013379110319688919, + "loss": 0.8146, + "num_input_tokens_seen": 45295616, + "step": 78020 + }, + { + "epoch": 11.621239201668157, + "grad_norm": 0.027587890625, + "learning_rate": 0.013377172096805142, + "loss": 0.8289, + "num_input_tokens_seen": 45298560, + "step": 78025 + }, + { + "epoch": 11.621983914209116, + "grad_norm": 0.0498046875, + "learning_rate": 0.013375233901337383, + "loss": 0.809, + "num_input_tokens_seen": 45301472, + "step": 78030 + }, + { + "epoch": 11.622728626750074, + "grad_norm": 0.027587890625, + "learning_rate": 0.013373295733318406, + "loss": 0.8003, + "num_input_tokens_seen": 45304224, + "step": 78035 + }, + { + "epoch": 11.623473339291033, + "grad_norm": 0.035888671875, + "learning_rate": 0.013371357592780938, + "loss": 0.795, + "num_input_tokens_seen": 45307296, + "step": 78040 + }, + { + "epoch": 11.624218051831992, + "grad_norm": 0.05224609375, + "learning_rate": 0.013369419479757732, + "loss": 0.798, + "num_input_tokens_seen": 45310176, + "step": 78045 + }, + { + "epoch": 11.624962764372953, + "grad_norm": 0.046875, + "learning_rate": 0.013367481394281525, + "loss": 0.8188, + "num_input_tokens_seen": 45313280, + "step": 78050 + }, + { + "epoch": 11.625707476913911, + "grad_norm": 0.033935546875, + "learning_rate": 0.013365543336385063, + "loss": 0.7791, + "num_input_tokens_seen": 45316224, + "step": 78055 + }, + { + "epoch": 11.62645218945487, + "grad_norm": 0.046630859375, + "learning_rate": 0.013363605306101088, + "loss": 0.8007, + "num_input_tokens_seen": 45319040, + "step": 78060 + }, + { + "epoch": 11.627196901995829, + "grad_norm": 0.050048828125, + "learning_rate": 0.013361667303462329, + "loss": 0.8292, + "num_input_tokens_seen": 45321696, + "step": 78065 + }, + { + "epoch": 11.62794161453679, + "grad_norm": 0.033935546875, + "learning_rate": 0.013359729328501543, + "loss": 0.817, + "num_input_tokens_seen": 45324608, + "step": 78070 + }, + { + "epoch": 11.628686327077748, + "grad_norm": 0.0703125, + "learning_rate": 0.01335779138125146, + "loss": 0.7798, + "num_input_tokens_seen": 45327904, + "step": 78075 + }, + { + "epoch": 11.629431039618707, + "grad_norm": 0.03564453125, + "learning_rate": 0.01335585346174482, + "loss": 0.7879, + "num_input_tokens_seen": 45330720, + "step": 78080 + }, + { + "epoch": 11.630175752159666, + "grad_norm": 0.0439453125, + "learning_rate": 0.013353915570014368, + "loss": 0.7866, + "num_input_tokens_seen": 45333568, + "step": 78085 + }, + { + "epoch": 11.630920464700626, + "grad_norm": 0.04052734375, + "learning_rate": 0.013351977706092841, + "loss": 0.7888, + "num_input_tokens_seen": 45336256, + "step": 78090 + }, + { + "epoch": 11.631665177241585, + "grad_norm": 0.0303955078125, + "learning_rate": 0.013350039870012976, + "loss": 0.807, + "num_input_tokens_seen": 45338976, + "step": 78095 + }, + { + "epoch": 11.632409889782544, + "grad_norm": 0.054931640625, + "learning_rate": 0.013348102061807504, + "loss": 0.7819, + "num_input_tokens_seen": 45341728, + "step": 78100 + }, + { + "epoch": 11.633154602323502, + "grad_norm": 0.0458984375, + "learning_rate": 0.013346164281509174, + "loss": 0.7962, + "num_input_tokens_seen": 45344704, + "step": 78105 + }, + { + "epoch": 11.633899314864463, + "grad_norm": 0.0400390625, + "learning_rate": 0.013344226529150719, + "loss": 0.8053, + "num_input_tokens_seen": 45347648, + "step": 78110 + }, + { + "epoch": 11.634644027405422, + "grad_norm": 0.045166015625, + "learning_rate": 0.013342288804764872, + "loss": 0.8104, + "num_input_tokens_seen": 45350688, + "step": 78115 + }, + { + "epoch": 11.63538873994638, + "grad_norm": 0.0289306640625, + "learning_rate": 0.013340351108384366, + "loss": 0.8163, + "num_input_tokens_seen": 45353632, + "step": 78120 + }, + { + "epoch": 11.63613345248734, + "grad_norm": 0.037841796875, + "learning_rate": 0.013338413440041948, + "loss": 0.8333, + "num_input_tokens_seen": 45356320, + "step": 78125 + }, + { + "epoch": 11.6368781650283, + "grad_norm": 0.030029296875, + "learning_rate": 0.013336475799770339, + "loss": 0.7926, + "num_input_tokens_seen": 45358880, + "step": 78130 + }, + { + "epoch": 11.637622877569259, + "grad_norm": 0.041259765625, + "learning_rate": 0.013334538187602288, + "loss": 0.7877, + "num_input_tokens_seen": 45361760, + "step": 78135 + }, + { + "epoch": 11.638367590110217, + "grad_norm": 0.02783203125, + "learning_rate": 0.013332600603570522, + "loss": 0.7779, + "num_input_tokens_seen": 45364544, + "step": 78140 + }, + { + "epoch": 11.639112302651176, + "grad_norm": 0.030517578125, + "learning_rate": 0.01333066304770777, + "loss": 0.7942, + "num_input_tokens_seen": 45367616, + "step": 78145 + }, + { + "epoch": 11.639857015192137, + "grad_norm": 0.05322265625, + "learning_rate": 0.013328725520046773, + "loss": 0.8097, + "num_input_tokens_seen": 45370400, + "step": 78150 + }, + { + "epoch": 11.640601727733095, + "grad_norm": 0.021728515625, + "learning_rate": 0.013326788020620253, + "loss": 0.7815, + "num_input_tokens_seen": 45373152, + "step": 78155 + }, + { + "epoch": 11.641346440274054, + "grad_norm": 0.05615234375, + "learning_rate": 0.013324850549460955, + "loss": 0.7966, + "num_input_tokens_seen": 45376064, + "step": 78160 + }, + { + "epoch": 11.642091152815013, + "grad_norm": 0.059326171875, + "learning_rate": 0.013322913106601598, + "loss": 0.8283, + "num_input_tokens_seen": 45379072, + "step": 78165 + }, + { + "epoch": 11.642835865355973, + "grad_norm": 0.052001953125, + "learning_rate": 0.013320975692074926, + "loss": 0.7795, + "num_input_tokens_seen": 45382016, + "step": 78170 + }, + { + "epoch": 11.643580577896932, + "grad_norm": 0.04833984375, + "learning_rate": 0.013319038305913663, + "loss": 0.8195, + "num_input_tokens_seen": 45384960, + "step": 78175 + }, + { + "epoch": 11.64432529043789, + "grad_norm": 0.062255859375, + "learning_rate": 0.013317100948150537, + "loss": 0.7956, + "num_input_tokens_seen": 45387744, + "step": 78180 + }, + { + "epoch": 11.64507000297885, + "grad_norm": 0.0301513671875, + "learning_rate": 0.013315163618818283, + "loss": 0.7769, + "num_input_tokens_seen": 45390496, + "step": 78185 + }, + { + "epoch": 11.64581471551981, + "grad_norm": 0.049560546875, + "learning_rate": 0.013313226317949619, + "loss": 0.827, + "num_input_tokens_seen": 45393568, + "step": 78190 + }, + { + "epoch": 11.646559428060769, + "grad_norm": 0.0390625, + "learning_rate": 0.01331128904557729, + "loss": 0.7874, + "num_input_tokens_seen": 45396480, + "step": 78195 + }, + { + "epoch": 11.647304140601728, + "grad_norm": 0.036376953125, + "learning_rate": 0.01330935180173401, + "loss": 0.7985, + "num_input_tokens_seen": 45399360, + "step": 78200 + }, + { + "epoch": 11.648048853142686, + "grad_norm": 0.04638671875, + "learning_rate": 0.01330741458645252, + "loss": 0.8044, + "num_input_tokens_seen": 45402464, + "step": 78205 + }, + { + "epoch": 11.648793565683647, + "grad_norm": 0.0274658203125, + "learning_rate": 0.013305477399765536, + "loss": 0.7936, + "num_input_tokens_seen": 45405472, + "step": 78210 + }, + { + "epoch": 11.649538278224606, + "grad_norm": 0.05224609375, + "learning_rate": 0.013303540241705792, + "loss": 0.7898, + "num_input_tokens_seen": 45408416, + "step": 78215 + }, + { + "epoch": 11.650282990765565, + "grad_norm": 0.050048828125, + "learning_rate": 0.013301603112306014, + "loss": 0.8021, + "num_input_tokens_seen": 45411328, + "step": 78220 + }, + { + "epoch": 11.651027703306523, + "grad_norm": 0.0267333984375, + "learning_rate": 0.013299666011598914, + "loss": 0.7808, + "num_input_tokens_seen": 45414240, + "step": 78225 + }, + { + "epoch": 11.651772415847482, + "grad_norm": 0.033203125, + "learning_rate": 0.01329772893961724, + "loss": 0.8229, + "num_input_tokens_seen": 45417568, + "step": 78230 + }, + { + "epoch": 11.652517128388443, + "grad_norm": 0.04541015625, + "learning_rate": 0.013295791896393697, + "loss": 0.7963, + "num_input_tokens_seen": 45420384, + "step": 78235 + }, + { + "epoch": 11.653261840929401, + "grad_norm": 0.03271484375, + "learning_rate": 0.013293854881961024, + "loss": 0.819, + "num_input_tokens_seen": 45423328, + "step": 78240 + }, + { + "epoch": 11.65400655347036, + "grad_norm": 0.032470703125, + "learning_rate": 0.013291917896351936, + "loss": 0.7957, + "num_input_tokens_seen": 45426336, + "step": 78245 + }, + { + "epoch": 11.654751266011319, + "grad_norm": 0.0208740234375, + "learning_rate": 0.01328998093959916, + "loss": 0.8036, + "num_input_tokens_seen": 45429216, + "step": 78250 + }, + { + "epoch": 11.65549597855228, + "grad_norm": 0.042724609375, + "learning_rate": 0.01328804401173542, + "loss": 0.7823, + "num_input_tokens_seen": 45432096, + "step": 78255 + }, + { + "epoch": 11.656240691093238, + "grad_norm": 0.04248046875, + "learning_rate": 0.013286107112793431, + "loss": 0.7936, + "num_input_tokens_seen": 45435072, + "step": 78260 + }, + { + "epoch": 11.656985403634197, + "grad_norm": 0.042724609375, + "learning_rate": 0.013284170242805926, + "loss": 0.8139, + "num_input_tokens_seen": 45437632, + "step": 78265 + }, + { + "epoch": 11.657730116175156, + "grad_norm": 0.04638671875, + "learning_rate": 0.013282233401805616, + "loss": 0.7875, + "num_input_tokens_seen": 45440736, + "step": 78270 + }, + { + "epoch": 11.658474828716116, + "grad_norm": 0.04443359375, + "learning_rate": 0.01328029658982523, + "loss": 0.7999, + "num_input_tokens_seen": 45443744, + "step": 78275 + }, + { + "epoch": 11.659219541257075, + "grad_norm": 0.041015625, + "learning_rate": 0.013278359806897485, + "loss": 0.8032, + "num_input_tokens_seen": 45446560, + "step": 78280 + }, + { + "epoch": 11.659964253798034, + "grad_norm": 0.04638671875, + "learning_rate": 0.013276423053055103, + "loss": 0.7938, + "num_input_tokens_seen": 45449344, + "step": 78285 + }, + { + "epoch": 11.660708966338992, + "grad_norm": 0.038330078125, + "learning_rate": 0.013274486328330795, + "loss": 0.804, + "num_input_tokens_seen": 45452544, + "step": 78290 + }, + { + "epoch": 11.661453678879953, + "grad_norm": 0.04638671875, + "learning_rate": 0.013272549632757294, + "loss": 0.7858, + "num_input_tokens_seen": 45455328, + "step": 78295 + }, + { + "epoch": 11.662198391420912, + "grad_norm": 0.04736328125, + "learning_rate": 0.01327061296636731, + "loss": 0.7978, + "num_input_tokens_seen": 45458752, + "step": 78300 + }, + { + "epoch": 11.66294310396187, + "grad_norm": 0.028564453125, + "learning_rate": 0.013268676329193556, + "loss": 0.7922, + "num_input_tokens_seen": 45461440, + "step": 78305 + }, + { + "epoch": 11.66368781650283, + "grad_norm": 0.0245361328125, + "learning_rate": 0.013266739721268765, + "loss": 0.8141, + "num_input_tokens_seen": 45464448, + "step": 78310 + }, + { + "epoch": 11.66443252904379, + "grad_norm": 0.030029296875, + "learning_rate": 0.013264803142625639, + "loss": 0.7908, + "num_input_tokens_seen": 45467072, + "step": 78315 + }, + { + "epoch": 11.665177241584749, + "grad_norm": 0.027587890625, + "learning_rate": 0.013262866593296903, + "loss": 0.7831, + "num_input_tokens_seen": 45469984, + "step": 78320 + }, + { + "epoch": 11.665921954125707, + "grad_norm": 0.048828125, + "learning_rate": 0.013260930073315266, + "loss": 0.8294, + "num_input_tokens_seen": 45472800, + "step": 78325 + }, + { + "epoch": 11.666666666666666, + "grad_norm": 0.04296875, + "learning_rate": 0.013258993582713452, + "loss": 0.8087, + "num_input_tokens_seen": 45475584, + "step": 78330 + }, + { + "epoch": 11.667411379207627, + "grad_norm": 0.049072265625, + "learning_rate": 0.013257057121524174, + "loss": 0.7959, + "num_input_tokens_seen": 45478656, + "step": 78335 + }, + { + "epoch": 11.668156091748585, + "grad_norm": 0.0458984375, + "learning_rate": 0.013255120689780136, + "loss": 0.7948, + "num_input_tokens_seen": 45481664, + "step": 78340 + }, + { + "epoch": 11.668900804289544, + "grad_norm": 0.040771484375, + "learning_rate": 0.01325318428751407, + "loss": 0.8027, + "num_input_tokens_seen": 45484864, + "step": 78345 + }, + { + "epoch": 11.669645516830503, + "grad_norm": 0.0294189453125, + "learning_rate": 0.013251247914758674, + "loss": 0.8324, + "num_input_tokens_seen": 45487776, + "step": 78350 + }, + { + "epoch": 11.670390229371463, + "grad_norm": 0.049072265625, + "learning_rate": 0.013249311571546673, + "loss": 0.7887, + "num_input_tokens_seen": 45490624, + "step": 78355 + }, + { + "epoch": 11.671134941912422, + "grad_norm": 0.046875, + "learning_rate": 0.013247375257910764, + "loss": 0.7869, + "num_input_tokens_seen": 45493312, + "step": 78360 + }, + { + "epoch": 11.671879654453381, + "grad_norm": 0.03173828125, + "learning_rate": 0.013245438973883677, + "loss": 0.7916, + "num_input_tokens_seen": 45496640, + "step": 78365 + }, + { + "epoch": 11.67262436699434, + "grad_norm": 0.02685546875, + "learning_rate": 0.01324350271949811, + "loss": 0.8086, + "num_input_tokens_seen": 45499648, + "step": 78370 + }, + { + "epoch": 11.673369079535298, + "grad_norm": 0.051513671875, + "learning_rate": 0.013241566494786784, + "loss": 0.7908, + "num_input_tokens_seen": 45502400, + "step": 78375 + }, + { + "epoch": 11.674113792076259, + "grad_norm": 0.05126953125, + "learning_rate": 0.013239630299782406, + "loss": 0.8086, + "num_input_tokens_seen": 45505216, + "step": 78380 + }, + { + "epoch": 11.674858504617218, + "grad_norm": 0.028564453125, + "learning_rate": 0.01323769413451768, + "loss": 0.8047, + "num_input_tokens_seen": 45508160, + "step": 78385 + }, + { + "epoch": 11.675603217158177, + "grad_norm": 0.037841796875, + "learning_rate": 0.013235757999025327, + "loss": 0.79, + "num_input_tokens_seen": 45510912, + "step": 78390 + }, + { + "epoch": 11.676347929699137, + "grad_norm": 0.0303955078125, + "learning_rate": 0.013233821893338039, + "loss": 0.7972, + "num_input_tokens_seen": 45513696, + "step": 78395 + }, + { + "epoch": 11.677092642240096, + "grad_norm": 0.029052734375, + "learning_rate": 0.013231885817488543, + "loss": 0.7569, + "num_input_tokens_seen": 45516480, + "step": 78400 + }, + { + "epoch": 11.677837354781055, + "grad_norm": 0.031494140625, + "learning_rate": 0.013229949771509538, + "loss": 0.795, + "num_input_tokens_seen": 45519168, + "step": 78405 + }, + { + "epoch": 11.678582067322013, + "grad_norm": 0.042236328125, + "learning_rate": 0.01322801375543373, + "loss": 0.8064, + "num_input_tokens_seen": 45522080, + "step": 78410 + }, + { + "epoch": 11.679326779862972, + "grad_norm": 0.0439453125, + "learning_rate": 0.013226077769293833, + "loss": 0.7904, + "num_input_tokens_seen": 45524768, + "step": 78415 + }, + { + "epoch": 11.680071492403933, + "grad_norm": 0.041015625, + "learning_rate": 0.013224141813122545, + "loss": 0.803, + "num_input_tokens_seen": 45527456, + "step": 78420 + }, + { + "epoch": 11.680816204944891, + "grad_norm": 0.035888671875, + "learning_rate": 0.01322220588695258, + "loss": 0.8036, + "num_input_tokens_seen": 45530368, + "step": 78425 + }, + { + "epoch": 11.68156091748585, + "grad_norm": 0.044677734375, + "learning_rate": 0.013220269990816633, + "loss": 0.7714, + "num_input_tokens_seen": 45533536, + "step": 78430 + }, + { + "epoch": 11.682305630026809, + "grad_norm": 0.02880859375, + "learning_rate": 0.013218334124747422, + "loss": 0.7926, + "num_input_tokens_seen": 45536224, + "step": 78435 + }, + { + "epoch": 11.68305034256777, + "grad_norm": 0.044677734375, + "learning_rate": 0.013216398288777643, + "loss": 0.7889, + "num_input_tokens_seen": 45539168, + "step": 78440 + }, + { + "epoch": 11.683795055108728, + "grad_norm": 0.027099609375, + "learning_rate": 0.013214462482940004, + "loss": 0.7908, + "num_input_tokens_seen": 45542048, + "step": 78445 + }, + { + "epoch": 11.684539767649687, + "grad_norm": 0.049072265625, + "learning_rate": 0.0132125267072672, + "loss": 0.7928, + "num_input_tokens_seen": 45545056, + "step": 78450 + }, + { + "epoch": 11.685284480190646, + "grad_norm": 0.04833984375, + "learning_rate": 0.01321059096179195, + "loss": 0.7984, + "num_input_tokens_seen": 45547712, + "step": 78455 + }, + { + "epoch": 11.686029192731606, + "grad_norm": 0.024658203125, + "learning_rate": 0.013208655246546944, + "loss": 0.8108, + "num_input_tokens_seen": 45550432, + "step": 78460 + }, + { + "epoch": 11.686773905272565, + "grad_norm": 0.037353515625, + "learning_rate": 0.013206719561564882, + "loss": 0.8152, + "num_input_tokens_seen": 45553568, + "step": 78465 + }, + { + "epoch": 11.687518617813524, + "grad_norm": 0.05078125, + "learning_rate": 0.013204783906878476, + "loss": 0.846, + "num_input_tokens_seen": 45556288, + "step": 78470 + }, + { + "epoch": 11.688263330354483, + "grad_norm": 0.031494140625, + "learning_rate": 0.013202848282520418, + "loss": 0.7921, + "num_input_tokens_seen": 45559328, + "step": 78475 + }, + { + "epoch": 11.689008042895443, + "grad_norm": 0.041015625, + "learning_rate": 0.013200912688523416, + "loss": 0.7912, + "num_input_tokens_seen": 45561984, + "step": 78480 + }, + { + "epoch": 11.689752755436402, + "grad_norm": 0.040771484375, + "learning_rate": 0.013198977124920158, + "loss": 0.7915, + "num_input_tokens_seen": 45565088, + "step": 78485 + }, + { + "epoch": 11.69049746797736, + "grad_norm": 0.033447265625, + "learning_rate": 0.013197041591743357, + "loss": 0.7743, + "num_input_tokens_seen": 45568096, + "step": 78490 + }, + { + "epoch": 11.69124218051832, + "grad_norm": 0.02734375, + "learning_rate": 0.013195106089025709, + "loss": 0.8052, + "num_input_tokens_seen": 45570880, + "step": 78495 + }, + { + "epoch": 11.69198689305928, + "grad_norm": 0.05029296875, + "learning_rate": 0.013193170616799898, + "loss": 0.7988, + "num_input_tokens_seen": 45573824, + "step": 78500 + }, + { + "epoch": 11.692731605600239, + "grad_norm": 0.04248046875, + "learning_rate": 0.013191235175098645, + "loss": 0.7914, + "num_input_tokens_seen": 45576480, + "step": 78505 + }, + { + "epoch": 11.693476318141197, + "grad_norm": 0.03955078125, + "learning_rate": 0.013189299763954629, + "loss": 0.7821, + "num_input_tokens_seen": 45579680, + "step": 78510 + }, + { + "epoch": 11.694221030682156, + "grad_norm": 0.0419921875, + "learning_rate": 0.013187364383400556, + "loss": 0.8113, + "num_input_tokens_seen": 45582592, + "step": 78515 + }, + { + "epoch": 11.694965743223117, + "grad_norm": 0.0286865234375, + "learning_rate": 0.013185429033469113, + "loss": 0.8063, + "num_input_tokens_seen": 45585472, + "step": 78520 + }, + { + "epoch": 11.695710455764075, + "grad_norm": 0.038330078125, + "learning_rate": 0.013183493714193008, + "loss": 0.8033, + "num_input_tokens_seen": 45588512, + "step": 78525 + }, + { + "epoch": 11.696455168305034, + "grad_norm": 0.040283203125, + "learning_rate": 0.013181558425604926, + "loss": 0.8239, + "num_input_tokens_seen": 45591488, + "step": 78530 + }, + { + "epoch": 11.697199880845993, + "grad_norm": 0.033447265625, + "learning_rate": 0.013179623167737572, + "loss": 0.8234, + "num_input_tokens_seen": 45594368, + "step": 78535 + }, + { + "epoch": 11.697944593386953, + "grad_norm": 0.02197265625, + "learning_rate": 0.013177687940623635, + "loss": 0.7902, + "num_input_tokens_seen": 45596992, + "step": 78540 + }, + { + "epoch": 11.698689305927912, + "grad_norm": 0.0289306640625, + "learning_rate": 0.013175752744295803, + "loss": 0.7906, + "num_input_tokens_seen": 45599616, + "step": 78545 + }, + { + "epoch": 11.699434018468871, + "grad_norm": 0.0322265625, + "learning_rate": 0.01317381757878678, + "loss": 0.7914, + "num_input_tokens_seen": 45602464, + "step": 78550 + }, + { + "epoch": 11.70017873100983, + "grad_norm": 0.03271484375, + "learning_rate": 0.013171882444129245, + "loss": 0.7839, + "num_input_tokens_seen": 45605696, + "step": 78555 + }, + { + "epoch": 11.700923443550789, + "grad_norm": 0.03759765625, + "learning_rate": 0.013169947340355904, + "loss": 0.8321, + "num_input_tokens_seen": 45608480, + "step": 78560 + }, + { + "epoch": 11.701668156091749, + "grad_norm": 0.033935546875, + "learning_rate": 0.013168012267499437, + "loss": 0.8425, + "num_input_tokens_seen": 45611232, + "step": 78565 + }, + { + "epoch": 11.702412868632708, + "grad_norm": 0.03857421875, + "learning_rate": 0.013166077225592548, + "loss": 0.8005, + "num_input_tokens_seen": 45614208, + "step": 78570 + }, + { + "epoch": 11.703157581173667, + "grad_norm": 0.0380859375, + "learning_rate": 0.01316414221466792, + "loss": 0.7885, + "num_input_tokens_seen": 45617280, + "step": 78575 + }, + { + "epoch": 11.703902293714627, + "grad_norm": 0.028076171875, + "learning_rate": 0.01316220723475824, + "loss": 0.8037, + "num_input_tokens_seen": 45620384, + "step": 78580 + }, + { + "epoch": 11.704647006255586, + "grad_norm": 0.0299072265625, + "learning_rate": 0.013160272285896205, + "loss": 0.7935, + "num_input_tokens_seen": 45623648, + "step": 78585 + }, + { + "epoch": 11.705391718796545, + "grad_norm": 0.031494140625, + "learning_rate": 0.013158337368114491, + "loss": 0.836, + "num_input_tokens_seen": 45626720, + "step": 78590 + }, + { + "epoch": 11.706136431337503, + "grad_norm": 0.03759765625, + "learning_rate": 0.013156402481445804, + "loss": 0.8049, + "num_input_tokens_seen": 45629728, + "step": 78595 + }, + { + "epoch": 11.706881143878462, + "grad_norm": 0.030029296875, + "learning_rate": 0.013154467625922815, + "loss": 0.7725, + "num_input_tokens_seen": 45632256, + "step": 78600 + }, + { + "epoch": 11.707625856419423, + "grad_norm": 0.033447265625, + "learning_rate": 0.013152532801578229, + "loss": 0.7932, + "num_input_tokens_seen": 45635136, + "step": 78605 + }, + { + "epoch": 11.708370568960381, + "grad_norm": 0.040283203125, + "learning_rate": 0.013150598008444718, + "loss": 0.7976, + "num_input_tokens_seen": 45637888, + "step": 78610 + }, + { + "epoch": 11.70911528150134, + "grad_norm": 0.038330078125, + "learning_rate": 0.013148663246554977, + "loss": 0.7994, + "num_input_tokens_seen": 45640928, + "step": 78615 + }, + { + "epoch": 11.709859994042299, + "grad_norm": 0.033203125, + "learning_rate": 0.013146728515941692, + "loss": 0.7986, + "num_input_tokens_seen": 45643904, + "step": 78620 + }, + { + "epoch": 11.71060470658326, + "grad_norm": 0.041748046875, + "learning_rate": 0.013144793816637535, + "loss": 0.828, + "num_input_tokens_seen": 45646976, + "step": 78625 + }, + { + "epoch": 11.711349419124218, + "grad_norm": 0.025390625, + "learning_rate": 0.013142859148675209, + "loss": 0.7844, + "num_input_tokens_seen": 45649760, + "step": 78630 + }, + { + "epoch": 11.712094131665177, + "grad_norm": 0.04052734375, + "learning_rate": 0.013140924512087383, + "loss": 0.7906, + "num_input_tokens_seen": 45652704, + "step": 78635 + }, + { + "epoch": 11.712838844206136, + "grad_norm": 0.037841796875, + "learning_rate": 0.013138989906906757, + "loss": 0.7874, + "num_input_tokens_seen": 45655520, + "step": 78640 + }, + { + "epoch": 11.713583556747096, + "grad_norm": 0.041259765625, + "learning_rate": 0.013137055333165998, + "loss": 0.7833, + "num_input_tokens_seen": 45658432, + "step": 78645 + }, + { + "epoch": 11.714328269288055, + "grad_norm": 0.05908203125, + "learning_rate": 0.013135120790897802, + "loss": 0.806, + "num_input_tokens_seen": 45661216, + "step": 78650 + }, + { + "epoch": 11.715072981829014, + "grad_norm": 0.0301513671875, + "learning_rate": 0.013133186280134835, + "loss": 0.8185, + "num_input_tokens_seen": 45663904, + "step": 78655 + }, + { + "epoch": 11.715817694369973, + "grad_norm": 0.032470703125, + "learning_rate": 0.013131251800909799, + "loss": 0.7925, + "num_input_tokens_seen": 45667008, + "step": 78660 + }, + { + "epoch": 11.716562406910933, + "grad_norm": 0.018798828125, + "learning_rate": 0.013129317353255362, + "loss": 0.8125, + "num_input_tokens_seen": 45670080, + "step": 78665 + }, + { + "epoch": 11.717307119451892, + "grad_norm": 0.03125, + "learning_rate": 0.013127382937204203, + "loss": 0.7913, + "num_input_tokens_seen": 45672992, + "step": 78670 + }, + { + "epoch": 11.71805183199285, + "grad_norm": 0.024169921875, + "learning_rate": 0.01312544855278901, + "loss": 0.8175, + "num_input_tokens_seen": 45675936, + "step": 78675 + }, + { + "epoch": 11.71879654453381, + "grad_norm": 0.037841796875, + "learning_rate": 0.013123514200042454, + "loss": 0.8035, + "num_input_tokens_seen": 45678880, + "step": 78680 + }, + { + "epoch": 11.71954125707477, + "grad_norm": 0.037353515625, + "learning_rate": 0.013121579878997225, + "loss": 0.7838, + "num_input_tokens_seen": 45681696, + "step": 78685 + }, + { + "epoch": 11.720285969615729, + "grad_norm": 0.0255126953125, + "learning_rate": 0.013119645589685987, + "loss": 0.7903, + "num_input_tokens_seen": 45684256, + "step": 78690 + }, + { + "epoch": 11.721030682156687, + "grad_norm": 0.0361328125, + "learning_rate": 0.013117711332141431, + "loss": 0.792, + "num_input_tokens_seen": 45687232, + "step": 78695 + }, + { + "epoch": 11.721775394697646, + "grad_norm": 0.023193359375, + "learning_rate": 0.01311577710639623, + "loss": 0.805, + "num_input_tokens_seen": 45690496, + "step": 78700 + }, + { + "epoch": 11.722520107238607, + "grad_norm": 0.045166015625, + "learning_rate": 0.013113842912483055, + "loss": 0.7878, + "num_input_tokens_seen": 45693280, + "step": 78705 + }, + { + "epoch": 11.723264819779565, + "grad_norm": 0.030517578125, + "learning_rate": 0.01311190875043459, + "loss": 0.7949, + "num_input_tokens_seen": 45696064, + "step": 78710 + }, + { + "epoch": 11.724009532320524, + "grad_norm": 0.033447265625, + "learning_rate": 0.013109974620283509, + "loss": 0.807, + "num_input_tokens_seen": 45698816, + "step": 78715 + }, + { + "epoch": 11.724754244861483, + "grad_norm": 0.034423828125, + "learning_rate": 0.013108040522062485, + "loss": 0.7805, + "num_input_tokens_seen": 45701696, + "step": 78720 + }, + { + "epoch": 11.725498957402444, + "grad_norm": 0.037353515625, + "learning_rate": 0.013106106455804189, + "loss": 0.8074, + "num_input_tokens_seen": 45704512, + "step": 78725 + }, + { + "epoch": 11.726243669943402, + "grad_norm": 0.03466796875, + "learning_rate": 0.013104172421541304, + "loss": 0.8121, + "num_input_tokens_seen": 45707328, + "step": 78730 + }, + { + "epoch": 11.726988382484361, + "grad_norm": 0.02587890625, + "learning_rate": 0.013102238419306498, + "loss": 0.8008, + "num_input_tokens_seen": 45710080, + "step": 78735 + }, + { + "epoch": 11.72773309502532, + "grad_norm": 0.033447265625, + "learning_rate": 0.013100304449132443, + "loss": 0.8126, + "num_input_tokens_seen": 45712896, + "step": 78740 + }, + { + "epoch": 11.728477807566279, + "grad_norm": 0.035400390625, + "learning_rate": 0.013098370511051819, + "loss": 0.7886, + "num_input_tokens_seen": 45715616, + "step": 78745 + }, + { + "epoch": 11.729222520107239, + "grad_norm": 0.033935546875, + "learning_rate": 0.01309643660509729, + "loss": 0.786, + "num_input_tokens_seen": 45718848, + "step": 78750 + }, + { + "epoch": 11.729967232648198, + "grad_norm": 0.03369140625, + "learning_rate": 0.013094502731301529, + "loss": 0.797, + "num_input_tokens_seen": 45721888, + "step": 78755 + }, + { + "epoch": 11.730711945189157, + "grad_norm": 0.026611328125, + "learning_rate": 0.013092568889697205, + "loss": 0.8259, + "num_input_tokens_seen": 45724832, + "step": 78760 + }, + { + "epoch": 11.731456657730115, + "grad_norm": 0.043212890625, + "learning_rate": 0.013090635080316996, + "loss": 0.7761, + "num_input_tokens_seen": 45728000, + "step": 78765 + }, + { + "epoch": 11.732201370271076, + "grad_norm": 0.035400390625, + "learning_rate": 0.013088701303193565, + "loss": 0.7747, + "num_input_tokens_seen": 45730784, + "step": 78770 + }, + { + "epoch": 11.732946082812035, + "grad_norm": 0.0390625, + "learning_rate": 0.013086767558359583, + "loss": 0.8021, + "num_input_tokens_seen": 45733568, + "step": 78775 + }, + { + "epoch": 11.733690795352993, + "grad_norm": 0.0220947265625, + "learning_rate": 0.013084833845847717, + "loss": 0.8094, + "num_input_tokens_seen": 45736416, + "step": 78780 + }, + { + "epoch": 11.734435507893952, + "grad_norm": 0.0308837890625, + "learning_rate": 0.013082900165690638, + "loss": 0.8057, + "num_input_tokens_seen": 45739104, + "step": 78785 + }, + { + "epoch": 11.735180220434913, + "grad_norm": 0.0194091796875, + "learning_rate": 0.013080966517921013, + "loss": 0.7924, + "num_input_tokens_seen": 45741984, + "step": 78790 + }, + { + "epoch": 11.735924932975871, + "grad_norm": 0.039794921875, + "learning_rate": 0.013079032902571503, + "loss": 0.7947, + "num_input_tokens_seen": 45744608, + "step": 78795 + }, + { + "epoch": 11.73666964551683, + "grad_norm": 0.035400390625, + "learning_rate": 0.013077099319674788, + "loss": 0.7964, + "num_input_tokens_seen": 45747328, + "step": 78800 + }, + { + "epoch": 11.737414358057789, + "grad_norm": 0.0400390625, + "learning_rate": 0.013075165769263519, + "loss": 0.7841, + "num_input_tokens_seen": 45750400, + "step": 78805 + }, + { + "epoch": 11.73815907059875, + "grad_norm": 0.037353515625, + "learning_rate": 0.013073232251370375, + "loss": 0.7742, + "num_input_tokens_seen": 45753152, + "step": 78810 + }, + { + "epoch": 11.738903783139708, + "grad_norm": 0.0301513671875, + "learning_rate": 0.013071298766028005, + "loss": 0.8226, + "num_input_tokens_seen": 45756320, + "step": 78815 + }, + { + "epoch": 11.739648495680667, + "grad_norm": 0.028564453125, + "learning_rate": 0.013069365313269087, + "loss": 0.8079, + "num_input_tokens_seen": 45759488, + "step": 78820 + }, + { + "epoch": 11.740393208221626, + "grad_norm": 0.05078125, + "learning_rate": 0.013067431893126284, + "loss": 0.8016, + "num_input_tokens_seen": 45762208, + "step": 78825 + }, + { + "epoch": 11.741137920762586, + "grad_norm": 0.04443359375, + "learning_rate": 0.013065498505632244, + "loss": 0.7841, + "num_input_tokens_seen": 45764896, + "step": 78830 + }, + { + "epoch": 11.741882633303545, + "grad_norm": 0.0291748046875, + "learning_rate": 0.013063565150819651, + "loss": 0.7998, + "num_input_tokens_seen": 45767712, + "step": 78835 + }, + { + "epoch": 11.742627345844504, + "grad_norm": 0.0380859375, + "learning_rate": 0.013061631828721151, + "loss": 0.7917, + "num_input_tokens_seen": 45770880, + "step": 78840 + }, + { + "epoch": 11.743372058385463, + "grad_norm": 0.0400390625, + "learning_rate": 0.013059698539369415, + "loss": 0.7941, + "num_input_tokens_seen": 45773856, + "step": 78845 + }, + { + "epoch": 11.744116770926423, + "grad_norm": 0.025390625, + "learning_rate": 0.013057765282797095, + "loss": 0.822, + "num_input_tokens_seen": 45776576, + "step": 78850 + }, + { + "epoch": 11.744861483467382, + "grad_norm": 0.038818359375, + "learning_rate": 0.01305583205903686, + "loss": 0.7815, + "num_input_tokens_seen": 45779392, + "step": 78855 + }, + { + "epoch": 11.74560619600834, + "grad_norm": 0.049072265625, + "learning_rate": 0.01305389886812137, + "loss": 0.7873, + "num_input_tokens_seen": 45782368, + "step": 78860 + }, + { + "epoch": 11.7463509085493, + "grad_norm": 0.042724609375, + "learning_rate": 0.013051965710083272, + "loss": 0.802, + "num_input_tokens_seen": 45785056, + "step": 78865 + }, + { + "epoch": 11.74709562109026, + "grad_norm": 0.04736328125, + "learning_rate": 0.01305003258495524, + "loss": 0.7848, + "num_input_tokens_seen": 45787904, + "step": 78870 + }, + { + "epoch": 11.747840333631219, + "grad_norm": 0.0556640625, + "learning_rate": 0.013048099492769924, + "loss": 0.8048, + "num_input_tokens_seen": 45790624, + "step": 78875 + }, + { + "epoch": 11.748585046172177, + "grad_norm": 0.0220947265625, + "learning_rate": 0.013046166433559986, + "loss": 0.8027, + "num_input_tokens_seen": 45793824, + "step": 78880 + }, + { + "epoch": 11.749329758713136, + "grad_norm": 0.0537109375, + "learning_rate": 0.013044233407358071, + "loss": 0.806, + "num_input_tokens_seen": 45796736, + "step": 78885 + }, + { + "epoch": 11.750074471254095, + "grad_norm": 0.17578125, + "learning_rate": 0.013042300414196854, + "loss": 0.7948, + "num_input_tokens_seen": 45799712, + "step": 78890 + }, + { + "epoch": 11.750819183795056, + "grad_norm": 0.035888671875, + "learning_rate": 0.013040367454108975, + "loss": 0.7804, + "num_input_tokens_seen": 45802528, + "step": 78895 + }, + { + "epoch": 11.751563896336014, + "grad_norm": 0.047119140625, + "learning_rate": 0.013038434527127102, + "loss": 0.8149, + "num_input_tokens_seen": 45805632, + "step": 78900 + }, + { + "epoch": 11.752308608876973, + "grad_norm": 0.037353515625, + "learning_rate": 0.013036501633283884, + "loss": 0.7885, + "num_input_tokens_seen": 45808256, + "step": 78905 + }, + { + "epoch": 11.753053321417934, + "grad_norm": 0.0341796875, + "learning_rate": 0.013034568772611973, + "loss": 0.7808, + "num_input_tokens_seen": 45811168, + "step": 78910 + }, + { + "epoch": 11.753798033958892, + "grad_norm": 0.025634765625, + "learning_rate": 0.013032635945144028, + "loss": 0.7926, + "num_input_tokens_seen": 45814048, + "step": 78915 + }, + { + "epoch": 11.754542746499851, + "grad_norm": 0.035888671875, + "learning_rate": 0.013030703150912691, + "loss": 0.7947, + "num_input_tokens_seen": 45816960, + "step": 78920 + }, + { + "epoch": 11.75528745904081, + "grad_norm": 0.041259765625, + "learning_rate": 0.013028770389950631, + "loss": 0.7917, + "num_input_tokens_seen": 45819680, + "step": 78925 + }, + { + "epoch": 11.756032171581769, + "grad_norm": 0.0419921875, + "learning_rate": 0.013026837662290486, + "loss": 0.8007, + "num_input_tokens_seen": 45823008, + "step": 78930 + }, + { + "epoch": 11.75677688412273, + "grad_norm": 0.038330078125, + "learning_rate": 0.013024904967964919, + "loss": 0.7963, + "num_input_tokens_seen": 45825952, + "step": 78935 + }, + { + "epoch": 11.757521596663688, + "grad_norm": 0.0186767578125, + "learning_rate": 0.013022972307006574, + "loss": 0.7866, + "num_input_tokens_seen": 45828672, + "step": 78940 + }, + { + "epoch": 11.758266309204647, + "grad_norm": 0.0634765625, + "learning_rate": 0.013021039679448102, + "loss": 0.8025, + "num_input_tokens_seen": 45831808, + "step": 78945 + }, + { + "epoch": 11.759011021745605, + "grad_norm": 0.041015625, + "learning_rate": 0.013019107085322154, + "loss": 0.8075, + "num_input_tokens_seen": 45834656, + "step": 78950 + }, + { + "epoch": 11.759755734286566, + "grad_norm": 0.038818359375, + "learning_rate": 0.013017174524661373, + "loss": 0.8123, + "num_input_tokens_seen": 45837376, + "step": 78955 + }, + { + "epoch": 11.760500446827525, + "grad_norm": 0.0286865234375, + "learning_rate": 0.01301524199749842, + "loss": 0.7924, + "num_input_tokens_seen": 45840416, + "step": 78960 + }, + { + "epoch": 11.761245159368483, + "grad_norm": 0.0615234375, + "learning_rate": 0.013013309503865929, + "loss": 0.7933, + "num_input_tokens_seen": 45843424, + "step": 78965 + }, + { + "epoch": 11.761989871909442, + "grad_norm": 0.04248046875, + "learning_rate": 0.01301137704379656, + "loss": 0.7901, + "num_input_tokens_seen": 45846336, + "step": 78970 + }, + { + "epoch": 11.762734584450403, + "grad_norm": 0.0208740234375, + "learning_rate": 0.013009444617322953, + "loss": 0.8134, + "num_input_tokens_seen": 45849024, + "step": 78975 + }, + { + "epoch": 11.763479296991362, + "grad_norm": 0.048095703125, + "learning_rate": 0.01300751222447776, + "loss": 0.7986, + "num_input_tokens_seen": 45851616, + "step": 78980 + }, + { + "epoch": 11.76422400953232, + "grad_norm": 0.0205078125, + "learning_rate": 0.013005579865293621, + "loss": 0.8081, + "num_input_tokens_seen": 45854592, + "step": 78985 + }, + { + "epoch": 11.764968722073279, + "grad_norm": 0.040771484375, + "learning_rate": 0.013003647539803178, + "loss": 0.7937, + "num_input_tokens_seen": 45857696, + "step": 78990 + }, + { + "epoch": 11.76571343461424, + "grad_norm": 0.031982421875, + "learning_rate": 0.013001715248039086, + "loss": 0.8076, + "num_input_tokens_seen": 45860640, + "step": 78995 + }, + { + "epoch": 11.766458147155198, + "grad_norm": 0.043701171875, + "learning_rate": 0.01299978299003398, + "loss": 0.8009, + "num_input_tokens_seen": 45863872, + "step": 79000 + }, + { + "epoch": 11.767202859696157, + "grad_norm": 0.04345703125, + "learning_rate": 0.012997850765820511, + "loss": 0.7638, + "num_input_tokens_seen": 45866624, + "step": 79005 + }, + { + "epoch": 11.767947572237116, + "grad_norm": 0.02978515625, + "learning_rate": 0.012995918575431316, + "loss": 0.7603, + "num_input_tokens_seen": 45869344, + "step": 79010 + }, + { + "epoch": 11.768692284778076, + "grad_norm": 0.06884765625, + "learning_rate": 0.012993986418899043, + "loss": 0.8413, + "num_input_tokens_seen": 45872256, + "step": 79015 + }, + { + "epoch": 11.769436997319035, + "grad_norm": 0.040283203125, + "learning_rate": 0.012992054296256331, + "loss": 0.794, + "num_input_tokens_seen": 45875008, + "step": 79020 + }, + { + "epoch": 11.770181709859994, + "grad_norm": 0.03662109375, + "learning_rate": 0.012990122207535815, + "loss": 0.8026, + "num_input_tokens_seen": 45877760, + "step": 79025 + }, + { + "epoch": 11.770926422400953, + "grad_norm": 0.04150390625, + "learning_rate": 0.012988190152770147, + "loss": 0.8085, + "num_input_tokens_seen": 45880672, + "step": 79030 + }, + { + "epoch": 11.771671134941913, + "grad_norm": 0.041259765625, + "learning_rate": 0.012986258131991956, + "loss": 0.8182, + "num_input_tokens_seen": 45883648, + "step": 79035 + }, + { + "epoch": 11.772415847482872, + "grad_norm": 0.044677734375, + "learning_rate": 0.012984326145233895, + "loss": 0.801, + "num_input_tokens_seen": 45886272, + "step": 79040 + }, + { + "epoch": 11.77316056002383, + "grad_norm": 0.033935546875, + "learning_rate": 0.012982394192528592, + "loss": 0.7782, + "num_input_tokens_seen": 45889184, + "step": 79045 + }, + { + "epoch": 11.77390527256479, + "grad_norm": 0.04638671875, + "learning_rate": 0.012980462273908687, + "loss": 0.7861, + "num_input_tokens_seen": 45892384, + "step": 79050 + }, + { + "epoch": 11.77464998510575, + "grad_norm": 0.044921875, + "learning_rate": 0.012978530389406816, + "loss": 0.7906, + "num_input_tokens_seen": 45895200, + "step": 79055 + }, + { + "epoch": 11.775394697646709, + "grad_norm": 0.039794921875, + "learning_rate": 0.012976598539055627, + "loss": 0.7978, + "num_input_tokens_seen": 45898240, + "step": 79060 + }, + { + "epoch": 11.776139410187668, + "grad_norm": 0.05224609375, + "learning_rate": 0.012974666722887749, + "loss": 0.7993, + "num_input_tokens_seen": 45900992, + "step": 79065 + }, + { + "epoch": 11.776884122728626, + "grad_norm": 0.0361328125, + "learning_rate": 0.01297273494093581, + "loss": 0.8013, + "num_input_tokens_seen": 45904064, + "step": 79070 + }, + { + "epoch": 11.777628835269585, + "grad_norm": 0.033447265625, + "learning_rate": 0.01297080319323246, + "loss": 0.801, + "num_input_tokens_seen": 45906784, + "step": 79075 + }, + { + "epoch": 11.778373547810546, + "grad_norm": 0.0284423828125, + "learning_rate": 0.012968871479810327, + "loss": 0.8035, + "num_input_tokens_seen": 45909632, + "step": 79080 + }, + { + "epoch": 11.779118260351504, + "grad_norm": 0.032958984375, + "learning_rate": 0.012966939800702047, + "loss": 0.7891, + "num_input_tokens_seen": 45912576, + "step": 79085 + }, + { + "epoch": 11.779862972892463, + "grad_norm": 0.03515625, + "learning_rate": 0.012965008155940247, + "loss": 0.8194, + "num_input_tokens_seen": 45915520, + "step": 79090 + }, + { + "epoch": 11.780607685433424, + "grad_norm": 0.038818359375, + "learning_rate": 0.012963076545557574, + "loss": 0.7826, + "num_input_tokens_seen": 45918752, + "step": 79095 + }, + { + "epoch": 11.781352397974382, + "grad_norm": 0.030029296875, + "learning_rate": 0.01296114496958665, + "loss": 0.7937, + "num_input_tokens_seen": 45921600, + "step": 79100 + }, + { + "epoch": 11.782097110515341, + "grad_norm": 0.0311279296875, + "learning_rate": 0.012959213428060107, + "loss": 0.8043, + "num_input_tokens_seen": 45924416, + "step": 79105 + }, + { + "epoch": 11.7828418230563, + "grad_norm": 0.043212890625, + "learning_rate": 0.01295728192101058, + "loss": 0.8131, + "num_input_tokens_seen": 45927520, + "step": 79110 + }, + { + "epoch": 11.783586535597259, + "grad_norm": 0.039794921875, + "learning_rate": 0.012955350448470697, + "loss": 0.7635, + "num_input_tokens_seen": 45930464, + "step": 79115 + }, + { + "epoch": 11.78433124813822, + "grad_norm": 0.044189453125, + "learning_rate": 0.012953419010473095, + "loss": 0.7949, + "num_input_tokens_seen": 45933248, + "step": 79120 + }, + { + "epoch": 11.785075960679178, + "grad_norm": 0.03369140625, + "learning_rate": 0.012951487607050392, + "loss": 0.8095, + "num_input_tokens_seen": 45936320, + "step": 79125 + }, + { + "epoch": 11.785820673220137, + "grad_norm": 0.0537109375, + "learning_rate": 0.012949556238235231, + "loss": 0.7861, + "num_input_tokens_seen": 45938944, + "step": 79130 + }, + { + "epoch": 11.786565385761095, + "grad_norm": 0.0196533203125, + "learning_rate": 0.012947624904060229, + "loss": 0.7949, + "num_input_tokens_seen": 45941824, + "step": 79135 + }, + { + "epoch": 11.787310098302056, + "grad_norm": 0.031005859375, + "learning_rate": 0.012945693604558021, + "loss": 0.7883, + "num_input_tokens_seen": 45944736, + "step": 79140 + }, + { + "epoch": 11.788054810843015, + "grad_norm": 0.0240478515625, + "learning_rate": 0.012943762339761229, + "loss": 0.7977, + "num_input_tokens_seen": 45947520, + "step": 79145 + }, + { + "epoch": 11.788799523383974, + "grad_norm": 0.0260009765625, + "learning_rate": 0.012941831109702482, + "loss": 0.7892, + "num_input_tokens_seen": 45950208, + "step": 79150 + }, + { + "epoch": 11.789544235924932, + "grad_norm": 0.0213623046875, + "learning_rate": 0.01293989991441441, + "loss": 0.7992, + "num_input_tokens_seen": 45953120, + "step": 79155 + }, + { + "epoch": 11.790288948465893, + "grad_norm": 0.038818359375, + "learning_rate": 0.01293796875392963, + "loss": 0.7965, + "num_input_tokens_seen": 45956032, + "step": 79160 + }, + { + "epoch": 11.791033661006852, + "grad_norm": 0.043212890625, + "learning_rate": 0.012936037628280779, + "loss": 0.7857, + "num_input_tokens_seen": 45959136, + "step": 79165 + }, + { + "epoch": 11.79177837354781, + "grad_norm": 0.037353515625, + "learning_rate": 0.012934106537500469, + "loss": 0.7965, + "num_input_tokens_seen": 45962080, + "step": 79170 + }, + { + "epoch": 11.792523086088769, + "grad_norm": 0.0306396484375, + "learning_rate": 0.012932175481621335, + "loss": 0.8251, + "num_input_tokens_seen": 45965088, + "step": 79175 + }, + { + "epoch": 11.79326779862973, + "grad_norm": 0.0294189453125, + "learning_rate": 0.012930244460675993, + "loss": 0.8031, + "num_input_tokens_seen": 45967936, + "step": 79180 + }, + { + "epoch": 11.794012511170688, + "grad_norm": 0.027587890625, + "learning_rate": 0.01292831347469706, + "loss": 0.7916, + "num_input_tokens_seen": 45971072, + "step": 79185 + }, + { + "epoch": 11.794757223711647, + "grad_norm": 0.0264892578125, + "learning_rate": 0.012926382523717174, + "loss": 0.8167, + "num_input_tokens_seen": 45974176, + "step": 79190 + }, + { + "epoch": 11.795501936252606, + "grad_norm": 0.03125, + "learning_rate": 0.01292445160776894, + "loss": 0.8145, + "num_input_tokens_seen": 45976832, + "step": 79195 + }, + { + "epoch": 11.796246648793566, + "grad_norm": 0.0220947265625, + "learning_rate": 0.012922520726884991, + "loss": 0.804, + "num_input_tokens_seen": 45979616, + "step": 79200 + }, + { + "epoch": 11.796991361334525, + "grad_norm": 0.031982421875, + "learning_rate": 0.012920589881097943, + "loss": 0.8049, + "num_input_tokens_seen": 45982656, + "step": 79205 + }, + { + "epoch": 11.797736073875484, + "grad_norm": 0.033447265625, + "learning_rate": 0.012918659070440416, + "loss": 0.8097, + "num_input_tokens_seen": 45985568, + "step": 79210 + }, + { + "epoch": 11.798480786416443, + "grad_norm": 0.040283203125, + "learning_rate": 0.012916728294945024, + "loss": 0.808, + "num_input_tokens_seen": 45988544, + "step": 79215 + }, + { + "epoch": 11.799225498957403, + "grad_norm": 0.0274658203125, + "learning_rate": 0.012914797554644397, + "loss": 0.8089, + "num_input_tokens_seen": 45991232, + "step": 79220 + }, + { + "epoch": 11.799970211498362, + "grad_norm": 0.035888671875, + "learning_rate": 0.012912866849571145, + "loss": 0.7965, + "num_input_tokens_seen": 45994400, + "step": 79225 + }, + { + "epoch": 11.80071492403932, + "grad_norm": 0.019287109375, + "learning_rate": 0.01291093617975788, + "loss": 0.8442, + "num_input_tokens_seen": 45997408, + "step": 79230 + }, + { + "epoch": 11.80145963658028, + "grad_norm": 0.03076171875, + "learning_rate": 0.012909005545237232, + "loss": 0.7869, + "num_input_tokens_seen": 46000480, + "step": 79235 + }, + { + "epoch": 11.80220434912124, + "grad_norm": 0.0302734375, + "learning_rate": 0.012907074946041807, + "loss": 0.8018, + "num_input_tokens_seen": 46003360, + "step": 79240 + }, + { + "epoch": 11.802949061662199, + "grad_norm": 0.0291748046875, + "learning_rate": 0.012905144382204227, + "loss": 0.8012, + "num_input_tokens_seen": 46006240, + "step": 79245 + }, + { + "epoch": 11.803693774203158, + "grad_norm": 0.03857421875, + "learning_rate": 0.012903213853757093, + "loss": 0.8261, + "num_input_tokens_seen": 46009088, + "step": 79250 + }, + { + "epoch": 11.804438486744116, + "grad_norm": 0.03955078125, + "learning_rate": 0.012901283360733042, + "loss": 0.8005, + "num_input_tokens_seen": 46012032, + "step": 79255 + }, + { + "epoch": 11.805183199285075, + "grad_norm": 0.04736328125, + "learning_rate": 0.012899352903164666, + "loss": 0.7842, + "num_input_tokens_seen": 46015008, + "step": 79260 + }, + { + "epoch": 11.805927911826036, + "grad_norm": 0.031005859375, + "learning_rate": 0.012897422481084594, + "loss": 0.7751, + "num_input_tokens_seen": 46017632, + "step": 79265 + }, + { + "epoch": 11.806672624366994, + "grad_norm": 0.045654296875, + "learning_rate": 0.012895492094525432, + "loss": 0.7903, + "num_input_tokens_seen": 46020384, + "step": 79270 + }, + { + "epoch": 11.807417336907953, + "grad_norm": 0.06201171875, + "learning_rate": 0.012893561743519792, + "loss": 0.8181, + "num_input_tokens_seen": 46023424, + "step": 79275 + }, + { + "epoch": 11.808162049448912, + "grad_norm": 0.024658203125, + "learning_rate": 0.012891631428100287, + "loss": 0.8107, + "num_input_tokens_seen": 46026208, + "step": 79280 + }, + { + "epoch": 11.808906761989872, + "grad_norm": 0.0311279296875, + "learning_rate": 0.012889701148299518, + "loss": 0.7928, + "num_input_tokens_seen": 46028768, + "step": 79285 + }, + { + "epoch": 11.809651474530831, + "grad_norm": 0.033447265625, + "learning_rate": 0.012887770904150112, + "loss": 0.7919, + "num_input_tokens_seen": 46031456, + "step": 79290 + }, + { + "epoch": 11.81039618707179, + "grad_norm": 0.03955078125, + "learning_rate": 0.012885840695684663, + "loss": 0.7984, + "num_input_tokens_seen": 46034304, + "step": 79295 + }, + { + "epoch": 11.811140899612749, + "grad_norm": 0.0211181640625, + "learning_rate": 0.012883910522935794, + "loss": 0.7952, + "num_input_tokens_seen": 46037408, + "step": 79300 + }, + { + "epoch": 11.81188561215371, + "grad_norm": 0.032958984375, + "learning_rate": 0.012881980385936104, + "loss": 0.7915, + "num_input_tokens_seen": 46040512, + "step": 79305 + }, + { + "epoch": 11.812630324694668, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0128800502847182, + "loss": 0.7993, + "num_input_tokens_seen": 46043616, + "step": 79310 + }, + { + "epoch": 11.813375037235627, + "grad_norm": 0.044921875, + "learning_rate": 0.012878120219314695, + "loss": 0.8197, + "num_input_tokens_seen": 46046432, + "step": 79315 + }, + { + "epoch": 11.814119749776586, + "grad_norm": 0.0283203125, + "learning_rate": 0.012876190189758186, + "loss": 0.7905, + "num_input_tokens_seen": 46049248, + "step": 79320 + }, + { + "epoch": 11.814864462317546, + "grad_norm": 0.043701171875, + "learning_rate": 0.012874260196081292, + "loss": 0.8001, + "num_input_tokens_seen": 46052160, + "step": 79325 + }, + { + "epoch": 11.815609174858505, + "grad_norm": 0.0311279296875, + "learning_rate": 0.012872330238316606, + "loss": 0.8151, + "num_input_tokens_seen": 46055264, + "step": 79330 + }, + { + "epoch": 11.816353887399464, + "grad_norm": 0.050537109375, + "learning_rate": 0.012870400316496743, + "loss": 0.797, + "num_input_tokens_seen": 46057952, + "step": 79335 + }, + { + "epoch": 11.817098599940422, + "grad_norm": 0.0341796875, + "learning_rate": 0.012868470430654299, + "loss": 0.8049, + "num_input_tokens_seen": 46061056, + "step": 79340 + }, + { + "epoch": 11.817843312481383, + "grad_norm": 0.0439453125, + "learning_rate": 0.012866540580821885, + "loss": 0.8038, + "num_input_tokens_seen": 46064000, + "step": 79345 + }, + { + "epoch": 11.818588025022342, + "grad_norm": 0.04296875, + "learning_rate": 0.012864610767032098, + "loss": 0.8134, + "num_input_tokens_seen": 46066816, + "step": 79350 + }, + { + "epoch": 11.8193327375633, + "grad_norm": 0.04736328125, + "learning_rate": 0.012862680989317537, + "loss": 0.7796, + "num_input_tokens_seen": 46069728, + "step": 79355 + }, + { + "epoch": 11.82007745010426, + "grad_norm": 0.0322265625, + "learning_rate": 0.012860751247710814, + "loss": 0.8025, + "num_input_tokens_seen": 46072288, + "step": 79360 + }, + { + "epoch": 11.82082216264522, + "grad_norm": 0.02197265625, + "learning_rate": 0.012858821542244518, + "loss": 0.7982, + "num_input_tokens_seen": 46075232, + "step": 79365 + }, + { + "epoch": 11.821566875186178, + "grad_norm": 0.0283203125, + "learning_rate": 0.01285689187295126, + "loss": 0.797, + "num_input_tokens_seen": 46078112, + "step": 79370 + }, + { + "epoch": 11.822311587727137, + "grad_norm": 0.02099609375, + "learning_rate": 0.012854962239863637, + "loss": 0.7929, + "num_input_tokens_seen": 46081376, + "step": 79375 + }, + { + "epoch": 11.823056300268096, + "grad_norm": 0.027099609375, + "learning_rate": 0.012853032643014248, + "loss": 0.7738, + "num_input_tokens_seen": 46084192, + "step": 79380 + }, + { + "epoch": 11.823801012809056, + "grad_norm": 0.0284423828125, + "learning_rate": 0.012851103082435691, + "loss": 0.7977, + "num_input_tokens_seen": 46086816, + "step": 79385 + }, + { + "epoch": 11.824545725350015, + "grad_norm": 0.029052734375, + "learning_rate": 0.012849173558160554, + "loss": 0.7974, + "num_input_tokens_seen": 46089504, + "step": 79390 + }, + { + "epoch": 11.825290437890974, + "grad_norm": 0.044921875, + "learning_rate": 0.012847244070221454, + "loss": 0.7949, + "num_input_tokens_seen": 46092512, + "step": 79395 + }, + { + "epoch": 11.826035150431933, + "grad_norm": 0.031494140625, + "learning_rate": 0.012845314618650969, + "loss": 0.8048, + "num_input_tokens_seen": 46095744, + "step": 79400 + }, + { + "epoch": 11.826779862972892, + "grad_norm": 0.0380859375, + "learning_rate": 0.01284338520348171, + "loss": 0.8258, + "num_input_tokens_seen": 46098720, + "step": 79405 + }, + { + "epoch": 11.827524575513852, + "grad_norm": 0.033935546875, + "learning_rate": 0.012841455824746262, + "loss": 0.7911, + "num_input_tokens_seen": 46101536, + "step": 79410 + }, + { + "epoch": 11.82826928805481, + "grad_norm": 0.0341796875, + "learning_rate": 0.01283952648247723, + "loss": 0.7982, + "num_input_tokens_seen": 46104608, + "step": 79415 + }, + { + "epoch": 11.82901400059577, + "grad_norm": 0.050048828125, + "learning_rate": 0.012837597176707193, + "loss": 0.7966, + "num_input_tokens_seen": 46107584, + "step": 79420 + }, + { + "epoch": 11.82975871313673, + "grad_norm": 0.03662109375, + "learning_rate": 0.012835667907468764, + "loss": 0.7942, + "num_input_tokens_seen": 46110592, + "step": 79425 + }, + { + "epoch": 11.830503425677689, + "grad_norm": 0.0289306640625, + "learning_rate": 0.012833738674794523, + "loss": 0.8084, + "num_input_tokens_seen": 46113600, + "step": 79430 + }, + { + "epoch": 11.831248138218648, + "grad_norm": 0.035400390625, + "learning_rate": 0.012831809478717062, + "loss": 0.795, + "num_input_tokens_seen": 46116512, + "step": 79435 + }, + { + "epoch": 11.831992850759606, + "grad_norm": 0.037109375, + "learning_rate": 0.012829880319268978, + "loss": 0.7734, + "num_input_tokens_seen": 46119328, + "step": 79440 + }, + { + "epoch": 11.832737563300565, + "grad_norm": 0.02978515625, + "learning_rate": 0.012827951196482858, + "loss": 0.7854, + "num_input_tokens_seen": 46122144, + "step": 79445 + }, + { + "epoch": 11.833482275841526, + "grad_norm": 0.046875, + "learning_rate": 0.0128260221103913, + "loss": 0.8038, + "num_input_tokens_seen": 46125216, + "step": 79450 + }, + { + "epoch": 11.834226988382484, + "grad_norm": 0.04248046875, + "learning_rate": 0.012824093061026881, + "loss": 0.8016, + "num_input_tokens_seen": 46128032, + "step": 79455 + }, + { + "epoch": 11.834971700923443, + "grad_norm": 0.022216796875, + "learning_rate": 0.012822164048422208, + "loss": 0.7994, + "num_input_tokens_seen": 46130816, + "step": 79460 + }, + { + "epoch": 11.835716413464402, + "grad_norm": 0.0220947265625, + "learning_rate": 0.012820235072609857, + "loss": 0.806, + "num_input_tokens_seen": 46133952, + "step": 79465 + }, + { + "epoch": 11.836461126005362, + "grad_norm": 0.037109375, + "learning_rate": 0.012818306133622417, + "loss": 0.7993, + "num_input_tokens_seen": 46136864, + "step": 79470 + }, + { + "epoch": 11.837205838546321, + "grad_norm": 0.0296630859375, + "learning_rate": 0.012816377231492477, + "loss": 0.8009, + "num_input_tokens_seen": 46139808, + "step": 79475 + }, + { + "epoch": 11.83795055108728, + "grad_norm": 0.0322265625, + "learning_rate": 0.012814448366252625, + "loss": 0.7913, + "num_input_tokens_seen": 46142848, + "step": 79480 + }, + { + "epoch": 11.838695263628239, + "grad_norm": 0.034423828125, + "learning_rate": 0.01281251953793545, + "loss": 0.788, + "num_input_tokens_seen": 46145344, + "step": 79485 + }, + { + "epoch": 11.8394399761692, + "grad_norm": 0.048828125, + "learning_rate": 0.012810590746573527, + "loss": 0.7961, + "num_input_tokens_seen": 46148480, + "step": 79490 + }, + { + "epoch": 11.840184688710158, + "grad_norm": 0.06689453125, + "learning_rate": 0.012808661992199452, + "loss": 0.7979, + "num_input_tokens_seen": 46151136, + "step": 79495 + }, + { + "epoch": 11.840929401251117, + "grad_norm": 0.03515625, + "learning_rate": 0.012806733274845807, + "loss": 0.8059, + "num_input_tokens_seen": 46154080, + "step": 79500 + }, + { + "epoch": 11.841674113792076, + "grad_norm": 0.031982421875, + "learning_rate": 0.012804804594545175, + "loss": 0.7915, + "num_input_tokens_seen": 46156928, + "step": 79505 + }, + { + "epoch": 11.842418826333036, + "grad_norm": 0.07763671875, + "learning_rate": 0.012802875951330138, + "loss": 0.8232, + "num_input_tokens_seen": 46159904, + "step": 79510 + }, + { + "epoch": 11.843163538873995, + "grad_norm": 0.0308837890625, + "learning_rate": 0.012800947345233273, + "loss": 0.7926, + "num_input_tokens_seen": 46162528, + "step": 79515 + }, + { + "epoch": 11.843908251414954, + "grad_norm": 0.0269775390625, + "learning_rate": 0.012799018776287174, + "loss": 0.8095, + "num_input_tokens_seen": 46165408, + "step": 79520 + }, + { + "epoch": 11.844652963955912, + "grad_norm": 0.036376953125, + "learning_rate": 0.012797090244524406, + "loss": 0.7952, + "num_input_tokens_seen": 46168384, + "step": 79525 + }, + { + "epoch": 11.845397676496873, + "grad_norm": 0.02197265625, + "learning_rate": 0.01279516174997757, + "loss": 0.808, + "num_input_tokens_seen": 46171552, + "step": 79530 + }, + { + "epoch": 11.846142389037832, + "grad_norm": 0.037841796875, + "learning_rate": 0.012793233292679231, + "loss": 0.7935, + "num_input_tokens_seen": 46174496, + "step": 79535 + }, + { + "epoch": 11.84688710157879, + "grad_norm": 0.0361328125, + "learning_rate": 0.012791304872661977, + "loss": 0.8387, + "num_input_tokens_seen": 46177472, + "step": 79540 + }, + { + "epoch": 11.84763181411975, + "grad_norm": 0.0439453125, + "learning_rate": 0.012789376489958381, + "loss": 0.7871, + "num_input_tokens_seen": 46180192, + "step": 79545 + }, + { + "epoch": 11.84837652666071, + "grad_norm": 0.0272216796875, + "learning_rate": 0.012787448144601015, + "loss": 0.8085, + "num_input_tokens_seen": 46183104, + "step": 79550 + }, + { + "epoch": 11.849121239201668, + "grad_norm": 0.02783203125, + "learning_rate": 0.012785519836622473, + "loss": 0.7943, + "num_input_tokens_seen": 46185984, + "step": 79555 + }, + { + "epoch": 11.849865951742627, + "grad_norm": 0.039794921875, + "learning_rate": 0.012783591566055316, + "loss": 0.7849, + "num_input_tokens_seen": 46189120, + "step": 79560 + }, + { + "epoch": 11.850610664283586, + "grad_norm": 0.041259765625, + "learning_rate": 0.01278166333293213, + "loss": 0.8008, + "num_input_tokens_seen": 46191936, + "step": 79565 + }, + { + "epoch": 11.851355376824547, + "grad_norm": 0.034423828125, + "learning_rate": 0.012779735137285487, + "loss": 0.7871, + "num_input_tokens_seen": 46194592, + "step": 79570 + }, + { + "epoch": 11.852100089365505, + "grad_norm": 0.033935546875, + "learning_rate": 0.012777806979147965, + "loss": 0.7951, + "num_input_tokens_seen": 46197376, + "step": 79575 + }, + { + "epoch": 11.852844801906464, + "grad_norm": 0.039306640625, + "learning_rate": 0.012775878858552128, + "loss": 0.8014, + "num_input_tokens_seen": 46200384, + "step": 79580 + }, + { + "epoch": 11.853589514447423, + "grad_norm": 0.04345703125, + "learning_rate": 0.012773950775530564, + "loss": 0.7903, + "num_input_tokens_seen": 46203136, + "step": 79585 + }, + { + "epoch": 11.854334226988382, + "grad_norm": 0.041259765625, + "learning_rate": 0.012772022730115842, + "loss": 0.8197, + "num_input_tokens_seen": 46206016, + "step": 79590 + }, + { + "epoch": 11.855078939529342, + "grad_norm": 0.046142578125, + "learning_rate": 0.012770094722340522, + "loss": 0.789, + "num_input_tokens_seen": 46208736, + "step": 79595 + }, + { + "epoch": 11.8558236520703, + "grad_norm": 0.0242919921875, + "learning_rate": 0.012768166752237192, + "loss": 0.8009, + "num_input_tokens_seen": 46211808, + "step": 79600 + }, + { + "epoch": 11.85656836461126, + "grad_norm": 0.044189453125, + "learning_rate": 0.012766238819838413, + "loss": 0.7925, + "num_input_tokens_seen": 46214848, + "step": 79605 + }, + { + "epoch": 11.85731307715222, + "grad_norm": 0.0269775390625, + "learning_rate": 0.012764310925176763, + "loss": 0.8012, + "num_input_tokens_seen": 46217664, + "step": 79610 + }, + { + "epoch": 11.858057789693179, + "grad_norm": 0.040283203125, + "learning_rate": 0.0127623830682848, + "loss": 0.815, + "num_input_tokens_seen": 46220736, + "step": 79615 + }, + { + "epoch": 11.858802502234138, + "grad_norm": 0.048583984375, + "learning_rate": 0.01276045524919511, + "loss": 0.776, + "num_input_tokens_seen": 46223648, + "step": 79620 + }, + { + "epoch": 11.859547214775096, + "grad_norm": 0.0291748046875, + "learning_rate": 0.01275852746794025, + "loss": 0.7907, + "num_input_tokens_seen": 46226560, + "step": 79625 + }, + { + "epoch": 11.860291927316055, + "grad_norm": 0.034423828125, + "learning_rate": 0.012756599724552784, + "loss": 0.8005, + "num_input_tokens_seen": 46229568, + "step": 79630 + }, + { + "epoch": 11.861036639857016, + "grad_norm": 0.048583984375, + "learning_rate": 0.012754672019065293, + "loss": 0.8086, + "num_input_tokens_seen": 46232416, + "step": 79635 + }, + { + "epoch": 11.861781352397974, + "grad_norm": 0.0390625, + "learning_rate": 0.012752744351510333, + "loss": 0.8028, + "num_input_tokens_seen": 46235392, + "step": 79640 + }, + { + "epoch": 11.862526064938933, + "grad_norm": 0.03466796875, + "learning_rate": 0.012750816721920475, + "loss": 0.8064, + "num_input_tokens_seen": 46238400, + "step": 79645 + }, + { + "epoch": 11.863270777479892, + "grad_norm": 0.0306396484375, + "learning_rate": 0.012748889130328277, + "loss": 0.7904, + "num_input_tokens_seen": 46241408, + "step": 79650 + }, + { + "epoch": 11.864015490020853, + "grad_norm": 0.0296630859375, + "learning_rate": 0.012746961576766316, + "loss": 0.7776, + "num_input_tokens_seen": 46244320, + "step": 79655 + }, + { + "epoch": 11.864760202561811, + "grad_norm": 0.03955078125, + "learning_rate": 0.012745034061267141, + "loss": 0.8091, + "num_input_tokens_seen": 46247008, + "step": 79660 + }, + { + "epoch": 11.86550491510277, + "grad_norm": 0.0419921875, + "learning_rate": 0.012743106583863333, + "loss": 0.792, + "num_input_tokens_seen": 46250112, + "step": 79665 + }, + { + "epoch": 11.866249627643729, + "grad_norm": 0.03466796875, + "learning_rate": 0.012741179144587443, + "loss": 0.7864, + "num_input_tokens_seen": 46253088, + "step": 79670 + }, + { + "epoch": 11.86699434018469, + "grad_norm": 0.0286865234375, + "learning_rate": 0.012739251743472034, + "loss": 0.8022, + "num_input_tokens_seen": 46256128, + "step": 79675 + }, + { + "epoch": 11.867739052725648, + "grad_norm": 0.0230712890625, + "learning_rate": 0.012737324380549671, + "loss": 0.7987, + "num_input_tokens_seen": 46258880, + "step": 79680 + }, + { + "epoch": 11.868483765266607, + "grad_norm": 0.031494140625, + "learning_rate": 0.012735397055852908, + "loss": 0.8083, + "num_input_tokens_seen": 46262112, + "step": 79685 + }, + { + "epoch": 11.869228477807566, + "grad_norm": 0.036376953125, + "learning_rate": 0.012733469769414314, + "loss": 0.8083, + "num_input_tokens_seen": 46265120, + "step": 79690 + }, + { + "epoch": 11.869973190348526, + "grad_norm": 0.031982421875, + "learning_rate": 0.01273154252126644, + "loss": 0.8007, + "num_input_tokens_seen": 46267712, + "step": 79695 + }, + { + "epoch": 11.870717902889485, + "grad_norm": 0.04296875, + "learning_rate": 0.012729615311441858, + "loss": 0.7931, + "num_input_tokens_seen": 46270560, + "step": 79700 + }, + { + "epoch": 11.871462615430444, + "grad_norm": 0.028076171875, + "learning_rate": 0.012727688139973114, + "loss": 0.7954, + "num_input_tokens_seen": 46273632, + "step": 79705 + }, + { + "epoch": 11.872207327971402, + "grad_norm": 0.04052734375, + "learning_rate": 0.01272576100689277, + "loss": 0.7732, + "num_input_tokens_seen": 46276416, + "step": 79710 + }, + { + "epoch": 11.872952040512363, + "grad_norm": 0.032958984375, + "learning_rate": 0.012723833912233383, + "loss": 0.7955, + "num_input_tokens_seen": 46279200, + "step": 79715 + }, + { + "epoch": 11.873696753053322, + "grad_norm": 0.03515625, + "learning_rate": 0.012721906856027502, + "loss": 0.7909, + "num_input_tokens_seen": 46282016, + "step": 79720 + }, + { + "epoch": 11.87444146559428, + "grad_norm": 0.04150390625, + "learning_rate": 0.012719979838307695, + "loss": 0.7973, + "num_input_tokens_seen": 46285024, + "step": 79725 + }, + { + "epoch": 11.87518617813524, + "grad_norm": 0.03662109375, + "learning_rate": 0.012718052859106509, + "loss": 0.8004, + "num_input_tokens_seen": 46287872, + "step": 79730 + }, + { + "epoch": 11.8759308906762, + "grad_norm": 0.042724609375, + "learning_rate": 0.012716125918456504, + "loss": 0.817, + "num_input_tokens_seen": 46291040, + "step": 79735 + }, + { + "epoch": 11.876675603217159, + "grad_norm": 0.04296875, + "learning_rate": 0.012714199016390227, + "loss": 0.7656, + "num_input_tokens_seen": 46293920, + "step": 79740 + }, + { + "epoch": 11.877420315758117, + "grad_norm": 0.048095703125, + "learning_rate": 0.012712272152940237, + "loss": 0.8165, + "num_input_tokens_seen": 46296768, + "step": 79745 + }, + { + "epoch": 11.878165028299076, + "grad_norm": 0.03271484375, + "learning_rate": 0.012710345328139087, + "loss": 0.7933, + "num_input_tokens_seen": 46299648, + "step": 79750 + }, + { + "epoch": 11.878909740840037, + "grad_norm": 0.041748046875, + "learning_rate": 0.012708418542019316, + "loss": 0.8047, + "num_input_tokens_seen": 46302592, + "step": 79755 + }, + { + "epoch": 11.879654453380995, + "grad_norm": 0.038818359375, + "learning_rate": 0.012706491794613492, + "loss": 0.8099, + "num_input_tokens_seen": 46305728, + "step": 79760 + }, + { + "epoch": 11.880399165921954, + "grad_norm": 0.0240478515625, + "learning_rate": 0.012704565085954156, + "loss": 0.7982, + "num_input_tokens_seen": 46308544, + "step": 79765 + }, + { + "epoch": 11.881143878462913, + "grad_norm": 0.033447265625, + "learning_rate": 0.012702638416073858, + "loss": 0.7873, + "num_input_tokens_seen": 46311520, + "step": 79770 + }, + { + "epoch": 11.881888591003872, + "grad_norm": 0.031494140625, + "learning_rate": 0.012700711785005151, + "loss": 0.7722, + "num_input_tokens_seen": 46314336, + "step": 79775 + }, + { + "epoch": 11.882633303544832, + "grad_norm": 0.0322265625, + "learning_rate": 0.012698785192780582, + "loss": 0.8037, + "num_input_tokens_seen": 46317344, + "step": 79780 + }, + { + "epoch": 11.883378016085791, + "grad_norm": 0.03662109375, + "learning_rate": 0.012696858639432701, + "loss": 0.788, + "num_input_tokens_seen": 46320224, + "step": 79785 + }, + { + "epoch": 11.88412272862675, + "grad_norm": 0.044921875, + "learning_rate": 0.012694932124994045, + "loss": 0.7788, + "num_input_tokens_seen": 46323104, + "step": 79790 + }, + { + "epoch": 11.884867441167708, + "grad_norm": 0.05029296875, + "learning_rate": 0.012693005649497175, + "loss": 0.7825, + "num_input_tokens_seen": 46326048, + "step": 79795 + }, + { + "epoch": 11.885612153708669, + "grad_norm": 0.039306640625, + "learning_rate": 0.012691079212974627, + "loss": 0.7919, + "num_input_tokens_seen": 46329376, + "step": 79800 + }, + { + "epoch": 11.886356866249628, + "grad_norm": 0.0400390625, + "learning_rate": 0.012689152815458947, + "loss": 0.7897, + "num_input_tokens_seen": 46332064, + "step": 79805 + }, + { + "epoch": 11.887101578790586, + "grad_norm": 0.068359375, + "learning_rate": 0.012687226456982684, + "loss": 0.7821, + "num_input_tokens_seen": 46334944, + "step": 79810 + }, + { + "epoch": 11.887846291331545, + "grad_norm": 0.03369140625, + "learning_rate": 0.012685300137578383, + "loss": 0.7882, + "num_input_tokens_seen": 46338016, + "step": 79815 + }, + { + "epoch": 11.888591003872506, + "grad_norm": 0.051513671875, + "learning_rate": 0.012683373857278574, + "loss": 0.8223, + "num_input_tokens_seen": 46340960, + "step": 79820 + }, + { + "epoch": 11.889335716413465, + "grad_norm": 0.04296875, + "learning_rate": 0.012681447616115817, + "loss": 0.8087, + "num_input_tokens_seen": 46343968, + "step": 79825 + }, + { + "epoch": 11.890080428954423, + "grad_norm": 0.07080078125, + "learning_rate": 0.012679521414122647, + "loss": 0.7927, + "num_input_tokens_seen": 46347168, + "step": 79830 + }, + { + "epoch": 11.890825141495382, + "grad_norm": 0.06396484375, + "learning_rate": 0.0126775952513316, + "loss": 0.8045, + "num_input_tokens_seen": 46349952, + "step": 79835 + }, + { + "epoch": 11.891569854036343, + "grad_norm": 0.06787109375, + "learning_rate": 0.012675669127775223, + "loss": 0.8, + "num_input_tokens_seen": 46352768, + "step": 79840 + }, + { + "epoch": 11.892314566577301, + "grad_norm": 0.038818359375, + "learning_rate": 0.01267374304348605, + "loss": 0.8156, + "num_input_tokens_seen": 46355776, + "step": 79845 + }, + { + "epoch": 11.89305927911826, + "grad_norm": 0.0311279296875, + "learning_rate": 0.012671816998496628, + "loss": 0.8, + "num_input_tokens_seen": 46358624, + "step": 79850 + }, + { + "epoch": 11.893803991659219, + "grad_norm": 0.0615234375, + "learning_rate": 0.012669890992839487, + "loss": 0.8049, + "num_input_tokens_seen": 46361504, + "step": 79855 + }, + { + "epoch": 11.89454870420018, + "grad_norm": 0.039306640625, + "learning_rate": 0.012667965026547174, + "loss": 0.7974, + "num_input_tokens_seen": 46364384, + "step": 79860 + }, + { + "epoch": 11.895293416741138, + "grad_norm": 0.0732421875, + "learning_rate": 0.012666039099652221, + "loss": 0.7814, + "num_input_tokens_seen": 46367168, + "step": 79865 + }, + { + "epoch": 11.896038129282097, + "grad_norm": 0.0291748046875, + "learning_rate": 0.012664113212187167, + "loss": 0.7922, + "num_input_tokens_seen": 46370048, + "step": 79870 + }, + { + "epoch": 11.896782841823056, + "grad_norm": 0.029541015625, + "learning_rate": 0.012662187364184545, + "loss": 0.7825, + "num_input_tokens_seen": 46372608, + "step": 79875 + }, + { + "epoch": 11.897527554364016, + "grad_norm": 0.029052734375, + "learning_rate": 0.012660261555676887, + "loss": 0.7786, + "num_input_tokens_seen": 46375584, + "step": 79880 + }, + { + "epoch": 11.898272266904975, + "grad_norm": 0.029052734375, + "learning_rate": 0.012658335786696739, + "loss": 0.7984, + "num_input_tokens_seen": 46378368, + "step": 79885 + }, + { + "epoch": 11.899016979445934, + "grad_norm": 0.024658203125, + "learning_rate": 0.012656410057276619, + "loss": 0.7868, + "num_input_tokens_seen": 46381248, + "step": 79890 + }, + { + "epoch": 11.899761691986892, + "grad_norm": 0.043701171875, + "learning_rate": 0.012654484367449077, + "loss": 0.7977, + "num_input_tokens_seen": 46384160, + "step": 79895 + }, + { + "epoch": 11.900506404527853, + "grad_norm": 0.0206298828125, + "learning_rate": 0.012652558717246635, + "loss": 0.7936, + "num_input_tokens_seen": 46386880, + "step": 79900 + }, + { + "epoch": 11.901251117068812, + "grad_norm": 0.039794921875, + "learning_rate": 0.01265063310670183, + "loss": 0.7999, + "num_input_tokens_seen": 46389408, + "step": 79905 + }, + { + "epoch": 11.90199582960977, + "grad_norm": 0.03662109375, + "learning_rate": 0.01264870753584719, + "loss": 0.8768, + "num_input_tokens_seen": 46392576, + "step": 79910 + }, + { + "epoch": 11.90274054215073, + "grad_norm": 0.058349609375, + "learning_rate": 0.012646782004715243, + "loss": 0.7713, + "num_input_tokens_seen": 46395456, + "step": 79915 + }, + { + "epoch": 11.90348525469169, + "grad_norm": 0.03125, + "learning_rate": 0.012644856513338526, + "loss": 0.797, + "num_input_tokens_seen": 46398304, + "step": 79920 + }, + { + "epoch": 11.904229967232649, + "grad_norm": 0.046142578125, + "learning_rate": 0.012642931061749558, + "loss": 0.8165, + "num_input_tokens_seen": 46401120, + "step": 79925 + }, + { + "epoch": 11.904974679773607, + "grad_norm": 0.064453125, + "learning_rate": 0.012641005649980882, + "loss": 0.7785, + "num_input_tokens_seen": 46404128, + "step": 79930 + }, + { + "epoch": 11.905719392314566, + "grad_norm": 0.038818359375, + "learning_rate": 0.012639080278065014, + "loss": 0.8107, + "num_input_tokens_seen": 46406880, + "step": 79935 + }, + { + "epoch": 11.906464104855527, + "grad_norm": 0.03955078125, + "learning_rate": 0.01263715494603449, + "loss": 0.8009, + "num_input_tokens_seen": 46409952, + "step": 79940 + }, + { + "epoch": 11.907208817396485, + "grad_norm": 0.046875, + "learning_rate": 0.012635229653921822, + "loss": 0.7999, + "num_input_tokens_seen": 46413088, + "step": 79945 + }, + { + "epoch": 11.907953529937444, + "grad_norm": 0.044921875, + "learning_rate": 0.012633304401759554, + "loss": 0.7885, + "num_input_tokens_seen": 46415968, + "step": 79950 + }, + { + "epoch": 11.908698242478403, + "grad_norm": 0.03515625, + "learning_rate": 0.012631379189580202, + "loss": 0.8014, + "num_input_tokens_seen": 46418656, + "step": 79955 + }, + { + "epoch": 11.909442955019362, + "grad_norm": 0.046630859375, + "learning_rate": 0.012629454017416287, + "loss": 0.8162, + "num_input_tokens_seen": 46421600, + "step": 79960 + }, + { + "epoch": 11.910187667560322, + "grad_norm": 0.02978515625, + "learning_rate": 0.01262752888530034, + "loss": 0.799, + "num_input_tokens_seen": 46424512, + "step": 79965 + }, + { + "epoch": 11.910932380101281, + "grad_norm": 0.045654296875, + "learning_rate": 0.01262560379326488, + "loss": 0.7944, + "num_input_tokens_seen": 46427488, + "step": 79970 + }, + { + "epoch": 11.91167709264224, + "grad_norm": 0.03076171875, + "learning_rate": 0.012623678741342435, + "loss": 0.7925, + "num_input_tokens_seen": 46430112, + "step": 79975 + }, + { + "epoch": 11.912421805183198, + "grad_norm": 0.03662109375, + "learning_rate": 0.012621753729565515, + "loss": 0.8018, + "num_input_tokens_seen": 46433024, + "step": 79980 + }, + { + "epoch": 11.913166517724159, + "grad_norm": 0.1748046875, + "learning_rate": 0.012619828757966656, + "loss": 0.8461, + "num_input_tokens_seen": 46435904, + "step": 79985 + }, + { + "epoch": 11.913911230265118, + "grad_norm": 0.058837890625, + "learning_rate": 0.01261790382657837, + "loss": 0.8068, + "num_input_tokens_seen": 46438688, + "step": 79990 + }, + { + "epoch": 11.914655942806077, + "grad_norm": 0.0272216796875, + "learning_rate": 0.01261597893543317, + "loss": 0.7875, + "num_input_tokens_seen": 46441312, + "step": 79995 + }, + { + "epoch": 11.915400655347035, + "grad_norm": 0.0223388671875, + "learning_rate": 0.01261405408456359, + "loss": 0.8, + "num_input_tokens_seen": 46444256, + "step": 80000 + }, + { + "epoch": 11.916145367887996, + "grad_norm": 0.030029296875, + "learning_rate": 0.012612129274002138, + "loss": 0.7962, + "num_input_tokens_seen": 46447200, + "step": 80005 + }, + { + "epoch": 11.916890080428955, + "grad_norm": 0.041748046875, + "learning_rate": 0.01261020450378134, + "loss": 0.7843, + "num_input_tokens_seen": 46450048, + "step": 80010 + }, + { + "epoch": 11.917634792969913, + "grad_norm": 0.0198974609375, + "learning_rate": 0.0126082797739337, + "loss": 0.804, + "num_input_tokens_seen": 46453088, + "step": 80015 + }, + { + "epoch": 11.918379505510872, + "grad_norm": 0.0361328125, + "learning_rate": 0.01260635508449175, + "loss": 0.794, + "num_input_tokens_seen": 46456256, + "step": 80020 + }, + { + "epoch": 11.919124218051833, + "grad_norm": 0.04638671875, + "learning_rate": 0.01260443043548799, + "loss": 0.7882, + "num_input_tokens_seen": 46459040, + "step": 80025 + }, + { + "epoch": 11.919868930592791, + "grad_norm": 0.032958984375, + "learning_rate": 0.01260250582695495, + "loss": 0.804, + "num_input_tokens_seen": 46461952, + "step": 80030 + }, + { + "epoch": 11.92061364313375, + "grad_norm": 0.0223388671875, + "learning_rate": 0.012600581258925138, + "loss": 0.8092, + "num_input_tokens_seen": 46464896, + "step": 80035 + }, + { + "epoch": 11.921358355674709, + "grad_norm": 0.0517578125, + "learning_rate": 0.012598656731431065, + "loss": 0.8066, + "num_input_tokens_seen": 46467680, + "step": 80040 + }, + { + "epoch": 11.92210306821567, + "grad_norm": 0.0205078125, + "learning_rate": 0.012596732244505247, + "loss": 0.7946, + "num_input_tokens_seen": 46470432, + "step": 80045 + }, + { + "epoch": 11.922847780756628, + "grad_norm": 0.0458984375, + "learning_rate": 0.012594807798180188, + "loss": 0.7997, + "num_input_tokens_seen": 46473408, + "step": 80050 + }, + { + "epoch": 11.923592493297587, + "grad_norm": 0.02392578125, + "learning_rate": 0.012592883392488414, + "loss": 0.7839, + "num_input_tokens_seen": 46476096, + "step": 80055 + }, + { + "epoch": 11.924337205838546, + "grad_norm": 0.0439453125, + "learning_rate": 0.012590959027462423, + "loss": 0.7807, + "num_input_tokens_seen": 46479168, + "step": 80060 + }, + { + "epoch": 11.925081918379506, + "grad_norm": 0.02490234375, + "learning_rate": 0.012589034703134739, + "loss": 0.7915, + "num_input_tokens_seen": 46482240, + "step": 80065 + }, + { + "epoch": 11.925826630920465, + "grad_norm": 0.037841796875, + "learning_rate": 0.012587110419537858, + "loss": 0.7999, + "num_input_tokens_seen": 46485088, + "step": 80070 + }, + { + "epoch": 11.926571343461424, + "grad_norm": 0.03125, + "learning_rate": 0.012585186176704295, + "loss": 0.8126, + "num_input_tokens_seen": 46487840, + "step": 80075 + }, + { + "epoch": 11.927316056002383, + "grad_norm": 0.049072265625, + "learning_rate": 0.01258326197466656, + "loss": 0.8028, + "num_input_tokens_seen": 46490976, + "step": 80080 + }, + { + "epoch": 11.928060768543343, + "grad_norm": 0.02734375, + "learning_rate": 0.01258133781345715, + "loss": 0.8044, + "num_input_tokens_seen": 46493536, + "step": 80085 + }, + { + "epoch": 11.928805481084302, + "grad_norm": 0.03515625, + "learning_rate": 0.012579413693108587, + "loss": 0.7938, + "num_input_tokens_seen": 46496416, + "step": 80090 + }, + { + "epoch": 11.92955019362526, + "grad_norm": 0.040283203125, + "learning_rate": 0.012577489613653365, + "loss": 0.7931, + "num_input_tokens_seen": 46499264, + "step": 80095 + }, + { + "epoch": 11.93029490616622, + "grad_norm": 0.0390625, + "learning_rate": 0.012575565575123996, + "loss": 0.8073, + "num_input_tokens_seen": 46501920, + "step": 80100 + }, + { + "epoch": 11.931039618707178, + "grad_norm": 0.04736328125, + "learning_rate": 0.012573641577552981, + "loss": 0.7924, + "num_input_tokens_seen": 46504704, + "step": 80105 + }, + { + "epoch": 11.931784331248139, + "grad_norm": 0.053466796875, + "learning_rate": 0.01257171762097283, + "loss": 0.7996, + "num_input_tokens_seen": 46507552, + "step": 80110 + }, + { + "epoch": 11.932529043789097, + "grad_norm": 0.034912109375, + "learning_rate": 0.012569793705416039, + "loss": 0.7977, + "num_input_tokens_seen": 46510080, + "step": 80115 + }, + { + "epoch": 11.933273756330056, + "grad_norm": 0.0380859375, + "learning_rate": 0.01256786983091511, + "loss": 0.8271, + "num_input_tokens_seen": 46512736, + "step": 80120 + }, + { + "epoch": 11.934018468871017, + "grad_norm": 0.0478515625, + "learning_rate": 0.012565945997502553, + "loss": 0.8051, + "num_input_tokens_seen": 46515584, + "step": 80125 + }, + { + "epoch": 11.934763181411975, + "grad_norm": 0.0556640625, + "learning_rate": 0.012564022205210862, + "loss": 0.7921, + "num_input_tokens_seen": 46518688, + "step": 80130 + }, + { + "epoch": 11.935507893952934, + "grad_norm": 0.076171875, + "learning_rate": 0.012562098454072539, + "loss": 0.7904, + "num_input_tokens_seen": 46521568, + "step": 80135 + }, + { + "epoch": 11.936252606493893, + "grad_norm": 0.054931640625, + "learning_rate": 0.012560174744120085, + "loss": 0.8122, + "num_input_tokens_seen": 46524480, + "step": 80140 + }, + { + "epoch": 11.936997319034852, + "grad_norm": 0.024658203125, + "learning_rate": 0.012558251075386004, + "loss": 0.7941, + "num_input_tokens_seen": 46527648, + "step": 80145 + }, + { + "epoch": 11.937742031575812, + "grad_norm": 0.059814453125, + "learning_rate": 0.012556327447902789, + "loss": 0.7916, + "num_input_tokens_seen": 46530784, + "step": 80150 + }, + { + "epoch": 11.938486744116771, + "grad_norm": 0.037353515625, + "learning_rate": 0.01255440386170293, + "loss": 0.807, + "num_input_tokens_seen": 46533664, + "step": 80155 + }, + { + "epoch": 11.93923145665773, + "grad_norm": 0.033935546875, + "learning_rate": 0.012552480316818939, + "loss": 0.8383, + "num_input_tokens_seen": 46536512, + "step": 80160 + }, + { + "epoch": 11.939976169198689, + "grad_norm": 0.072265625, + "learning_rate": 0.012550556813283303, + "loss": 0.8045, + "num_input_tokens_seen": 46539584, + "step": 80165 + }, + { + "epoch": 11.940720881739649, + "grad_norm": 0.0224609375, + "learning_rate": 0.012548633351128522, + "loss": 0.8106, + "num_input_tokens_seen": 46542624, + "step": 80170 + }, + { + "epoch": 11.941465594280608, + "grad_norm": 0.0322265625, + "learning_rate": 0.012546709930387085, + "loss": 0.7847, + "num_input_tokens_seen": 46545632, + "step": 80175 + }, + { + "epoch": 11.942210306821567, + "grad_norm": 0.0303955078125, + "learning_rate": 0.012544786551091496, + "loss": 0.8022, + "num_input_tokens_seen": 46548256, + "step": 80180 + }, + { + "epoch": 11.942955019362525, + "grad_norm": 0.02880859375, + "learning_rate": 0.012542863213274235, + "loss": 0.7875, + "num_input_tokens_seen": 46551040, + "step": 80185 + }, + { + "epoch": 11.943699731903486, + "grad_norm": 0.037841796875, + "learning_rate": 0.012540939916967812, + "loss": 0.7777, + "num_input_tokens_seen": 46553728, + "step": 80190 + }, + { + "epoch": 11.944444444444445, + "grad_norm": 0.047119140625, + "learning_rate": 0.012539016662204706, + "loss": 0.7794, + "num_input_tokens_seen": 46556800, + "step": 80195 + }, + { + "epoch": 11.945189156985403, + "grad_norm": 0.037841796875, + "learning_rate": 0.012537093449017411, + "loss": 0.7907, + "num_input_tokens_seen": 46559840, + "step": 80200 + }, + { + "epoch": 11.945933869526362, + "grad_norm": 0.0380859375, + "learning_rate": 0.012535170277438423, + "loss": 0.7949, + "num_input_tokens_seen": 46562336, + "step": 80205 + }, + { + "epoch": 11.946678582067323, + "grad_norm": 0.04541015625, + "learning_rate": 0.01253324714750022, + "loss": 0.8076, + "num_input_tokens_seen": 46565056, + "step": 80210 + }, + { + "epoch": 11.947423294608281, + "grad_norm": 0.030517578125, + "learning_rate": 0.012531324059235306, + "loss": 0.8042, + "num_input_tokens_seen": 46568032, + "step": 80215 + }, + { + "epoch": 11.94816800714924, + "grad_norm": 0.03662109375, + "learning_rate": 0.012529401012676158, + "loss": 0.8067, + "num_input_tokens_seen": 46570656, + "step": 80220 + }, + { + "epoch": 11.948912719690199, + "grad_norm": 0.037841796875, + "learning_rate": 0.012527478007855274, + "loss": 0.7887, + "num_input_tokens_seen": 46573568, + "step": 80225 + }, + { + "epoch": 11.94965743223116, + "grad_norm": 0.045166015625, + "learning_rate": 0.012525555044805136, + "loss": 0.7802, + "num_input_tokens_seen": 46576384, + "step": 80230 + }, + { + "epoch": 11.950402144772118, + "grad_norm": 0.035888671875, + "learning_rate": 0.012523632123558228, + "loss": 0.8109, + "num_input_tokens_seen": 46579424, + "step": 80235 + }, + { + "epoch": 11.951146857313077, + "grad_norm": 0.021240234375, + "learning_rate": 0.012521709244147042, + "loss": 0.8064, + "num_input_tokens_seen": 46582048, + "step": 80240 + }, + { + "epoch": 11.951891569854036, + "grad_norm": 0.0245361328125, + "learning_rate": 0.01251978640660405, + "loss": 0.7888, + "num_input_tokens_seen": 46584768, + "step": 80245 + }, + { + "epoch": 11.952636282394996, + "grad_norm": 0.040771484375, + "learning_rate": 0.012517863610961756, + "loss": 0.7834, + "num_input_tokens_seen": 46587872, + "step": 80250 + }, + { + "epoch": 11.953380994935955, + "grad_norm": 0.0281982421875, + "learning_rate": 0.012515940857252628, + "loss": 0.7995, + "num_input_tokens_seen": 46590880, + "step": 80255 + }, + { + "epoch": 11.954125707476914, + "grad_norm": 0.03271484375, + "learning_rate": 0.012514018145509157, + "loss": 0.7865, + "num_input_tokens_seen": 46593600, + "step": 80260 + }, + { + "epoch": 11.954870420017873, + "grad_norm": 0.04736328125, + "learning_rate": 0.012512095475763822, + "loss": 0.7928, + "num_input_tokens_seen": 46596320, + "step": 80265 + }, + { + "epoch": 11.955615132558833, + "grad_norm": 0.03662109375, + "learning_rate": 0.012510172848049108, + "loss": 0.8241, + "num_input_tokens_seen": 46599008, + "step": 80270 + }, + { + "epoch": 11.956359845099792, + "grad_norm": 0.03515625, + "learning_rate": 0.012508250262397493, + "loss": 0.8179, + "num_input_tokens_seen": 46602112, + "step": 80275 + }, + { + "epoch": 11.95710455764075, + "grad_norm": 0.044677734375, + "learning_rate": 0.012506327718841451, + "loss": 0.7929, + "num_input_tokens_seen": 46605216, + "step": 80280 + }, + { + "epoch": 11.95784927018171, + "grad_norm": 0.03173828125, + "learning_rate": 0.012504405217413476, + "loss": 0.7879, + "num_input_tokens_seen": 46608352, + "step": 80285 + }, + { + "epoch": 11.958593982722668, + "grad_norm": 0.0235595703125, + "learning_rate": 0.012502482758146029, + "loss": 0.8463, + "num_input_tokens_seen": 46611136, + "step": 80290 + }, + { + "epoch": 11.959338695263629, + "grad_norm": 0.0311279296875, + "learning_rate": 0.012500560341071604, + "loss": 0.7891, + "num_input_tokens_seen": 46613824, + "step": 80295 + }, + { + "epoch": 11.960083407804587, + "grad_norm": 0.0361328125, + "learning_rate": 0.012498637966222671, + "loss": 0.8172, + "num_input_tokens_seen": 46616896, + "step": 80300 + }, + { + "epoch": 11.960828120345546, + "grad_norm": 0.03759765625, + "learning_rate": 0.01249671563363171, + "loss": 0.8069, + "num_input_tokens_seen": 46619680, + "step": 80305 + }, + { + "epoch": 11.961572832886507, + "grad_norm": 0.0205078125, + "learning_rate": 0.012494793343331192, + "loss": 0.7944, + "num_input_tokens_seen": 46622336, + "step": 80310 + }, + { + "epoch": 11.962317545427466, + "grad_norm": 0.041259765625, + "learning_rate": 0.012492871095353589, + "loss": 0.8204, + "num_input_tokens_seen": 46625184, + "step": 80315 + }, + { + "epoch": 11.963062257968424, + "grad_norm": 0.0255126953125, + "learning_rate": 0.01249094888973139, + "loss": 0.7923, + "num_input_tokens_seen": 46627840, + "step": 80320 + }, + { + "epoch": 11.963806970509383, + "grad_norm": 0.0230712890625, + "learning_rate": 0.012489026726497049, + "loss": 0.8005, + "num_input_tokens_seen": 46630816, + "step": 80325 + }, + { + "epoch": 11.964551683050342, + "grad_norm": 0.037353515625, + "learning_rate": 0.012487104605683055, + "loss": 0.807, + "num_input_tokens_seen": 46634208, + "step": 80330 + }, + { + "epoch": 11.965296395591302, + "grad_norm": 0.037109375, + "learning_rate": 0.012485182527321875, + "loss": 0.7765, + "num_input_tokens_seen": 46637024, + "step": 80335 + }, + { + "epoch": 11.966041108132261, + "grad_norm": 0.0308837890625, + "learning_rate": 0.012483260491445982, + "loss": 0.7965, + "num_input_tokens_seen": 46639872, + "step": 80340 + }, + { + "epoch": 11.96678582067322, + "grad_norm": 0.044677734375, + "learning_rate": 0.012481338498087838, + "loss": 0.826, + "num_input_tokens_seen": 46642624, + "step": 80345 + }, + { + "epoch": 11.967530533214179, + "grad_norm": 0.0419921875, + "learning_rate": 0.012479416547279928, + "loss": 0.8101, + "num_input_tokens_seen": 46645216, + "step": 80350 + }, + { + "epoch": 11.96827524575514, + "grad_norm": 0.0296630859375, + "learning_rate": 0.012477494639054712, + "loss": 0.782, + "num_input_tokens_seen": 46648096, + "step": 80355 + }, + { + "epoch": 11.969019958296098, + "grad_norm": 0.0299072265625, + "learning_rate": 0.012475572773444656, + "loss": 0.8022, + "num_input_tokens_seen": 46650976, + "step": 80360 + }, + { + "epoch": 11.969764670837057, + "grad_norm": 0.032958984375, + "learning_rate": 0.012473650950482239, + "loss": 0.8056, + "num_input_tokens_seen": 46654208, + "step": 80365 + }, + { + "epoch": 11.970509383378015, + "grad_norm": 0.0380859375, + "learning_rate": 0.012471729170199916, + "loss": 0.7891, + "num_input_tokens_seen": 46656768, + "step": 80370 + }, + { + "epoch": 11.971254095918976, + "grad_norm": 0.0220947265625, + "learning_rate": 0.012469807432630164, + "loss": 0.8103, + "num_input_tokens_seen": 46659424, + "step": 80375 + }, + { + "epoch": 11.971998808459935, + "grad_norm": 0.036376953125, + "learning_rate": 0.012467885737805436, + "loss": 0.7874, + "num_input_tokens_seen": 46662336, + "step": 80380 + }, + { + "epoch": 11.972743521000893, + "grad_norm": 0.037353515625, + "learning_rate": 0.01246596408575821, + "loss": 0.7993, + "num_input_tokens_seen": 46665152, + "step": 80385 + }, + { + "epoch": 11.973488233541852, + "grad_norm": 0.029541015625, + "learning_rate": 0.01246404247652095, + "loss": 0.7777, + "num_input_tokens_seen": 46668032, + "step": 80390 + }, + { + "epoch": 11.974232946082813, + "grad_norm": 0.036865234375, + "learning_rate": 0.012462120910126103, + "loss": 0.7666, + "num_input_tokens_seen": 46670944, + "step": 80395 + }, + { + "epoch": 11.974977658623772, + "grad_norm": 0.04443359375, + "learning_rate": 0.012460199386606155, + "loss": 0.8124, + "num_input_tokens_seen": 46673664, + "step": 80400 + }, + { + "epoch": 11.97572237116473, + "grad_norm": 0.03125, + "learning_rate": 0.012458277905993551, + "loss": 0.8118, + "num_input_tokens_seen": 46676640, + "step": 80405 + }, + { + "epoch": 11.976467083705689, + "grad_norm": 0.0267333984375, + "learning_rate": 0.012456356468320762, + "loss": 0.8034, + "num_input_tokens_seen": 46679648, + "step": 80410 + }, + { + "epoch": 11.97721179624665, + "grad_norm": 0.03759765625, + "learning_rate": 0.012454435073620242, + "loss": 0.8161, + "num_input_tokens_seen": 46682528, + "step": 80415 + }, + { + "epoch": 11.977956508787608, + "grad_norm": 0.034423828125, + "learning_rate": 0.012452513721924458, + "loss": 0.7931, + "num_input_tokens_seen": 46685920, + "step": 80420 + }, + { + "epoch": 11.978701221328567, + "grad_norm": 0.031494140625, + "learning_rate": 0.01245059241326586, + "loss": 0.8041, + "num_input_tokens_seen": 46688672, + "step": 80425 + }, + { + "epoch": 11.979445933869526, + "grad_norm": 0.029541015625, + "learning_rate": 0.012448671147676916, + "loss": 0.8098, + "num_input_tokens_seen": 46691776, + "step": 80430 + }, + { + "epoch": 11.980190646410486, + "grad_norm": 0.0294189453125, + "learning_rate": 0.01244674992519008, + "loss": 0.7976, + "num_input_tokens_seen": 46694656, + "step": 80435 + }, + { + "epoch": 11.980935358951445, + "grad_norm": 0.04443359375, + "learning_rate": 0.012444828745837808, + "loss": 0.8043, + "num_input_tokens_seen": 46697312, + "step": 80440 + }, + { + "epoch": 11.981680071492404, + "grad_norm": 0.037353515625, + "learning_rate": 0.01244290760965256, + "loss": 0.8, + "num_input_tokens_seen": 46700256, + "step": 80445 + }, + { + "epoch": 11.982424784033363, + "grad_norm": 0.03466796875, + "learning_rate": 0.012440986516666783, + "loss": 0.7977, + "num_input_tokens_seen": 46703136, + "step": 80450 + }, + { + "epoch": 11.983169496574323, + "grad_norm": 0.03857421875, + "learning_rate": 0.012439065466912945, + "loss": 0.7814, + "num_input_tokens_seen": 46706048, + "step": 80455 + }, + { + "epoch": 11.983914209115282, + "grad_norm": 0.035888671875, + "learning_rate": 0.012437144460423486, + "loss": 0.7988, + "num_input_tokens_seen": 46708832, + "step": 80460 + }, + { + "epoch": 11.98465892165624, + "grad_norm": 0.045166015625, + "learning_rate": 0.012435223497230872, + "loss": 0.8201, + "num_input_tokens_seen": 46711648, + "step": 80465 + }, + { + "epoch": 11.9854036341972, + "grad_norm": 0.029541015625, + "learning_rate": 0.012433302577367546, + "loss": 0.8027, + "num_input_tokens_seen": 46714752, + "step": 80470 + }, + { + "epoch": 11.986148346738158, + "grad_norm": 0.0322265625, + "learning_rate": 0.012431381700865971, + "loss": 0.7847, + "num_input_tokens_seen": 46717888, + "step": 80475 + }, + { + "epoch": 11.986893059279119, + "grad_norm": 0.0203857421875, + "learning_rate": 0.012429460867758589, + "loss": 0.7998, + "num_input_tokens_seen": 46720736, + "step": 80480 + }, + { + "epoch": 11.987637771820078, + "grad_norm": 0.02197265625, + "learning_rate": 0.012427540078077845, + "loss": 0.8045, + "num_input_tokens_seen": 46723520, + "step": 80485 + }, + { + "epoch": 11.988382484361036, + "grad_norm": 0.0458984375, + "learning_rate": 0.012425619331856206, + "loss": 0.8068, + "num_input_tokens_seen": 46727040, + "step": 80490 + }, + { + "epoch": 11.989127196901995, + "grad_norm": 0.03369140625, + "learning_rate": 0.012423698629126111, + "loss": 0.8035, + "num_input_tokens_seen": 46729824, + "step": 80495 + }, + { + "epoch": 11.989871909442956, + "grad_norm": 0.03515625, + "learning_rate": 0.012421777969920009, + "loss": 0.7926, + "num_input_tokens_seen": 46732608, + "step": 80500 + }, + { + "epoch": 11.990616621983914, + "grad_norm": 0.0390625, + "learning_rate": 0.01241985735427034, + "loss": 0.7844, + "num_input_tokens_seen": 46735456, + "step": 80505 + }, + { + "epoch": 11.991361334524873, + "grad_norm": 0.025634765625, + "learning_rate": 0.012417936782209569, + "loss": 0.7812, + "num_input_tokens_seen": 46738656, + "step": 80510 + }, + { + "epoch": 11.992106047065832, + "grad_norm": 0.040771484375, + "learning_rate": 0.012416016253770128, + "loss": 0.7982, + "num_input_tokens_seen": 46741888, + "step": 80515 + }, + { + "epoch": 11.992850759606792, + "grad_norm": 0.0303955078125, + "learning_rate": 0.012414095768984465, + "loss": 0.8002, + "num_input_tokens_seen": 46744704, + "step": 80520 + }, + { + "epoch": 11.993595472147751, + "grad_norm": 0.04541015625, + "learning_rate": 0.012412175327885027, + "loss": 0.8086, + "num_input_tokens_seen": 46747264, + "step": 80525 + }, + { + "epoch": 11.99434018468871, + "grad_norm": 0.046142578125, + "learning_rate": 0.012410254930504254, + "loss": 0.8046, + "num_input_tokens_seen": 46750240, + "step": 80530 + }, + { + "epoch": 11.995084897229669, + "grad_norm": 0.031494140625, + "learning_rate": 0.012408334576874597, + "loss": 0.7924, + "num_input_tokens_seen": 46753344, + "step": 80535 + }, + { + "epoch": 11.99582960977063, + "grad_norm": 0.041748046875, + "learning_rate": 0.012406414267028487, + "loss": 0.8147, + "num_input_tokens_seen": 46756128, + "step": 80540 + }, + { + "epoch": 11.996574322311588, + "grad_norm": 0.041015625, + "learning_rate": 0.012404494000998377, + "loss": 0.7798, + "num_input_tokens_seen": 46759168, + "step": 80545 + }, + { + "epoch": 11.997319034852547, + "grad_norm": 0.04052734375, + "learning_rate": 0.012402573778816699, + "loss": 0.7931, + "num_input_tokens_seen": 46762144, + "step": 80550 + }, + { + "epoch": 11.998063747393505, + "grad_norm": 0.04638671875, + "learning_rate": 0.0124006536005159, + "loss": 0.8008, + "num_input_tokens_seen": 46765024, + "step": 80555 + }, + { + "epoch": 11.998808459934466, + "grad_norm": 0.0223388671875, + "learning_rate": 0.01239873346612842, + "loss": 0.812, + "num_input_tokens_seen": 46768192, + "step": 80560 + }, + { + "epoch": 11.999553172475425, + "grad_norm": 0.0279541015625, + "learning_rate": 0.012396813375686688, + "loss": 0.7982, + "num_input_tokens_seen": 46771104, + "step": 80565 + }, + { + "epoch": 12.0, + "eval_loss": 0.7999743819236755, + "eval_runtime": 70.6809, + "eval_samples_per_second": 42.218, + "eval_steps_per_second": 10.554, + "num_input_tokens_seen": 46772480, + "step": 80568 + }, + { + "epoch": 12.000297885016384, + "grad_norm": 0.03857421875, + "learning_rate": 0.012394893329223153, + "loss": 0.7892, + "num_input_tokens_seen": 46773440, + "step": 80570 + }, + { + "epoch": 12.001042597557342, + "grad_norm": 0.040283203125, + "learning_rate": 0.012392973326770241, + "loss": 0.7992, + "num_input_tokens_seen": 46776384, + "step": 80575 + }, + { + "epoch": 12.001787310098303, + "grad_norm": 0.031982421875, + "learning_rate": 0.012391053368360402, + "loss": 0.7996, + "num_input_tokens_seen": 46779200, + "step": 80580 + }, + { + "epoch": 12.002532022639262, + "grad_norm": 0.038330078125, + "learning_rate": 0.012389133454026058, + "loss": 0.7984, + "num_input_tokens_seen": 46782176, + "step": 80585 + }, + { + "epoch": 12.00327673518022, + "grad_norm": 0.0419921875, + "learning_rate": 0.012387213583799656, + "loss": 0.7891, + "num_input_tokens_seen": 46784992, + "step": 80590 + }, + { + "epoch": 12.004021447721179, + "grad_norm": 0.04248046875, + "learning_rate": 0.012385293757713625, + "loss": 0.7764, + "num_input_tokens_seen": 46788032, + "step": 80595 + }, + { + "epoch": 12.00476616026214, + "grad_norm": 0.0303955078125, + "learning_rate": 0.012383373975800395, + "loss": 0.7867, + "num_input_tokens_seen": 46790880, + "step": 80600 + }, + { + "epoch": 12.005510872803098, + "grad_norm": 0.0238037109375, + "learning_rate": 0.012381454238092409, + "loss": 0.8056, + "num_input_tokens_seen": 46794464, + "step": 80605 + }, + { + "epoch": 12.006255585344057, + "grad_norm": 0.03466796875, + "learning_rate": 0.012379534544622081, + "loss": 0.8003, + "num_input_tokens_seen": 46797536, + "step": 80610 + }, + { + "epoch": 12.007000297885016, + "grad_norm": 0.0341796875, + "learning_rate": 0.012377614895421862, + "loss": 0.804, + "num_input_tokens_seen": 46800896, + "step": 80615 + }, + { + "epoch": 12.007745010425976, + "grad_norm": 0.04052734375, + "learning_rate": 0.01237569529052417, + "loss": 0.7843, + "num_input_tokens_seen": 46803648, + "step": 80620 + }, + { + "epoch": 12.008489722966935, + "grad_norm": 0.032470703125, + "learning_rate": 0.012373775729961442, + "loss": 0.7956, + "num_input_tokens_seen": 46806400, + "step": 80625 + }, + { + "epoch": 12.009234435507894, + "grad_norm": 0.07373046875, + "learning_rate": 0.0123718562137661, + "loss": 0.8143, + "num_input_tokens_seen": 46809088, + "step": 80630 + }, + { + "epoch": 12.009979148048853, + "grad_norm": 0.03662109375, + "learning_rate": 0.012369936741970584, + "loss": 0.7928, + "num_input_tokens_seen": 46812096, + "step": 80635 + }, + { + "epoch": 12.010723860589811, + "grad_norm": 0.046142578125, + "learning_rate": 0.01236801731460731, + "loss": 0.809, + "num_input_tokens_seen": 46814720, + "step": 80640 + }, + { + "epoch": 12.011468573130772, + "grad_norm": 0.03662109375, + "learning_rate": 0.012366097931708703, + "loss": 0.7923, + "num_input_tokens_seen": 46817600, + "step": 80645 + }, + { + "epoch": 12.01221328567173, + "grad_norm": 0.03271484375, + "learning_rate": 0.0123641785933072, + "loss": 0.7923, + "num_input_tokens_seen": 46820288, + "step": 80650 + }, + { + "epoch": 12.01295799821269, + "grad_norm": 0.03369140625, + "learning_rate": 0.012362259299435215, + "loss": 0.8033, + "num_input_tokens_seen": 46823104, + "step": 80655 + }, + { + "epoch": 12.013702710753648, + "grad_norm": 0.038818359375, + "learning_rate": 0.012360340050125186, + "loss": 0.8061, + "num_input_tokens_seen": 46825888, + "step": 80660 + }, + { + "epoch": 12.014447423294609, + "grad_norm": 0.03515625, + "learning_rate": 0.012358420845409524, + "loss": 0.8353, + "num_input_tokens_seen": 46828864, + "step": 80665 + }, + { + "epoch": 12.015192135835568, + "grad_norm": 0.0361328125, + "learning_rate": 0.01235650168532066, + "loss": 0.8106, + "num_input_tokens_seen": 46831712, + "step": 80670 + }, + { + "epoch": 12.015936848376526, + "grad_norm": 0.04248046875, + "learning_rate": 0.012354582569891014, + "loss": 0.7923, + "num_input_tokens_seen": 46834720, + "step": 80675 + }, + { + "epoch": 12.016681560917485, + "grad_norm": 0.046142578125, + "learning_rate": 0.012352663499153002, + "loss": 0.8025, + "num_input_tokens_seen": 46837728, + "step": 80680 + }, + { + "epoch": 12.017426273458446, + "grad_norm": 0.023681640625, + "learning_rate": 0.012350744473139054, + "loss": 0.8081, + "num_input_tokens_seen": 46840640, + "step": 80685 + }, + { + "epoch": 12.018170985999404, + "grad_norm": 0.04052734375, + "learning_rate": 0.01234882549188158, + "loss": 0.7879, + "num_input_tokens_seen": 46844128, + "step": 80690 + }, + { + "epoch": 12.018915698540363, + "grad_norm": 0.037109375, + "learning_rate": 0.01234690655541301, + "loss": 0.7992, + "num_input_tokens_seen": 46846912, + "step": 80695 + }, + { + "epoch": 12.019660411081322, + "grad_norm": 0.040283203125, + "learning_rate": 0.012344987663765754, + "loss": 0.7803, + "num_input_tokens_seen": 46849632, + "step": 80700 + }, + { + "epoch": 12.020405123622282, + "grad_norm": 0.03271484375, + "learning_rate": 0.012343068816972239, + "loss": 0.7927, + "num_input_tokens_seen": 46852416, + "step": 80705 + }, + { + "epoch": 12.021149836163241, + "grad_norm": 0.046630859375, + "learning_rate": 0.012341150015064866, + "loss": 0.798, + "num_input_tokens_seen": 46855520, + "step": 80710 + }, + { + "epoch": 12.0218945487042, + "grad_norm": 0.0400390625, + "learning_rate": 0.012339231258076068, + "loss": 0.798, + "num_input_tokens_seen": 46858240, + "step": 80715 + }, + { + "epoch": 12.022639261245159, + "grad_norm": 0.031982421875, + "learning_rate": 0.012337312546038256, + "loss": 0.7923, + "num_input_tokens_seen": 46861152, + "step": 80720 + }, + { + "epoch": 12.02338397378612, + "grad_norm": 0.052490234375, + "learning_rate": 0.012335393878983835, + "loss": 0.7941, + "num_input_tokens_seen": 46863904, + "step": 80725 + }, + { + "epoch": 12.024128686327078, + "grad_norm": 0.06396484375, + "learning_rate": 0.012333475256945232, + "loss": 0.8045, + "num_input_tokens_seen": 46867200, + "step": 80730 + }, + { + "epoch": 12.024873398868037, + "grad_norm": 0.03125, + "learning_rate": 0.012331556679954852, + "loss": 0.7878, + "num_input_tokens_seen": 46870240, + "step": 80735 + }, + { + "epoch": 12.025618111408996, + "grad_norm": 0.0267333984375, + "learning_rate": 0.012329638148045113, + "loss": 0.7995, + "num_input_tokens_seen": 46873184, + "step": 80740 + }, + { + "epoch": 12.026362823949956, + "grad_norm": 0.050537109375, + "learning_rate": 0.012327719661248415, + "loss": 0.7917, + "num_input_tokens_seen": 46876544, + "step": 80745 + }, + { + "epoch": 12.027107536490915, + "grad_norm": 0.03564453125, + "learning_rate": 0.012325801219597187, + "loss": 0.7906, + "num_input_tokens_seen": 46879520, + "step": 80750 + }, + { + "epoch": 12.027852249031874, + "grad_norm": 0.0341796875, + "learning_rate": 0.012323882823123826, + "loss": 0.7753, + "num_input_tokens_seen": 46882688, + "step": 80755 + }, + { + "epoch": 12.028596961572832, + "grad_norm": 0.035400390625, + "learning_rate": 0.012321964471860741, + "loss": 0.7853, + "num_input_tokens_seen": 46886720, + "step": 80760 + }, + { + "epoch": 12.029341674113793, + "grad_norm": 0.03857421875, + "learning_rate": 0.01232004616584035, + "loss": 0.8075, + "num_input_tokens_seen": 46889760, + "step": 80765 + }, + { + "epoch": 12.030086386654752, + "grad_norm": 0.0390625, + "learning_rate": 0.012318127905095051, + "loss": 0.7935, + "num_input_tokens_seen": 46892448, + "step": 80770 + }, + { + "epoch": 12.03083109919571, + "grad_norm": 0.050048828125, + "learning_rate": 0.01231620968965726, + "loss": 0.7878, + "num_input_tokens_seen": 46895168, + "step": 80775 + }, + { + "epoch": 12.03157581173667, + "grad_norm": 0.042236328125, + "learning_rate": 0.012314291519559371, + "loss": 0.7682, + "num_input_tokens_seen": 46898336, + "step": 80780 + }, + { + "epoch": 12.03232052427763, + "grad_norm": 0.0859375, + "learning_rate": 0.012312373394833804, + "loss": 0.8127, + "num_input_tokens_seen": 46901184, + "step": 80785 + }, + { + "epoch": 12.033065236818588, + "grad_norm": 0.05517578125, + "learning_rate": 0.012310455315512956, + "loss": 0.8055, + "num_input_tokens_seen": 46904448, + "step": 80790 + }, + { + "epoch": 12.033809949359547, + "grad_norm": 0.04345703125, + "learning_rate": 0.012308537281629228, + "loss": 0.7748, + "num_input_tokens_seen": 46907360, + "step": 80795 + }, + { + "epoch": 12.034554661900506, + "grad_norm": 0.06689453125, + "learning_rate": 0.012306619293215033, + "loss": 0.7653, + "num_input_tokens_seen": 46910240, + "step": 80800 + }, + { + "epoch": 12.035299374441466, + "grad_norm": 0.06396484375, + "learning_rate": 0.012304701350302764, + "loss": 0.7958, + "num_input_tokens_seen": 46913024, + "step": 80805 + }, + { + "epoch": 12.036044086982425, + "grad_norm": 0.06298828125, + "learning_rate": 0.012302783452924829, + "loss": 0.8367, + "num_input_tokens_seen": 46915840, + "step": 80810 + }, + { + "epoch": 12.036788799523384, + "grad_norm": 0.042724609375, + "learning_rate": 0.01230086560111362, + "loss": 0.7844, + "num_input_tokens_seen": 46918848, + "step": 80815 + }, + { + "epoch": 12.037533512064343, + "grad_norm": 0.06640625, + "learning_rate": 0.01229894779490155, + "loss": 0.813, + "num_input_tokens_seen": 46921888, + "step": 80820 + }, + { + "epoch": 12.038278224605302, + "grad_norm": 0.042724609375, + "learning_rate": 0.012297030034321008, + "loss": 0.7872, + "num_input_tokens_seen": 46924896, + "step": 80825 + }, + { + "epoch": 12.039022937146262, + "grad_norm": 0.03466796875, + "learning_rate": 0.0122951123194044, + "loss": 0.7787, + "num_input_tokens_seen": 46927872, + "step": 80830 + }, + { + "epoch": 12.03976764968722, + "grad_norm": 0.034912109375, + "learning_rate": 0.012293194650184117, + "loss": 0.7738, + "num_input_tokens_seen": 46930848, + "step": 80835 + }, + { + "epoch": 12.04051236222818, + "grad_norm": 0.031494140625, + "learning_rate": 0.01229127702669256, + "loss": 0.7865, + "num_input_tokens_seen": 46933984, + "step": 80840 + }, + { + "epoch": 12.041257074769138, + "grad_norm": 0.040771484375, + "learning_rate": 0.012289359448962126, + "loss": 0.7958, + "num_input_tokens_seen": 46936736, + "step": 80845 + }, + { + "epoch": 12.042001787310099, + "grad_norm": 0.036865234375, + "learning_rate": 0.012287441917025204, + "loss": 0.7926, + "num_input_tokens_seen": 46939552, + "step": 80850 + }, + { + "epoch": 12.042746499851058, + "grad_norm": 0.041259765625, + "learning_rate": 0.012285524430914201, + "loss": 0.8338, + "num_input_tokens_seen": 46942304, + "step": 80855 + }, + { + "epoch": 12.043491212392016, + "grad_norm": 0.03564453125, + "learning_rate": 0.0122836069906615, + "loss": 0.7727, + "num_input_tokens_seen": 46945088, + "step": 80860 + }, + { + "epoch": 12.044235924932975, + "grad_norm": 0.0673828125, + "learning_rate": 0.0122816895962995, + "loss": 0.8057, + "num_input_tokens_seen": 46947680, + "step": 80865 + }, + { + "epoch": 12.044980637473936, + "grad_norm": 0.045654296875, + "learning_rate": 0.012279772247860586, + "loss": 0.7812, + "num_input_tokens_seen": 46950336, + "step": 80870 + }, + { + "epoch": 12.045725350014894, + "grad_norm": 0.0322265625, + "learning_rate": 0.012277854945377163, + "loss": 0.7789, + "num_input_tokens_seen": 46953280, + "step": 80875 + }, + { + "epoch": 12.046470062555853, + "grad_norm": 0.054931640625, + "learning_rate": 0.012275937688881612, + "loss": 0.782, + "num_input_tokens_seen": 46956064, + "step": 80880 + }, + { + "epoch": 12.047214775096812, + "grad_norm": 0.047607421875, + "learning_rate": 0.01227402047840632, + "loss": 0.8589, + "num_input_tokens_seen": 46958848, + "step": 80885 + }, + { + "epoch": 12.047959487637772, + "grad_norm": 0.032958984375, + "learning_rate": 0.012272103313983687, + "loss": 0.7995, + "num_input_tokens_seen": 46961760, + "step": 80890 + }, + { + "epoch": 12.048704200178731, + "grad_norm": 0.028564453125, + "learning_rate": 0.012270186195646094, + "loss": 0.7872, + "num_input_tokens_seen": 46964672, + "step": 80895 + }, + { + "epoch": 12.04944891271969, + "grad_norm": 0.04638671875, + "learning_rate": 0.012268269123425931, + "loss": 0.7912, + "num_input_tokens_seen": 46967328, + "step": 80900 + }, + { + "epoch": 12.050193625260649, + "grad_norm": 0.039306640625, + "learning_rate": 0.012266352097355583, + "loss": 0.7877, + "num_input_tokens_seen": 46970112, + "step": 80905 + }, + { + "epoch": 12.05093833780161, + "grad_norm": 0.034423828125, + "learning_rate": 0.012264435117467441, + "loss": 0.8095, + "num_input_tokens_seen": 46972800, + "step": 80910 + }, + { + "epoch": 12.051683050342568, + "grad_norm": 0.033203125, + "learning_rate": 0.012262518183793888, + "loss": 0.8007, + "num_input_tokens_seen": 46976032, + "step": 80915 + }, + { + "epoch": 12.052427762883527, + "grad_norm": 0.03662109375, + "learning_rate": 0.012260601296367302, + "loss": 0.7887, + "num_input_tokens_seen": 46979232, + "step": 80920 + }, + { + "epoch": 12.053172475424486, + "grad_norm": 0.04150390625, + "learning_rate": 0.012258684455220078, + "loss": 0.8053, + "num_input_tokens_seen": 46981984, + "step": 80925 + }, + { + "epoch": 12.053917187965446, + "grad_norm": 0.048095703125, + "learning_rate": 0.01225676766038459, + "loss": 0.7925, + "num_input_tokens_seen": 46985056, + "step": 80930 + }, + { + "epoch": 12.054661900506405, + "grad_norm": 0.03173828125, + "learning_rate": 0.012254850911893232, + "loss": 0.7761, + "num_input_tokens_seen": 46987968, + "step": 80935 + }, + { + "epoch": 12.055406613047364, + "grad_norm": 0.06005859375, + "learning_rate": 0.012252934209778368, + "loss": 0.8186, + "num_input_tokens_seen": 46990752, + "step": 80940 + }, + { + "epoch": 12.056151325588322, + "grad_norm": 0.04296875, + "learning_rate": 0.012251017554072397, + "loss": 0.7957, + "num_input_tokens_seen": 46993888, + "step": 80945 + }, + { + "epoch": 12.056896038129283, + "grad_norm": 0.048583984375, + "learning_rate": 0.012249100944807682, + "loss": 0.781, + "num_input_tokens_seen": 46996736, + "step": 80950 + }, + { + "epoch": 12.057640750670242, + "grad_norm": 0.04833984375, + "learning_rate": 0.012247184382016617, + "loss": 0.7928, + "num_input_tokens_seen": 46999584, + "step": 80955 + }, + { + "epoch": 12.0583854632112, + "grad_norm": 0.034423828125, + "learning_rate": 0.012245267865731576, + "loss": 0.7961, + "num_input_tokens_seen": 47002208, + "step": 80960 + }, + { + "epoch": 12.05913017575216, + "grad_norm": 0.026611328125, + "learning_rate": 0.01224335139598493, + "loss": 0.8244, + "num_input_tokens_seen": 47005088, + "step": 80965 + }, + { + "epoch": 12.05987488829312, + "grad_norm": 0.043212890625, + "learning_rate": 0.012241434972809065, + "loss": 0.7643, + "num_input_tokens_seen": 47007872, + "step": 80970 + }, + { + "epoch": 12.060619600834078, + "grad_norm": 0.03466796875, + "learning_rate": 0.012239518596236345, + "loss": 0.8882, + "num_input_tokens_seen": 47010656, + "step": 80975 + }, + { + "epoch": 12.061364313375037, + "grad_norm": 0.053466796875, + "learning_rate": 0.012237602266299158, + "loss": 0.8059, + "num_input_tokens_seen": 47013536, + "step": 80980 + }, + { + "epoch": 12.062109025915996, + "grad_norm": 0.0299072265625, + "learning_rate": 0.012235685983029867, + "loss": 0.7781, + "num_input_tokens_seen": 47016576, + "step": 80985 + }, + { + "epoch": 12.062853738456955, + "grad_norm": 0.045166015625, + "learning_rate": 0.01223376974646086, + "loss": 0.8037, + "num_input_tokens_seen": 47019424, + "step": 80990 + }, + { + "epoch": 12.063598450997915, + "grad_norm": 0.045654296875, + "learning_rate": 0.012231853556624503, + "loss": 0.7979, + "num_input_tokens_seen": 47022528, + "step": 80995 + }, + { + "epoch": 12.064343163538874, + "grad_norm": 0.030517578125, + "learning_rate": 0.01222993741355316, + "loss": 0.7869, + "num_input_tokens_seen": 47025440, + "step": 81000 + }, + { + "epoch": 12.065087876079833, + "grad_norm": 0.044677734375, + "learning_rate": 0.012228021317279214, + "loss": 0.7909, + "num_input_tokens_seen": 47028288, + "step": 81005 + }, + { + "epoch": 12.065832588620792, + "grad_norm": 0.042724609375, + "learning_rate": 0.012226105267835023, + "loss": 0.7894, + "num_input_tokens_seen": 47031360, + "step": 81010 + }, + { + "epoch": 12.066577301161752, + "grad_norm": 0.060302734375, + "learning_rate": 0.01222418926525297, + "loss": 0.787, + "num_input_tokens_seen": 47034208, + "step": 81015 + }, + { + "epoch": 12.06732201370271, + "grad_norm": 0.04248046875, + "learning_rate": 0.012222273309565414, + "loss": 0.7907, + "num_input_tokens_seen": 47036864, + "step": 81020 + }, + { + "epoch": 12.06806672624367, + "grad_norm": 0.05908203125, + "learning_rate": 0.012220357400804733, + "loss": 0.8191, + "num_input_tokens_seen": 47039456, + "step": 81025 + }, + { + "epoch": 12.068811438784628, + "grad_norm": 0.05029296875, + "learning_rate": 0.012218441539003285, + "loss": 0.7963, + "num_input_tokens_seen": 47042528, + "step": 81030 + }, + { + "epoch": 12.069556151325589, + "grad_norm": 0.04833984375, + "learning_rate": 0.012216525724193444, + "loss": 0.8113, + "num_input_tokens_seen": 47045440, + "step": 81035 + }, + { + "epoch": 12.070300863866548, + "grad_norm": 0.0284423828125, + "learning_rate": 0.01221460995640757, + "loss": 0.7681, + "num_input_tokens_seen": 47048384, + "step": 81040 + }, + { + "epoch": 12.071045576407506, + "grad_norm": 0.049072265625, + "learning_rate": 0.012212694235678025, + "loss": 0.7888, + "num_input_tokens_seen": 47051328, + "step": 81045 + }, + { + "epoch": 12.071790288948465, + "grad_norm": 0.040771484375, + "learning_rate": 0.012210778562037184, + "loss": 0.8107, + "num_input_tokens_seen": 47054304, + "step": 81050 + }, + { + "epoch": 12.072535001489426, + "grad_norm": 0.0625, + "learning_rate": 0.012208862935517399, + "loss": 0.8303, + "num_input_tokens_seen": 47056960, + "step": 81055 + }, + { + "epoch": 12.073279714030384, + "grad_norm": 0.062255859375, + "learning_rate": 0.012206947356151044, + "loss": 0.792, + "num_input_tokens_seen": 47060224, + "step": 81060 + }, + { + "epoch": 12.074024426571343, + "grad_norm": 0.04248046875, + "learning_rate": 0.01220503182397047, + "loss": 0.7723, + "num_input_tokens_seen": 47063360, + "step": 81065 + }, + { + "epoch": 12.074769139112302, + "grad_norm": 0.0284423828125, + "learning_rate": 0.012203116339008047, + "loss": 0.7888, + "num_input_tokens_seen": 47066080, + "step": 81070 + }, + { + "epoch": 12.075513851653263, + "grad_norm": 0.059326171875, + "learning_rate": 0.012201200901296123, + "loss": 0.781, + "num_input_tokens_seen": 47068864, + "step": 81075 + }, + { + "epoch": 12.076258564194221, + "grad_norm": 0.0242919921875, + "learning_rate": 0.012199285510867073, + "loss": 0.7854, + "num_input_tokens_seen": 47071680, + "step": 81080 + }, + { + "epoch": 12.07700327673518, + "grad_norm": 0.054931640625, + "learning_rate": 0.012197370167753248, + "loss": 0.8144, + "num_input_tokens_seen": 47074688, + "step": 81085 + }, + { + "epoch": 12.077747989276139, + "grad_norm": 0.03173828125, + "learning_rate": 0.012195454871987, + "loss": 0.7836, + "num_input_tokens_seen": 47077760, + "step": 81090 + }, + { + "epoch": 12.0784927018171, + "grad_norm": 0.0458984375, + "learning_rate": 0.012193539623600697, + "loss": 0.7949, + "num_input_tokens_seen": 47080864, + "step": 81095 + }, + { + "epoch": 12.079237414358058, + "grad_norm": 0.0233154296875, + "learning_rate": 0.012191624422626684, + "loss": 0.7934, + "num_input_tokens_seen": 47083744, + "step": 81100 + }, + { + "epoch": 12.079982126899017, + "grad_norm": 0.08935546875, + "learning_rate": 0.01218970926909733, + "loss": 0.8304, + "num_input_tokens_seen": 47086720, + "step": 81105 + }, + { + "epoch": 12.080726839439976, + "grad_norm": 0.0380859375, + "learning_rate": 0.01218779416304497, + "loss": 0.7997, + "num_input_tokens_seen": 47089536, + "step": 81110 + }, + { + "epoch": 12.081471551980936, + "grad_norm": 0.047607421875, + "learning_rate": 0.01218587910450198, + "loss": 0.7894, + "num_input_tokens_seen": 47092288, + "step": 81115 + }, + { + "epoch": 12.082216264521895, + "grad_norm": 0.030029296875, + "learning_rate": 0.0121839640935007, + "loss": 0.7828, + "num_input_tokens_seen": 47094976, + "step": 81120 + }, + { + "epoch": 12.082960977062854, + "grad_norm": 0.056884765625, + "learning_rate": 0.01218204913007348, + "loss": 0.8196, + "num_input_tokens_seen": 47098016, + "step": 81125 + }, + { + "epoch": 12.083705689603812, + "grad_norm": 0.032958984375, + "learning_rate": 0.01218013421425268, + "loss": 0.7859, + "num_input_tokens_seen": 47100832, + "step": 81130 + }, + { + "epoch": 12.084450402144773, + "grad_norm": 0.035888671875, + "learning_rate": 0.012178219346070645, + "loss": 0.8299, + "num_input_tokens_seen": 47103744, + "step": 81135 + }, + { + "epoch": 12.085195114685732, + "grad_norm": 0.027099609375, + "learning_rate": 0.01217630452555973, + "loss": 0.7934, + "num_input_tokens_seen": 47106560, + "step": 81140 + }, + { + "epoch": 12.08593982722669, + "grad_norm": 0.037841796875, + "learning_rate": 0.012174389752752273, + "loss": 0.794, + "num_input_tokens_seen": 47109568, + "step": 81145 + }, + { + "epoch": 12.08668453976765, + "grad_norm": 0.033203125, + "learning_rate": 0.012172475027680636, + "loss": 0.7947, + "num_input_tokens_seen": 47112288, + "step": 81150 + }, + { + "epoch": 12.08742925230861, + "grad_norm": 0.0390625, + "learning_rate": 0.012170560350377153, + "loss": 0.7746, + "num_input_tokens_seen": 47115424, + "step": 81155 + }, + { + "epoch": 12.088173964849569, + "grad_norm": 0.0517578125, + "learning_rate": 0.012168645720874186, + "loss": 0.8133, + "num_input_tokens_seen": 47118336, + "step": 81160 + }, + { + "epoch": 12.088918677390527, + "grad_norm": 0.033447265625, + "learning_rate": 0.012166731139204065, + "loss": 0.8001, + "num_input_tokens_seen": 47121248, + "step": 81165 + }, + { + "epoch": 12.089663389931486, + "grad_norm": 0.0306396484375, + "learning_rate": 0.012164816605399142, + "loss": 0.7999, + "num_input_tokens_seen": 47124128, + "step": 81170 + }, + { + "epoch": 12.090408102472445, + "grad_norm": 0.03955078125, + "learning_rate": 0.012162902119491765, + "loss": 0.7903, + "num_input_tokens_seen": 47127008, + "step": 81175 + }, + { + "epoch": 12.091152815013405, + "grad_norm": 0.05029296875, + "learning_rate": 0.012160987681514268, + "loss": 0.7892, + "num_input_tokens_seen": 47129504, + "step": 81180 + }, + { + "epoch": 12.091897527554364, + "grad_norm": 0.036376953125, + "learning_rate": 0.012159073291499003, + "loss": 0.7689, + "num_input_tokens_seen": 47132352, + "step": 81185 + }, + { + "epoch": 12.092642240095323, + "grad_norm": 0.04150390625, + "learning_rate": 0.012157158949478306, + "loss": 0.7832, + "num_input_tokens_seen": 47135200, + "step": 81190 + }, + { + "epoch": 12.093386952636282, + "grad_norm": 0.0262451171875, + "learning_rate": 0.012155244655484522, + "loss": 0.7827, + "num_input_tokens_seen": 47137888, + "step": 81195 + }, + { + "epoch": 12.094131665177242, + "grad_norm": 0.05419921875, + "learning_rate": 0.012153330409549986, + "loss": 0.8128, + "num_input_tokens_seen": 47140960, + "step": 81200 + }, + { + "epoch": 12.094876377718201, + "grad_norm": 0.0478515625, + "learning_rate": 0.012151416211707036, + "loss": 0.8018, + "num_input_tokens_seen": 47143904, + "step": 81205 + }, + { + "epoch": 12.09562109025916, + "grad_norm": 0.039306640625, + "learning_rate": 0.012149502061988017, + "loss": 0.7991, + "num_input_tokens_seen": 47147008, + "step": 81210 + }, + { + "epoch": 12.096365802800118, + "grad_norm": 0.0260009765625, + "learning_rate": 0.012147587960425262, + "loss": 0.8109, + "num_input_tokens_seen": 47149504, + "step": 81215 + }, + { + "epoch": 12.097110515341079, + "grad_norm": 0.036376953125, + "learning_rate": 0.01214567390705111, + "loss": 0.8141, + "num_input_tokens_seen": 47152544, + "step": 81220 + }, + { + "epoch": 12.097855227882038, + "grad_norm": 0.0322265625, + "learning_rate": 0.012143759901897897, + "loss": 0.8121, + "num_input_tokens_seen": 47155680, + "step": 81225 + }, + { + "epoch": 12.098599940422996, + "grad_norm": 0.033935546875, + "learning_rate": 0.01214184594499796, + "loss": 0.772, + "num_input_tokens_seen": 47158784, + "step": 81230 + }, + { + "epoch": 12.099344652963955, + "grad_norm": 0.060546875, + "learning_rate": 0.012139932036383623, + "loss": 0.7696, + "num_input_tokens_seen": 47161536, + "step": 81235 + }, + { + "epoch": 12.100089365504916, + "grad_norm": 0.04541015625, + "learning_rate": 0.012138018176087237, + "loss": 0.7793, + "num_input_tokens_seen": 47164448, + "step": 81240 + }, + { + "epoch": 12.100834078045875, + "grad_norm": 0.03857421875, + "learning_rate": 0.012136104364141123, + "loss": 0.7787, + "num_input_tokens_seen": 47167424, + "step": 81245 + }, + { + "epoch": 12.101578790586833, + "grad_norm": 0.0341796875, + "learning_rate": 0.012134190600577608, + "loss": 0.7797, + "num_input_tokens_seen": 47170336, + "step": 81250 + }, + { + "epoch": 12.102323503127792, + "grad_norm": 0.0439453125, + "learning_rate": 0.012132276885429037, + "loss": 0.8253, + "num_input_tokens_seen": 47173152, + "step": 81255 + }, + { + "epoch": 12.103068215668753, + "grad_norm": 0.0361328125, + "learning_rate": 0.01213036321872773, + "loss": 0.7788, + "num_input_tokens_seen": 47176192, + "step": 81260 + }, + { + "epoch": 12.103812928209711, + "grad_norm": 0.046630859375, + "learning_rate": 0.012128449600506025, + "loss": 0.7765, + "num_input_tokens_seen": 47179104, + "step": 81265 + }, + { + "epoch": 12.10455764075067, + "grad_norm": 0.023193359375, + "learning_rate": 0.012126536030796236, + "loss": 0.7731, + "num_input_tokens_seen": 47181792, + "step": 81270 + }, + { + "epoch": 12.105302353291629, + "grad_norm": 0.064453125, + "learning_rate": 0.01212462250963071, + "loss": 0.7731, + "num_input_tokens_seen": 47184640, + "step": 81275 + }, + { + "epoch": 12.10604706583259, + "grad_norm": 0.0537109375, + "learning_rate": 0.012122709037041763, + "loss": 0.8313, + "num_input_tokens_seen": 47187936, + "step": 81280 + }, + { + "epoch": 12.106791778373548, + "grad_norm": 0.044921875, + "learning_rate": 0.012120795613061714, + "loss": 0.8088, + "num_input_tokens_seen": 47191200, + "step": 81285 + }, + { + "epoch": 12.107536490914507, + "grad_norm": 0.033203125, + "learning_rate": 0.012118882237722905, + "loss": 0.7894, + "num_input_tokens_seen": 47194112, + "step": 81290 + }, + { + "epoch": 12.108281203455466, + "grad_norm": 0.11083984375, + "learning_rate": 0.01211696891105765, + "loss": 0.8531, + "num_input_tokens_seen": 47197248, + "step": 81295 + }, + { + "epoch": 12.109025915996426, + "grad_norm": 0.05859375, + "learning_rate": 0.012115055633098278, + "loss": 0.8113, + "num_input_tokens_seen": 47199968, + "step": 81300 + }, + { + "epoch": 12.109770628537385, + "grad_norm": 0.043212890625, + "learning_rate": 0.012113142403877102, + "loss": 0.8156, + "num_input_tokens_seen": 47203136, + "step": 81305 + }, + { + "epoch": 12.110515341078344, + "grad_norm": 0.05224609375, + "learning_rate": 0.012111229223426456, + "loss": 0.8027, + "num_input_tokens_seen": 47206176, + "step": 81310 + }, + { + "epoch": 12.111260053619302, + "grad_norm": 0.0361328125, + "learning_rate": 0.01210931609177865, + "loss": 0.7996, + "num_input_tokens_seen": 47209088, + "step": 81315 + }, + { + "epoch": 12.112004766160263, + "grad_norm": 0.03955078125, + "learning_rate": 0.012107403008966022, + "loss": 0.7975, + "num_input_tokens_seen": 47211936, + "step": 81320 + }, + { + "epoch": 12.112749478701222, + "grad_norm": 0.046875, + "learning_rate": 0.012105489975020875, + "loss": 0.8054, + "num_input_tokens_seen": 47215072, + "step": 81325 + }, + { + "epoch": 12.11349419124218, + "grad_norm": 0.045166015625, + "learning_rate": 0.01210357698997553, + "loss": 0.8015, + "num_input_tokens_seen": 47217920, + "step": 81330 + }, + { + "epoch": 12.11423890378314, + "grad_norm": 0.0478515625, + "learning_rate": 0.012101664053862314, + "loss": 0.7892, + "num_input_tokens_seen": 47220672, + "step": 81335 + }, + { + "epoch": 12.114983616324098, + "grad_norm": 0.040283203125, + "learning_rate": 0.01209975116671353, + "loss": 0.7935, + "num_input_tokens_seen": 47223584, + "step": 81340 + }, + { + "epoch": 12.115728328865059, + "grad_norm": 0.053466796875, + "learning_rate": 0.01209783832856151, + "loss": 0.7914, + "num_input_tokens_seen": 47226528, + "step": 81345 + }, + { + "epoch": 12.116473041406017, + "grad_norm": 0.06103515625, + "learning_rate": 0.012095925539438553, + "loss": 0.8445, + "num_input_tokens_seen": 47229472, + "step": 81350 + }, + { + "epoch": 12.117217753946976, + "grad_norm": 0.054931640625, + "learning_rate": 0.01209401279937699, + "loss": 0.7967, + "num_input_tokens_seen": 47232544, + "step": 81355 + }, + { + "epoch": 12.117962466487935, + "grad_norm": 0.0267333984375, + "learning_rate": 0.012092100108409127, + "loss": 0.8093, + "num_input_tokens_seen": 47235488, + "step": 81360 + }, + { + "epoch": 12.118707179028895, + "grad_norm": 0.042724609375, + "learning_rate": 0.012090187466567275, + "loss": 0.8229, + "num_input_tokens_seen": 47238400, + "step": 81365 + }, + { + "epoch": 12.119451891569854, + "grad_norm": 0.06298828125, + "learning_rate": 0.012088274873883751, + "loss": 0.7993, + "num_input_tokens_seen": 47241152, + "step": 81370 + }, + { + "epoch": 12.120196604110813, + "grad_norm": 0.03955078125, + "learning_rate": 0.012086362330390859, + "loss": 0.8002, + "num_input_tokens_seen": 47244000, + "step": 81375 + }, + { + "epoch": 12.120941316651772, + "grad_norm": 0.0380859375, + "learning_rate": 0.012084449836120916, + "loss": 0.8128, + "num_input_tokens_seen": 47246720, + "step": 81380 + }, + { + "epoch": 12.121686029192732, + "grad_norm": 0.041015625, + "learning_rate": 0.012082537391106228, + "loss": 0.7921, + "num_input_tokens_seen": 47249472, + "step": 81385 + }, + { + "epoch": 12.122430741733691, + "grad_norm": 0.039306640625, + "learning_rate": 0.012080624995379109, + "loss": 0.7993, + "num_input_tokens_seen": 47252768, + "step": 81390 + }, + { + "epoch": 12.12317545427465, + "grad_norm": 0.038330078125, + "learning_rate": 0.01207871264897186, + "loss": 0.808, + "num_input_tokens_seen": 47255872, + "step": 81395 + }, + { + "epoch": 12.123920166815608, + "grad_norm": 0.052490234375, + "learning_rate": 0.012076800351916798, + "loss": 0.8, + "num_input_tokens_seen": 47258816, + "step": 81400 + }, + { + "epoch": 12.124664879356569, + "grad_norm": 0.05126953125, + "learning_rate": 0.012074888104246223, + "loss": 0.8057, + "num_input_tokens_seen": 47262048, + "step": 81405 + }, + { + "epoch": 12.125409591897528, + "grad_norm": 0.0279541015625, + "learning_rate": 0.012072975905992432, + "loss": 0.7955, + "num_input_tokens_seen": 47264896, + "step": 81410 + }, + { + "epoch": 12.126154304438487, + "grad_norm": 0.0546875, + "learning_rate": 0.012071063757187745, + "loss": 0.7836, + "num_input_tokens_seen": 47267776, + "step": 81415 + }, + { + "epoch": 12.126899016979445, + "grad_norm": 0.06787109375, + "learning_rate": 0.012069151657864452, + "loss": 0.8157, + "num_input_tokens_seen": 47270912, + "step": 81420 + }, + { + "epoch": 12.127643729520406, + "grad_norm": 0.04541015625, + "learning_rate": 0.01206723960805487, + "loss": 0.806, + "num_input_tokens_seen": 47273888, + "step": 81425 + }, + { + "epoch": 12.128388442061365, + "grad_norm": 0.056396484375, + "learning_rate": 0.012065327607791292, + "loss": 0.7833, + "num_input_tokens_seen": 47276672, + "step": 81430 + }, + { + "epoch": 12.129133154602323, + "grad_norm": 0.06298828125, + "learning_rate": 0.012063415657106025, + "loss": 0.7826, + "num_input_tokens_seen": 47279424, + "step": 81435 + }, + { + "epoch": 12.129877867143282, + "grad_norm": 0.041259765625, + "learning_rate": 0.012061503756031362, + "loss": 0.79, + "num_input_tokens_seen": 47282432, + "step": 81440 + }, + { + "epoch": 12.130622579684243, + "grad_norm": 0.036865234375, + "learning_rate": 0.012059591904599605, + "loss": 0.7952, + "num_input_tokens_seen": 47285472, + "step": 81445 + }, + { + "epoch": 12.131367292225201, + "grad_norm": 0.05322265625, + "learning_rate": 0.012057680102843056, + "loss": 0.8012, + "num_input_tokens_seen": 47288224, + "step": 81450 + }, + { + "epoch": 12.13211200476616, + "grad_norm": 0.03271484375, + "learning_rate": 0.012055768350794008, + "loss": 0.7645, + "num_input_tokens_seen": 47290976, + "step": 81455 + }, + { + "epoch": 12.132856717307119, + "grad_norm": 0.033447265625, + "learning_rate": 0.012053856648484766, + "loss": 0.7992, + "num_input_tokens_seen": 47294048, + "step": 81460 + }, + { + "epoch": 12.13360142984808, + "grad_norm": 0.02294921875, + "learning_rate": 0.01205194499594762, + "loss": 0.7834, + "num_input_tokens_seen": 47296800, + "step": 81465 + }, + { + "epoch": 12.134346142389038, + "grad_norm": 0.03564453125, + "learning_rate": 0.01205003339321487, + "loss": 0.7825, + "num_input_tokens_seen": 47299648, + "step": 81470 + }, + { + "epoch": 12.135090854929997, + "grad_norm": 0.030029296875, + "learning_rate": 0.0120481218403188, + "loss": 0.8042, + "num_input_tokens_seen": 47302528, + "step": 81475 + }, + { + "epoch": 12.135835567470956, + "grad_norm": 0.0478515625, + "learning_rate": 0.012046210337291719, + "loss": 0.7883, + "num_input_tokens_seen": 47305280, + "step": 81480 + }, + { + "epoch": 12.136580280011916, + "grad_norm": 0.04736328125, + "learning_rate": 0.012044298884165913, + "loss": 0.8055, + "num_input_tokens_seen": 47308032, + "step": 81485 + }, + { + "epoch": 12.137324992552875, + "grad_norm": 0.042236328125, + "learning_rate": 0.012042387480973667, + "loss": 0.7934, + "num_input_tokens_seen": 47310944, + "step": 81490 + }, + { + "epoch": 12.138069705093834, + "grad_norm": 0.034423828125, + "learning_rate": 0.012040476127747281, + "loss": 0.7894, + "num_input_tokens_seen": 47314336, + "step": 81495 + }, + { + "epoch": 12.138814417634793, + "grad_norm": 0.033447265625, + "learning_rate": 0.012038564824519041, + "loss": 0.7725, + "num_input_tokens_seen": 47317312, + "step": 81500 + }, + { + "epoch": 12.139559130175751, + "grad_norm": 0.037841796875, + "learning_rate": 0.012036653571321245, + "loss": 0.7966, + "num_input_tokens_seen": 47320480, + "step": 81505 + }, + { + "epoch": 12.140303842716712, + "grad_norm": 0.053466796875, + "learning_rate": 0.012034742368186165, + "loss": 0.8289, + "num_input_tokens_seen": 47323648, + "step": 81510 + }, + { + "epoch": 12.14104855525767, + "grad_norm": 0.038330078125, + "learning_rate": 0.012032831215146107, + "loss": 0.7883, + "num_input_tokens_seen": 47326688, + "step": 81515 + }, + { + "epoch": 12.14179326779863, + "grad_norm": 0.05419921875, + "learning_rate": 0.01203092011223335, + "loss": 0.7952, + "num_input_tokens_seen": 47329568, + "step": 81520 + }, + { + "epoch": 12.142537980339588, + "grad_norm": 0.0361328125, + "learning_rate": 0.012029009059480176, + "loss": 0.801, + "num_input_tokens_seen": 47332352, + "step": 81525 + }, + { + "epoch": 12.143282692880549, + "grad_norm": 0.042724609375, + "learning_rate": 0.01202709805691888, + "loss": 0.8077, + "num_input_tokens_seen": 47335200, + "step": 81530 + }, + { + "epoch": 12.144027405421507, + "grad_norm": 0.046142578125, + "learning_rate": 0.012025187104581732, + "loss": 0.7749, + "num_input_tokens_seen": 47337984, + "step": 81535 + }, + { + "epoch": 12.144772117962466, + "grad_norm": 0.025146484375, + "learning_rate": 0.012023276202501031, + "loss": 0.8012, + "num_input_tokens_seen": 47340864, + "step": 81540 + }, + { + "epoch": 12.145516830503425, + "grad_norm": 0.044921875, + "learning_rate": 0.012021365350709048, + "loss": 0.7872, + "num_input_tokens_seen": 47343584, + "step": 81545 + }, + { + "epoch": 12.146261543044385, + "grad_norm": 0.0238037109375, + "learning_rate": 0.012019454549238074, + "loss": 0.8037, + "num_input_tokens_seen": 47346368, + "step": 81550 + }, + { + "epoch": 12.147006255585344, + "grad_norm": 0.07275390625, + "learning_rate": 0.012017543798120386, + "loss": 0.8045, + "num_input_tokens_seen": 47349120, + "step": 81555 + }, + { + "epoch": 12.147750968126303, + "grad_norm": 0.033447265625, + "learning_rate": 0.012015633097388265, + "loss": 0.8126, + "num_input_tokens_seen": 47352032, + "step": 81560 + }, + { + "epoch": 12.148495680667262, + "grad_norm": 0.048583984375, + "learning_rate": 0.012013722447073992, + "loss": 0.7871, + "num_input_tokens_seen": 47354688, + "step": 81565 + }, + { + "epoch": 12.149240393208222, + "grad_norm": 0.033935546875, + "learning_rate": 0.012011811847209834, + "loss": 0.8152, + "num_input_tokens_seen": 47357568, + "step": 81570 + }, + { + "epoch": 12.149985105749181, + "grad_norm": 0.042236328125, + "learning_rate": 0.012009901297828087, + "loss": 0.7913, + "num_input_tokens_seen": 47360960, + "step": 81575 + }, + { + "epoch": 12.15072981829014, + "grad_norm": 0.0439453125, + "learning_rate": 0.01200799079896101, + "loss": 0.8117, + "num_input_tokens_seen": 47363712, + "step": 81580 + }, + { + "epoch": 12.151474530831099, + "grad_norm": 0.050537109375, + "learning_rate": 0.012006080350640894, + "loss": 0.7828, + "num_input_tokens_seen": 47366432, + "step": 81585 + }, + { + "epoch": 12.152219243372059, + "grad_norm": 0.0205078125, + "learning_rate": 0.012004169952900006, + "loss": 0.8153, + "num_input_tokens_seen": 47369088, + "step": 81590 + }, + { + "epoch": 12.152963955913018, + "grad_norm": 0.033447265625, + "learning_rate": 0.012002259605770624, + "loss": 0.7923, + "num_input_tokens_seen": 47371936, + "step": 81595 + }, + { + "epoch": 12.153708668453977, + "grad_norm": 0.052490234375, + "learning_rate": 0.012000349309285018, + "loss": 0.784, + "num_input_tokens_seen": 47374752, + "step": 81600 + }, + { + "epoch": 12.154453380994935, + "grad_norm": 0.022216796875, + "learning_rate": 0.011998439063475457, + "loss": 0.7952, + "num_input_tokens_seen": 47377504, + "step": 81605 + }, + { + "epoch": 12.155198093535896, + "grad_norm": 0.033935546875, + "learning_rate": 0.011996528868374224, + "loss": 0.7985, + "num_input_tokens_seen": 47380384, + "step": 81610 + }, + { + "epoch": 12.155942806076855, + "grad_norm": 0.0272216796875, + "learning_rate": 0.011994618724013577, + "loss": 0.7923, + "num_input_tokens_seen": 47383456, + "step": 81615 + }, + { + "epoch": 12.156687518617813, + "grad_norm": 0.0478515625, + "learning_rate": 0.011992708630425796, + "loss": 0.7784, + "num_input_tokens_seen": 47386464, + "step": 81620 + }, + { + "epoch": 12.157432231158772, + "grad_norm": 0.03759765625, + "learning_rate": 0.011990798587643145, + "loss": 0.7831, + "num_input_tokens_seen": 47389504, + "step": 81625 + }, + { + "epoch": 12.158176943699733, + "grad_norm": 0.03125, + "learning_rate": 0.011988888595697899, + "loss": 0.7766, + "num_input_tokens_seen": 47392192, + "step": 81630 + }, + { + "epoch": 12.158921656240691, + "grad_norm": 0.048095703125, + "learning_rate": 0.01198697865462231, + "loss": 0.777, + "num_input_tokens_seen": 47395168, + "step": 81635 + }, + { + "epoch": 12.15966636878165, + "grad_norm": 0.028564453125, + "learning_rate": 0.01198506876444866, + "loss": 0.7566, + "num_input_tokens_seen": 47398208, + "step": 81640 + }, + { + "epoch": 12.160411081322609, + "grad_norm": 0.057861328125, + "learning_rate": 0.011983158925209212, + "loss": 0.781, + "num_input_tokens_seen": 47401216, + "step": 81645 + }, + { + "epoch": 12.16115579386357, + "grad_norm": 0.03759765625, + "learning_rate": 0.011981249136936219, + "loss": 0.778, + "num_input_tokens_seen": 47404320, + "step": 81650 + }, + { + "epoch": 12.161900506404528, + "grad_norm": 0.06298828125, + "learning_rate": 0.01197933939966196, + "loss": 0.79, + "num_input_tokens_seen": 47407296, + "step": 81655 + }, + { + "epoch": 12.162645218945487, + "grad_norm": 0.04931640625, + "learning_rate": 0.011977429713418692, + "loss": 0.8286, + "num_input_tokens_seen": 47410048, + "step": 81660 + }, + { + "epoch": 12.163389931486446, + "grad_norm": 0.046875, + "learning_rate": 0.011975520078238676, + "loss": 0.834, + "num_input_tokens_seen": 47412896, + "step": 81665 + }, + { + "epoch": 12.164134644027406, + "grad_norm": 0.02880859375, + "learning_rate": 0.01197361049415417, + "loss": 0.7772, + "num_input_tokens_seen": 47415840, + "step": 81670 + }, + { + "epoch": 12.164879356568365, + "grad_norm": 0.0260009765625, + "learning_rate": 0.011971700961197445, + "loss": 0.7893, + "num_input_tokens_seen": 47418528, + "step": 81675 + }, + { + "epoch": 12.165624069109324, + "grad_norm": 0.033935546875, + "learning_rate": 0.011969791479400747, + "loss": 0.7985, + "num_input_tokens_seen": 47421376, + "step": 81680 + }, + { + "epoch": 12.166368781650283, + "grad_norm": 0.055908203125, + "learning_rate": 0.01196788204879635, + "loss": 0.8065, + "num_input_tokens_seen": 47424768, + "step": 81685 + }, + { + "epoch": 12.167113494191241, + "grad_norm": 0.052734375, + "learning_rate": 0.011965972669416505, + "loss": 0.8034, + "num_input_tokens_seen": 47427744, + "step": 81690 + }, + { + "epoch": 12.167858206732202, + "grad_norm": 0.0303955078125, + "learning_rate": 0.011964063341293461, + "loss": 0.8647, + "num_input_tokens_seen": 47430464, + "step": 81695 + }, + { + "epoch": 12.16860291927316, + "grad_norm": 0.049072265625, + "learning_rate": 0.011962154064459487, + "loss": 0.7838, + "num_input_tokens_seen": 47433184, + "step": 81700 + }, + { + "epoch": 12.16934763181412, + "grad_norm": 0.046142578125, + "learning_rate": 0.011960244838946829, + "loss": 0.8094, + "num_input_tokens_seen": 47435968, + "step": 81705 + }, + { + "epoch": 12.170092344355078, + "grad_norm": 0.04931640625, + "learning_rate": 0.011958335664787746, + "loss": 0.842, + "num_input_tokens_seen": 47438880, + "step": 81710 + }, + { + "epoch": 12.170837056896039, + "grad_norm": 0.05224609375, + "learning_rate": 0.011956426542014488, + "loss": 0.8036, + "num_input_tokens_seen": 47441824, + "step": 81715 + }, + { + "epoch": 12.171581769436997, + "grad_norm": 0.0267333984375, + "learning_rate": 0.011954517470659315, + "loss": 0.7945, + "num_input_tokens_seen": 47444672, + "step": 81720 + }, + { + "epoch": 12.172326481977956, + "grad_norm": 0.0242919921875, + "learning_rate": 0.011952608450754474, + "loss": 0.8048, + "num_input_tokens_seen": 47447360, + "step": 81725 + }, + { + "epoch": 12.173071194518915, + "grad_norm": 0.041748046875, + "learning_rate": 0.011950699482332213, + "loss": 0.7878, + "num_input_tokens_seen": 47450176, + "step": 81730 + }, + { + "epoch": 12.173815907059875, + "grad_norm": 0.0400390625, + "learning_rate": 0.011948790565424788, + "loss": 0.8386, + "num_input_tokens_seen": 47453088, + "step": 81735 + }, + { + "epoch": 12.174560619600834, + "grad_norm": 0.0458984375, + "learning_rate": 0.011946881700064438, + "loss": 0.8075, + "num_input_tokens_seen": 47455936, + "step": 81740 + }, + { + "epoch": 12.175305332141793, + "grad_norm": 0.037109375, + "learning_rate": 0.011944972886283428, + "loss": 0.7768, + "num_input_tokens_seen": 47458880, + "step": 81745 + }, + { + "epoch": 12.176050044682752, + "grad_norm": 0.037109375, + "learning_rate": 0.011943064124113988, + "loss": 0.7853, + "num_input_tokens_seen": 47462112, + "step": 81750 + }, + { + "epoch": 12.176794757223712, + "grad_norm": 0.0189208984375, + "learning_rate": 0.011941155413588379, + "loss": 0.803, + "num_input_tokens_seen": 47464832, + "step": 81755 + }, + { + "epoch": 12.177539469764671, + "grad_norm": 0.021240234375, + "learning_rate": 0.011939246754738838, + "loss": 0.8019, + "num_input_tokens_seen": 47467808, + "step": 81760 + }, + { + "epoch": 12.17828418230563, + "grad_norm": 0.037353515625, + "learning_rate": 0.011937338147597613, + "loss": 0.7961, + "num_input_tokens_seen": 47470752, + "step": 81765 + }, + { + "epoch": 12.179028894846589, + "grad_norm": 0.031494140625, + "learning_rate": 0.01193542959219695, + "loss": 0.7987, + "num_input_tokens_seen": 47473696, + "step": 81770 + }, + { + "epoch": 12.179773607387549, + "grad_norm": 0.0478515625, + "learning_rate": 0.011933521088569082, + "loss": 0.7979, + "num_input_tokens_seen": 47476864, + "step": 81775 + }, + { + "epoch": 12.180518319928508, + "grad_norm": 0.037353515625, + "learning_rate": 0.011931612636746265, + "loss": 0.8211, + "num_input_tokens_seen": 47479744, + "step": 81780 + }, + { + "epoch": 12.181263032469467, + "grad_norm": 0.037109375, + "learning_rate": 0.011929704236760727, + "loss": 0.806, + "num_input_tokens_seen": 47482464, + "step": 81785 + }, + { + "epoch": 12.182007745010425, + "grad_norm": 0.044921875, + "learning_rate": 0.011927795888644722, + "loss": 0.8041, + "num_input_tokens_seen": 47485408, + "step": 81790 + }, + { + "epoch": 12.182752457551386, + "grad_norm": 0.064453125, + "learning_rate": 0.011925887592430481, + "loss": 0.7706, + "num_input_tokens_seen": 47488192, + "step": 81795 + }, + { + "epoch": 12.183497170092345, + "grad_norm": 0.0303955078125, + "learning_rate": 0.011923979348150246, + "loss": 0.8079, + "num_input_tokens_seen": 47491040, + "step": 81800 + }, + { + "epoch": 12.184241882633303, + "grad_norm": 0.044189453125, + "learning_rate": 0.011922071155836255, + "loss": 0.7954, + "num_input_tokens_seen": 47493856, + "step": 81805 + }, + { + "epoch": 12.184986595174262, + "grad_norm": 0.0174560546875, + "learning_rate": 0.011920163015520737, + "loss": 0.7937, + "num_input_tokens_seen": 47496608, + "step": 81810 + }, + { + "epoch": 12.185731307715223, + "grad_norm": 0.054931640625, + "learning_rate": 0.01191825492723594, + "loss": 0.7916, + "num_input_tokens_seen": 47499616, + "step": 81815 + }, + { + "epoch": 12.186476020256181, + "grad_norm": 0.044189453125, + "learning_rate": 0.011916346891014092, + "loss": 0.803, + "num_input_tokens_seen": 47502528, + "step": 81820 + }, + { + "epoch": 12.18722073279714, + "grad_norm": 0.038818359375, + "learning_rate": 0.01191443890688743, + "loss": 0.7913, + "num_input_tokens_seen": 47505376, + "step": 81825 + }, + { + "epoch": 12.187965445338099, + "grad_norm": 0.0302734375, + "learning_rate": 0.011912530974888185, + "loss": 0.7952, + "num_input_tokens_seen": 47508448, + "step": 81830 + }, + { + "epoch": 12.18871015787906, + "grad_norm": 0.0211181640625, + "learning_rate": 0.011910623095048594, + "loss": 0.8057, + "num_input_tokens_seen": 47511296, + "step": 81835 + }, + { + "epoch": 12.189454870420018, + "grad_norm": 0.030517578125, + "learning_rate": 0.01190871526740088, + "loss": 0.8099, + "num_input_tokens_seen": 47514176, + "step": 81840 + }, + { + "epoch": 12.190199582960977, + "grad_norm": 0.0537109375, + "learning_rate": 0.011906807491977285, + "loss": 0.7919, + "num_input_tokens_seen": 47517216, + "step": 81845 + }, + { + "epoch": 12.190944295501936, + "grad_norm": 0.04736328125, + "learning_rate": 0.011904899768810037, + "loss": 0.8121, + "num_input_tokens_seen": 47520256, + "step": 81850 + }, + { + "epoch": 12.191689008042895, + "grad_norm": 0.1298828125, + "learning_rate": 0.011902992097931355, + "loss": 0.8362, + "num_input_tokens_seen": 47523008, + "step": 81855 + }, + { + "epoch": 12.192433720583855, + "grad_norm": 0.0260009765625, + "learning_rate": 0.01190108447937348, + "loss": 0.7913, + "num_input_tokens_seen": 47526048, + "step": 81860 + }, + { + "epoch": 12.193178433124814, + "grad_norm": 0.031982421875, + "learning_rate": 0.011899176913168625, + "loss": 0.8015, + "num_input_tokens_seen": 47528928, + "step": 81865 + }, + { + "epoch": 12.193923145665773, + "grad_norm": 0.047119140625, + "learning_rate": 0.01189726939934903, + "loss": 0.7895, + "num_input_tokens_seen": 47531904, + "step": 81870 + }, + { + "epoch": 12.194667858206731, + "grad_norm": 0.026611328125, + "learning_rate": 0.01189536193794691, + "loss": 0.8043, + "num_input_tokens_seen": 47534848, + "step": 81875 + }, + { + "epoch": 12.195412570747692, + "grad_norm": 0.029296875, + "learning_rate": 0.0118934545289945, + "loss": 0.8015, + "num_input_tokens_seen": 47537824, + "step": 81880 + }, + { + "epoch": 12.19615728328865, + "grad_norm": 0.0250244140625, + "learning_rate": 0.011891547172524022, + "loss": 0.8006, + "num_input_tokens_seen": 47540768, + "step": 81885 + }, + { + "epoch": 12.19690199582961, + "grad_norm": 0.031982421875, + "learning_rate": 0.011889639868567687, + "loss": 0.7803, + "num_input_tokens_seen": 47543968, + "step": 81890 + }, + { + "epoch": 12.197646708370568, + "grad_norm": 0.03369140625, + "learning_rate": 0.011887732617157733, + "loss": 0.7747, + "num_input_tokens_seen": 47546592, + "step": 81895 + }, + { + "epoch": 12.198391420911529, + "grad_norm": 0.031982421875, + "learning_rate": 0.011885825418326362, + "loss": 0.7988, + "num_input_tokens_seen": 47549728, + "step": 81900 + }, + { + "epoch": 12.199136133452487, + "grad_norm": 0.052001953125, + "learning_rate": 0.011883918272105814, + "loss": 0.7961, + "num_input_tokens_seen": 47554112, + "step": 81905 + }, + { + "epoch": 12.199880845993446, + "grad_norm": 0.07080078125, + "learning_rate": 0.011882011178528294, + "loss": 0.7871, + "num_input_tokens_seen": 47556992, + "step": 81910 + }, + { + "epoch": 12.200625558534405, + "grad_norm": 0.03369140625, + "learning_rate": 0.011880104137626033, + "loss": 0.8035, + "num_input_tokens_seen": 47559808, + "step": 81915 + }, + { + "epoch": 12.201370271075366, + "grad_norm": 0.036865234375, + "learning_rate": 0.011878197149431237, + "loss": 0.7957, + "num_input_tokens_seen": 47562976, + "step": 81920 + }, + { + "epoch": 12.202114983616324, + "grad_norm": 0.043701171875, + "learning_rate": 0.011876290213976131, + "loss": 0.7871, + "num_input_tokens_seen": 47566016, + "step": 81925 + }, + { + "epoch": 12.202859696157283, + "grad_norm": 0.046875, + "learning_rate": 0.01187438333129293, + "loss": 0.7855, + "num_input_tokens_seen": 47569088, + "step": 81930 + }, + { + "epoch": 12.203604408698242, + "grad_norm": 0.042236328125, + "learning_rate": 0.011872476501413834, + "loss": 0.7999, + "num_input_tokens_seen": 47572000, + "step": 81935 + }, + { + "epoch": 12.204349121239202, + "grad_norm": 0.0286865234375, + "learning_rate": 0.011870569724371078, + "loss": 0.8219, + "num_input_tokens_seen": 47574752, + "step": 81940 + }, + { + "epoch": 12.205093833780161, + "grad_norm": 0.044921875, + "learning_rate": 0.01186866300019686, + "loss": 0.7995, + "num_input_tokens_seen": 47577920, + "step": 81945 + }, + { + "epoch": 12.20583854632112, + "grad_norm": 0.06298828125, + "learning_rate": 0.011866756328923403, + "loss": 0.7963, + "num_input_tokens_seen": 47580992, + "step": 81950 + }, + { + "epoch": 12.206583258862079, + "grad_norm": 0.0244140625, + "learning_rate": 0.011864849710582912, + "loss": 0.7985, + "num_input_tokens_seen": 47583808, + "step": 81955 + }, + { + "epoch": 12.20732797140304, + "grad_norm": 0.038818359375, + "learning_rate": 0.011862943145207602, + "loss": 0.8075, + "num_input_tokens_seen": 47586784, + "step": 81960 + }, + { + "epoch": 12.208072683943998, + "grad_norm": 0.03076171875, + "learning_rate": 0.011861036632829678, + "loss": 0.7953, + "num_input_tokens_seen": 47589728, + "step": 81965 + }, + { + "epoch": 12.208817396484957, + "grad_norm": 0.03759765625, + "learning_rate": 0.011859130173481343, + "loss": 0.8033, + "num_input_tokens_seen": 47592672, + "step": 81970 + }, + { + "epoch": 12.209562109025915, + "grad_norm": 0.031982421875, + "learning_rate": 0.011857223767194818, + "loss": 0.7957, + "num_input_tokens_seen": 47595808, + "step": 81975 + }, + { + "epoch": 12.210306821566876, + "grad_norm": 0.035400390625, + "learning_rate": 0.011855317414002298, + "loss": 0.8024, + "num_input_tokens_seen": 47598816, + "step": 81980 + }, + { + "epoch": 12.211051534107835, + "grad_norm": 0.050048828125, + "learning_rate": 0.011853411113935999, + "loss": 0.8123, + "num_input_tokens_seen": 47601952, + "step": 81985 + }, + { + "epoch": 12.211796246648793, + "grad_norm": 0.045654296875, + "learning_rate": 0.011851504867028118, + "loss": 0.8436, + "num_input_tokens_seen": 47604736, + "step": 81990 + }, + { + "epoch": 12.212540959189752, + "grad_norm": 0.049560546875, + "learning_rate": 0.011849598673310867, + "loss": 0.7872, + "num_input_tokens_seen": 47607680, + "step": 81995 + }, + { + "epoch": 12.213285671730713, + "grad_norm": 0.040771484375, + "learning_rate": 0.011847692532816438, + "loss": 0.8035, + "num_input_tokens_seen": 47610560, + "step": 82000 + }, + { + "epoch": 12.214030384271672, + "grad_norm": 0.056640625, + "learning_rate": 0.011845786445577045, + "loss": 0.7923, + "num_input_tokens_seen": 47613376, + "step": 82005 + }, + { + "epoch": 12.21477509681263, + "grad_norm": 0.041748046875, + "learning_rate": 0.011843880411624885, + "loss": 0.7994, + "num_input_tokens_seen": 47616128, + "step": 82010 + }, + { + "epoch": 12.215519809353589, + "grad_norm": 0.041259765625, + "learning_rate": 0.011841974430992152, + "loss": 0.7819, + "num_input_tokens_seen": 47619136, + "step": 82015 + }, + { + "epoch": 12.216264521894548, + "grad_norm": 0.044677734375, + "learning_rate": 0.011840068503711056, + "loss": 0.7957, + "num_input_tokens_seen": 47622080, + "step": 82020 + }, + { + "epoch": 12.217009234435508, + "grad_norm": 0.041015625, + "learning_rate": 0.011838162629813789, + "loss": 0.7942, + "num_input_tokens_seen": 47624832, + "step": 82025 + }, + { + "epoch": 12.217753946976467, + "grad_norm": 0.03564453125, + "learning_rate": 0.011836256809332554, + "loss": 0.7995, + "num_input_tokens_seen": 47627712, + "step": 82030 + }, + { + "epoch": 12.218498659517426, + "grad_norm": 0.043212890625, + "learning_rate": 0.011834351042299537, + "loss": 0.7913, + "num_input_tokens_seen": 47630688, + "step": 82035 + }, + { + "epoch": 12.219243372058385, + "grad_norm": 0.039794921875, + "learning_rate": 0.01183244532874695, + "loss": 0.8099, + "num_input_tokens_seen": 47633824, + "step": 82040 + }, + { + "epoch": 12.219988084599345, + "grad_norm": 0.047119140625, + "learning_rate": 0.01183053966870698, + "loss": 0.7823, + "num_input_tokens_seen": 47636576, + "step": 82045 + }, + { + "epoch": 12.220732797140304, + "grad_norm": 0.04248046875, + "learning_rate": 0.011828634062211813, + "loss": 0.7864, + "num_input_tokens_seen": 47639232, + "step": 82050 + }, + { + "epoch": 12.221477509681263, + "grad_norm": 0.051513671875, + "learning_rate": 0.011826728509293656, + "loss": 0.7938, + "num_input_tokens_seen": 47642240, + "step": 82055 + }, + { + "epoch": 12.222222222222221, + "grad_norm": 0.032958984375, + "learning_rate": 0.011824823009984695, + "loss": 0.7783, + "num_input_tokens_seen": 47644960, + "step": 82060 + }, + { + "epoch": 12.222966934763182, + "grad_norm": 0.031005859375, + "learning_rate": 0.011822917564317123, + "loss": 0.7957, + "num_input_tokens_seen": 47647904, + "step": 82065 + }, + { + "epoch": 12.22371164730414, + "grad_norm": 0.02978515625, + "learning_rate": 0.011821012172323127, + "loss": 0.8157, + "num_input_tokens_seen": 47650752, + "step": 82070 + }, + { + "epoch": 12.2244563598451, + "grad_norm": 0.0419921875, + "learning_rate": 0.011819106834034903, + "loss": 0.7931, + "num_input_tokens_seen": 47653952, + "step": 82075 + }, + { + "epoch": 12.225201072386058, + "grad_norm": 0.032470703125, + "learning_rate": 0.011817201549484629, + "loss": 0.7809, + "num_input_tokens_seen": 47656832, + "step": 82080 + }, + { + "epoch": 12.225945784927019, + "grad_norm": 0.03173828125, + "learning_rate": 0.011815296318704507, + "loss": 0.8034, + "num_input_tokens_seen": 47659584, + "step": 82085 + }, + { + "epoch": 12.226690497467978, + "grad_norm": 0.042724609375, + "learning_rate": 0.011813391141726717, + "loss": 0.7853, + "num_input_tokens_seen": 47662400, + "step": 82090 + }, + { + "epoch": 12.227435210008936, + "grad_norm": 0.07275390625, + "learning_rate": 0.011811486018583442, + "loss": 0.8713, + "num_input_tokens_seen": 47665600, + "step": 82095 + }, + { + "epoch": 12.228179922549895, + "grad_norm": 0.051513671875, + "learning_rate": 0.011809580949306873, + "loss": 0.8088, + "num_input_tokens_seen": 47668768, + "step": 82100 + }, + { + "epoch": 12.228924635090856, + "grad_norm": 0.032470703125, + "learning_rate": 0.011807675933929184, + "loss": 0.8156, + "num_input_tokens_seen": 47671616, + "step": 82105 + }, + { + "epoch": 12.229669347631814, + "grad_norm": 0.0400390625, + "learning_rate": 0.011805770972482574, + "loss": 0.8139, + "num_input_tokens_seen": 47674432, + "step": 82110 + }, + { + "epoch": 12.230414060172773, + "grad_norm": 0.041748046875, + "learning_rate": 0.011803866064999209, + "loss": 0.784, + "num_input_tokens_seen": 47677152, + "step": 82115 + }, + { + "epoch": 12.231158772713732, + "grad_norm": 0.0242919921875, + "learning_rate": 0.011801961211511286, + "loss": 0.7921, + "num_input_tokens_seen": 47680160, + "step": 82120 + }, + { + "epoch": 12.231903485254692, + "grad_norm": 0.0289306640625, + "learning_rate": 0.011800056412050976, + "loss": 0.7976, + "num_input_tokens_seen": 47682848, + "step": 82125 + }, + { + "epoch": 12.232648197795651, + "grad_norm": 0.0303955078125, + "learning_rate": 0.01179815166665046, + "loss": 0.7912, + "num_input_tokens_seen": 47685728, + "step": 82130 + }, + { + "epoch": 12.23339291033661, + "grad_norm": 0.0267333984375, + "learning_rate": 0.011796246975341918, + "loss": 0.7967, + "num_input_tokens_seen": 47688384, + "step": 82135 + }, + { + "epoch": 12.234137622877569, + "grad_norm": 0.07763671875, + "learning_rate": 0.011794342338157524, + "loss": 0.8025, + "num_input_tokens_seen": 47691136, + "step": 82140 + }, + { + "epoch": 12.23488233541853, + "grad_norm": 0.052490234375, + "learning_rate": 0.011792437755129462, + "loss": 0.7908, + "num_input_tokens_seen": 47694144, + "step": 82145 + }, + { + "epoch": 12.235627047959488, + "grad_norm": 0.0380859375, + "learning_rate": 0.0117905332262899, + "loss": 0.791, + "num_input_tokens_seen": 47697408, + "step": 82150 + }, + { + "epoch": 12.236371760500447, + "grad_norm": 0.040771484375, + "learning_rate": 0.01178862875167102, + "loss": 0.8305, + "num_input_tokens_seen": 47700384, + "step": 82155 + }, + { + "epoch": 12.237116473041405, + "grad_norm": 0.050048828125, + "learning_rate": 0.011786724331304992, + "loss": 0.8005, + "num_input_tokens_seen": 47704032, + "step": 82160 + }, + { + "epoch": 12.237861185582366, + "grad_norm": 0.02099609375, + "learning_rate": 0.011784819965223993, + "loss": 0.811, + "num_input_tokens_seen": 47706880, + "step": 82165 + }, + { + "epoch": 12.238605898123325, + "grad_norm": 0.022216796875, + "learning_rate": 0.011782915653460195, + "loss": 0.8036, + "num_input_tokens_seen": 47709696, + "step": 82170 + }, + { + "epoch": 12.239350610664284, + "grad_norm": 0.0380859375, + "learning_rate": 0.01178101139604576, + "loss": 0.7969, + "num_input_tokens_seen": 47712448, + "step": 82175 + }, + { + "epoch": 12.240095323205242, + "grad_norm": 0.0284423828125, + "learning_rate": 0.011779107193012872, + "loss": 0.8001, + "num_input_tokens_seen": 47715328, + "step": 82180 + }, + { + "epoch": 12.240840035746203, + "grad_norm": 0.050048828125, + "learning_rate": 0.011777203044393692, + "loss": 0.796, + "num_input_tokens_seen": 47718272, + "step": 82185 + }, + { + "epoch": 12.241584748287162, + "grad_norm": 0.04736328125, + "learning_rate": 0.011775298950220392, + "loss": 0.7725, + "num_input_tokens_seen": 47721248, + "step": 82190 + }, + { + "epoch": 12.24232946082812, + "grad_norm": 0.049072265625, + "learning_rate": 0.011773394910525134, + "loss": 0.7716, + "num_input_tokens_seen": 47724000, + "step": 82195 + }, + { + "epoch": 12.243074173369079, + "grad_norm": 0.03759765625, + "learning_rate": 0.011771490925340095, + "loss": 0.7865, + "num_input_tokens_seen": 47726848, + "step": 82200 + }, + { + "epoch": 12.243818885910038, + "grad_norm": 0.045166015625, + "learning_rate": 0.011769586994697438, + "loss": 0.7848, + "num_input_tokens_seen": 47730048, + "step": 82205 + }, + { + "epoch": 12.244563598450998, + "grad_norm": 0.04833984375, + "learning_rate": 0.011767683118629317, + "loss": 0.8251, + "num_input_tokens_seen": 47732608, + "step": 82210 + }, + { + "epoch": 12.245308310991957, + "grad_norm": 0.02734375, + "learning_rate": 0.01176577929716791, + "loss": 0.794, + "num_input_tokens_seen": 47735840, + "step": 82215 + }, + { + "epoch": 12.246053023532916, + "grad_norm": 0.0303955078125, + "learning_rate": 0.011763875530345375, + "loss": 0.7926, + "num_input_tokens_seen": 47738816, + "step": 82220 + }, + { + "epoch": 12.246797736073875, + "grad_norm": 0.032958984375, + "learning_rate": 0.011761971818193875, + "loss": 0.7944, + "num_input_tokens_seen": 47741952, + "step": 82225 + }, + { + "epoch": 12.247542448614835, + "grad_norm": 0.05078125, + "learning_rate": 0.011760068160745563, + "loss": 0.7979, + "num_input_tokens_seen": 47744928, + "step": 82230 + }, + { + "epoch": 12.248287161155794, + "grad_norm": 0.04150390625, + "learning_rate": 0.011758164558032615, + "loss": 0.7865, + "num_input_tokens_seen": 47747616, + "step": 82235 + }, + { + "epoch": 12.249031873696753, + "grad_norm": 0.043701171875, + "learning_rate": 0.011756261010087173, + "loss": 0.8043, + "num_input_tokens_seen": 47750656, + "step": 82240 + }, + { + "epoch": 12.249776586237711, + "grad_norm": 0.044189453125, + "learning_rate": 0.011754357516941412, + "loss": 0.7956, + "num_input_tokens_seen": 47753504, + "step": 82245 + }, + { + "epoch": 12.250521298778672, + "grad_norm": 0.059326171875, + "learning_rate": 0.011752454078627482, + "loss": 0.7864, + "num_input_tokens_seen": 47756256, + "step": 82250 + }, + { + "epoch": 12.25126601131963, + "grad_norm": 0.037841796875, + "learning_rate": 0.011750550695177536, + "loss": 0.7934, + "num_input_tokens_seen": 47759200, + "step": 82255 + }, + { + "epoch": 12.25201072386059, + "grad_norm": 0.02294921875, + "learning_rate": 0.011748647366623738, + "loss": 0.7836, + "num_input_tokens_seen": 47762240, + "step": 82260 + }, + { + "epoch": 12.252755436401548, + "grad_norm": 0.0294189453125, + "learning_rate": 0.01174674409299823, + "loss": 0.806, + "num_input_tokens_seen": 47765408, + "step": 82265 + }, + { + "epoch": 12.253500148942509, + "grad_norm": 0.035400390625, + "learning_rate": 0.011744840874333182, + "loss": 0.8063, + "num_input_tokens_seen": 47768224, + "step": 82270 + }, + { + "epoch": 12.254244861483468, + "grad_norm": 0.0203857421875, + "learning_rate": 0.011742937710660732, + "loss": 0.8048, + "num_input_tokens_seen": 47771296, + "step": 82275 + }, + { + "epoch": 12.254989574024426, + "grad_norm": 0.03076171875, + "learning_rate": 0.011741034602013044, + "loss": 0.7881, + "num_input_tokens_seen": 47774432, + "step": 82280 + }, + { + "epoch": 12.255734286565385, + "grad_norm": 0.028076171875, + "learning_rate": 0.011739131548422262, + "loss": 0.8076, + "num_input_tokens_seen": 47777216, + "step": 82285 + }, + { + "epoch": 12.256478999106346, + "grad_norm": 0.0252685546875, + "learning_rate": 0.011737228549920544, + "loss": 0.7774, + "num_input_tokens_seen": 47779872, + "step": 82290 + }, + { + "epoch": 12.257223711647304, + "grad_norm": 0.0322265625, + "learning_rate": 0.01173532560654003, + "loss": 0.8041, + "num_input_tokens_seen": 47782720, + "step": 82295 + }, + { + "epoch": 12.257968424188263, + "grad_norm": 0.029541015625, + "learning_rate": 0.011733422718312866, + "loss": 0.7998, + "num_input_tokens_seen": 47785280, + "step": 82300 + }, + { + "epoch": 12.258713136729222, + "grad_norm": 0.044921875, + "learning_rate": 0.011731519885271213, + "loss": 0.7754, + "num_input_tokens_seen": 47788096, + "step": 82305 + }, + { + "epoch": 12.259457849270182, + "grad_norm": 0.043212890625, + "learning_rate": 0.011729617107447203, + "loss": 0.8103, + "num_input_tokens_seen": 47791232, + "step": 82310 + }, + { + "epoch": 12.260202561811141, + "grad_norm": 0.026123046875, + "learning_rate": 0.011727714384872995, + "loss": 0.7888, + "num_input_tokens_seen": 47794336, + "step": 82315 + }, + { + "epoch": 12.2609472743521, + "grad_norm": 0.02490234375, + "learning_rate": 0.011725811717580721, + "loss": 0.7945, + "num_input_tokens_seen": 47797280, + "step": 82320 + }, + { + "epoch": 12.261691986893059, + "grad_norm": 0.0654296875, + "learning_rate": 0.011723909105602535, + "loss": 0.8307, + "num_input_tokens_seen": 47800224, + "step": 82325 + }, + { + "epoch": 12.26243669943402, + "grad_norm": 0.0281982421875, + "learning_rate": 0.011722006548970577, + "loss": 0.8292, + "num_input_tokens_seen": 47802880, + "step": 82330 + }, + { + "epoch": 12.263181411974978, + "grad_norm": 0.0264892578125, + "learning_rate": 0.011720104047716976, + "loss": 0.7956, + "num_input_tokens_seen": 47805664, + "step": 82335 + }, + { + "epoch": 12.263926124515937, + "grad_norm": 0.0712890625, + "learning_rate": 0.011718201601873892, + "loss": 0.8464, + "num_input_tokens_seen": 47808384, + "step": 82340 + }, + { + "epoch": 12.264670837056896, + "grad_norm": 0.0274658203125, + "learning_rate": 0.011716299211473453, + "loss": 0.7893, + "num_input_tokens_seen": 47811296, + "step": 82345 + }, + { + "epoch": 12.265415549597856, + "grad_norm": 0.037109375, + "learning_rate": 0.011714396876547805, + "loss": 0.8009, + "num_input_tokens_seen": 47814048, + "step": 82350 + }, + { + "epoch": 12.266160262138815, + "grad_norm": 0.034912109375, + "learning_rate": 0.011712494597129077, + "loss": 0.7931, + "num_input_tokens_seen": 47816800, + "step": 82355 + }, + { + "epoch": 12.266904974679774, + "grad_norm": 0.021484375, + "learning_rate": 0.011710592373249418, + "loss": 0.8091, + "num_input_tokens_seen": 47819456, + "step": 82360 + }, + { + "epoch": 12.267649687220732, + "grad_norm": 0.03369140625, + "learning_rate": 0.011708690204940952, + "loss": 0.7942, + "num_input_tokens_seen": 47822176, + "step": 82365 + }, + { + "epoch": 12.268394399761693, + "grad_norm": 0.0220947265625, + "learning_rate": 0.011706788092235822, + "loss": 0.8009, + "num_input_tokens_seen": 47825088, + "step": 82370 + }, + { + "epoch": 12.269139112302652, + "grad_norm": 0.037841796875, + "learning_rate": 0.011704886035166165, + "loss": 0.8019, + "num_input_tokens_seen": 47827840, + "step": 82375 + }, + { + "epoch": 12.26988382484361, + "grad_norm": 0.035888671875, + "learning_rate": 0.011702984033764099, + "loss": 0.7976, + "num_input_tokens_seen": 47830912, + "step": 82380 + }, + { + "epoch": 12.27062853738457, + "grad_norm": 0.03564453125, + "learning_rate": 0.011701082088061776, + "loss": 0.8104, + "num_input_tokens_seen": 47833984, + "step": 82385 + }, + { + "epoch": 12.271373249925528, + "grad_norm": 0.0274658203125, + "learning_rate": 0.011699180198091316, + "loss": 0.8017, + "num_input_tokens_seen": 47836736, + "step": 82390 + }, + { + "epoch": 12.272117962466488, + "grad_norm": 0.032470703125, + "learning_rate": 0.011697278363884851, + "loss": 0.8075, + "num_input_tokens_seen": 47839552, + "step": 82395 + }, + { + "epoch": 12.272862675007447, + "grad_norm": 0.0673828125, + "learning_rate": 0.011695376585474508, + "loss": 0.8078, + "num_input_tokens_seen": 47842304, + "step": 82400 + }, + { + "epoch": 12.273607387548406, + "grad_norm": 0.028076171875, + "learning_rate": 0.011693474862892423, + "loss": 0.7991, + "num_input_tokens_seen": 47845184, + "step": 82405 + }, + { + "epoch": 12.274352100089365, + "grad_norm": 0.0274658203125, + "learning_rate": 0.011691573196170722, + "loss": 0.8117, + "num_input_tokens_seen": 47848160, + "step": 82410 + }, + { + "epoch": 12.275096812630325, + "grad_norm": 0.03515625, + "learning_rate": 0.01168967158534152, + "loss": 0.8032, + "num_input_tokens_seen": 47851360, + "step": 82415 + }, + { + "epoch": 12.275841525171284, + "grad_norm": 0.0322265625, + "learning_rate": 0.011687770030436962, + "loss": 0.7859, + "num_input_tokens_seen": 47854176, + "step": 82420 + }, + { + "epoch": 12.276586237712243, + "grad_norm": 0.0341796875, + "learning_rate": 0.011685868531489157, + "loss": 0.8015, + "num_input_tokens_seen": 47857088, + "step": 82425 + }, + { + "epoch": 12.277330950253202, + "grad_norm": 0.025634765625, + "learning_rate": 0.011683967088530239, + "loss": 0.7982, + "num_input_tokens_seen": 47859712, + "step": 82430 + }, + { + "epoch": 12.278075662794162, + "grad_norm": 0.052734375, + "learning_rate": 0.011682065701592323, + "loss": 0.813, + "num_input_tokens_seen": 47862880, + "step": 82435 + }, + { + "epoch": 12.27882037533512, + "grad_norm": 0.0252685546875, + "learning_rate": 0.011680164370707537, + "loss": 0.8114, + "num_input_tokens_seen": 47866080, + "step": 82440 + }, + { + "epoch": 12.27956508787608, + "grad_norm": 0.04345703125, + "learning_rate": 0.011678263095907995, + "loss": 0.7939, + "num_input_tokens_seen": 47868832, + "step": 82445 + }, + { + "epoch": 12.280309800417038, + "grad_norm": 0.0267333984375, + "learning_rate": 0.011676361877225828, + "loss": 0.786, + "num_input_tokens_seen": 47871648, + "step": 82450 + }, + { + "epoch": 12.281054512957999, + "grad_norm": 0.14453125, + "learning_rate": 0.011674460714693148, + "loss": 0.7578, + "num_input_tokens_seen": 47874592, + "step": 82455 + }, + { + "epoch": 12.281799225498958, + "grad_norm": 0.037841796875, + "learning_rate": 0.011672559608342074, + "loss": 0.7962, + "num_input_tokens_seen": 47877952, + "step": 82460 + }, + { + "epoch": 12.282543938039916, + "grad_norm": 0.03662109375, + "learning_rate": 0.011670658558204726, + "loss": 0.8281, + "num_input_tokens_seen": 47881152, + "step": 82465 + }, + { + "epoch": 12.283288650580875, + "grad_norm": 0.0390625, + "learning_rate": 0.01166875756431321, + "loss": 0.7907, + "num_input_tokens_seen": 47883904, + "step": 82470 + }, + { + "epoch": 12.284033363121836, + "grad_norm": 0.0263671875, + "learning_rate": 0.011666856626699657, + "loss": 0.7978, + "num_input_tokens_seen": 47886880, + "step": 82475 + }, + { + "epoch": 12.284778075662794, + "grad_norm": 0.0291748046875, + "learning_rate": 0.01166495574539617, + "loss": 0.7983, + "num_input_tokens_seen": 47890048, + "step": 82480 + }, + { + "epoch": 12.285522788203753, + "grad_norm": 0.04833984375, + "learning_rate": 0.011663054920434866, + "loss": 0.788, + "num_input_tokens_seen": 47892960, + "step": 82485 + }, + { + "epoch": 12.286267500744712, + "grad_norm": 0.0380859375, + "learning_rate": 0.01166115415184786, + "loss": 0.7914, + "num_input_tokens_seen": 47895872, + "step": 82490 + }, + { + "epoch": 12.287012213285673, + "grad_norm": 0.0291748046875, + "learning_rate": 0.011659253439667259, + "loss": 0.7843, + "num_input_tokens_seen": 47898912, + "step": 82495 + }, + { + "epoch": 12.287756925826631, + "grad_norm": 0.03271484375, + "learning_rate": 0.011657352783925178, + "loss": 0.8001, + "num_input_tokens_seen": 47901760, + "step": 82500 + }, + { + "epoch": 12.28850163836759, + "grad_norm": 0.043212890625, + "learning_rate": 0.01165545218465372, + "loss": 0.7806, + "num_input_tokens_seen": 47904640, + "step": 82505 + }, + { + "epoch": 12.289246350908549, + "grad_norm": 0.032470703125, + "learning_rate": 0.011653551641885002, + "loss": 0.8148, + "num_input_tokens_seen": 47907680, + "step": 82510 + }, + { + "epoch": 12.28999106344951, + "grad_norm": 0.028076171875, + "learning_rate": 0.011651651155651126, + "loss": 0.7982, + "num_input_tokens_seen": 47910848, + "step": 82515 + }, + { + "epoch": 12.290735775990468, + "grad_norm": 0.03564453125, + "learning_rate": 0.011649750725984204, + "loss": 0.8017, + "num_input_tokens_seen": 47913792, + "step": 82520 + }, + { + "epoch": 12.291480488531427, + "grad_norm": 0.03564453125, + "learning_rate": 0.011647850352916331, + "loss": 0.8209, + "num_input_tokens_seen": 47916672, + "step": 82525 + }, + { + "epoch": 12.292225201072386, + "grad_norm": 0.02978515625, + "learning_rate": 0.011645950036479627, + "loss": 0.7972, + "num_input_tokens_seen": 47919712, + "step": 82530 + }, + { + "epoch": 12.292969913613344, + "grad_norm": 0.04296875, + "learning_rate": 0.011644049776706186, + "loss": 0.7991, + "num_input_tokens_seen": 47922592, + "step": 82535 + }, + { + "epoch": 12.293714626154305, + "grad_norm": 0.0185546875, + "learning_rate": 0.01164214957362811, + "loss": 0.7788, + "num_input_tokens_seen": 47925376, + "step": 82540 + }, + { + "epoch": 12.294459338695264, + "grad_norm": 0.0269775390625, + "learning_rate": 0.011640249427277509, + "loss": 0.7841, + "num_input_tokens_seen": 47928256, + "step": 82545 + }, + { + "epoch": 12.295204051236222, + "grad_norm": 0.037841796875, + "learning_rate": 0.011638349337686475, + "loss": 0.7821, + "num_input_tokens_seen": 47931072, + "step": 82550 + }, + { + "epoch": 12.295948763777181, + "grad_norm": 0.02685546875, + "learning_rate": 0.011636449304887116, + "loss": 0.7925, + "num_input_tokens_seen": 47933760, + "step": 82555 + }, + { + "epoch": 12.296693476318142, + "grad_norm": 0.030517578125, + "learning_rate": 0.01163454932891152, + "loss": 0.8245, + "num_input_tokens_seen": 47936832, + "step": 82560 + }, + { + "epoch": 12.2974381888591, + "grad_norm": 0.0260009765625, + "learning_rate": 0.011632649409791803, + "loss": 0.7965, + "num_input_tokens_seen": 47939872, + "step": 82565 + }, + { + "epoch": 12.29818290140006, + "grad_norm": 0.03271484375, + "learning_rate": 0.011630749547560049, + "loss": 0.7872, + "num_input_tokens_seen": 47943072, + "step": 82570 + }, + { + "epoch": 12.298927613941018, + "grad_norm": 0.0281982421875, + "learning_rate": 0.01162884974224835, + "loss": 0.7952, + "num_input_tokens_seen": 47945888, + "step": 82575 + }, + { + "epoch": 12.299672326481979, + "grad_norm": 0.0498046875, + "learning_rate": 0.011626949993888816, + "loss": 0.7932, + "num_input_tokens_seen": 47948864, + "step": 82580 + }, + { + "epoch": 12.300417039022937, + "grad_norm": 0.05078125, + "learning_rate": 0.01162505030251353, + "loss": 0.7965, + "num_input_tokens_seen": 47951648, + "step": 82585 + }, + { + "epoch": 12.301161751563896, + "grad_norm": 0.0233154296875, + "learning_rate": 0.011623150668154593, + "loss": 0.8059, + "num_input_tokens_seen": 47954560, + "step": 82590 + }, + { + "epoch": 12.301906464104855, + "grad_norm": 0.05322265625, + "learning_rate": 0.011621251090844085, + "loss": 0.8033, + "num_input_tokens_seen": 47957536, + "step": 82595 + }, + { + "epoch": 12.302651176645815, + "grad_norm": 0.044677734375, + "learning_rate": 0.011619351570614113, + "loss": 0.7944, + "num_input_tokens_seen": 47960256, + "step": 82600 + }, + { + "epoch": 12.303395889186774, + "grad_norm": 0.02880859375, + "learning_rate": 0.011617452107496757, + "loss": 0.7919, + "num_input_tokens_seen": 47963008, + "step": 82605 + }, + { + "epoch": 12.304140601727733, + "grad_norm": 0.04443359375, + "learning_rate": 0.011615552701524113, + "loss": 0.8049, + "num_input_tokens_seen": 47965824, + "step": 82610 + }, + { + "epoch": 12.304885314268692, + "grad_norm": 0.03662109375, + "learning_rate": 0.011613653352728265, + "loss": 0.7945, + "num_input_tokens_seen": 47968576, + "step": 82615 + }, + { + "epoch": 12.305630026809652, + "grad_norm": 0.04736328125, + "learning_rate": 0.0116117540611413, + "loss": 0.7816, + "num_input_tokens_seen": 47971712, + "step": 82620 + }, + { + "epoch": 12.30637473935061, + "grad_norm": 0.041015625, + "learning_rate": 0.011609854826795312, + "loss": 0.7992, + "num_input_tokens_seen": 47974464, + "step": 82625 + }, + { + "epoch": 12.30711945189157, + "grad_norm": 0.03955078125, + "learning_rate": 0.011607955649722373, + "loss": 0.8108, + "num_input_tokens_seen": 47977408, + "step": 82630 + }, + { + "epoch": 12.307864164432528, + "grad_norm": 0.049072265625, + "learning_rate": 0.01160605652995458, + "loss": 0.7889, + "num_input_tokens_seen": 47980320, + "step": 82635 + }, + { + "epoch": 12.308608876973489, + "grad_norm": 0.031982421875, + "learning_rate": 0.011604157467524011, + "loss": 0.837, + "num_input_tokens_seen": 47983328, + "step": 82640 + }, + { + "epoch": 12.309353589514448, + "grad_norm": 0.03076171875, + "learning_rate": 0.011602258462462753, + "loss": 0.8161, + "num_input_tokens_seen": 47986240, + "step": 82645 + }, + { + "epoch": 12.310098302055406, + "grad_norm": 0.041015625, + "learning_rate": 0.011600359514802885, + "loss": 0.7841, + "num_input_tokens_seen": 47989312, + "step": 82650 + }, + { + "epoch": 12.310843014596365, + "grad_norm": 0.028564453125, + "learning_rate": 0.011598460624576486, + "loss": 0.7785, + "num_input_tokens_seen": 47992224, + "step": 82655 + }, + { + "epoch": 12.311587727137326, + "grad_norm": 0.031005859375, + "learning_rate": 0.011596561791815639, + "loss": 0.7759, + "num_input_tokens_seen": 47995136, + "step": 82660 + }, + { + "epoch": 12.312332439678285, + "grad_norm": 0.033203125, + "learning_rate": 0.011594663016552417, + "loss": 0.7751, + "num_input_tokens_seen": 47998016, + "step": 82665 + }, + { + "epoch": 12.313077152219243, + "grad_norm": 0.034423828125, + "learning_rate": 0.011592764298818907, + "loss": 0.8124, + "num_input_tokens_seen": 48000864, + "step": 82670 + }, + { + "epoch": 12.313821864760202, + "grad_norm": 0.039306640625, + "learning_rate": 0.011590865638647177, + "loss": 0.7742, + "num_input_tokens_seen": 48004000, + "step": 82675 + }, + { + "epoch": 12.314566577301163, + "grad_norm": 0.041015625, + "learning_rate": 0.011588967036069311, + "loss": 0.7922, + "num_input_tokens_seen": 48006848, + "step": 82680 + }, + { + "epoch": 12.315311289842121, + "grad_norm": 0.0233154296875, + "learning_rate": 0.011587068491117378, + "loss": 0.7998, + "num_input_tokens_seen": 48009728, + "step": 82685 + }, + { + "epoch": 12.31605600238308, + "grad_norm": 0.03955078125, + "learning_rate": 0.011585170003823458, + "loss": 0.7761, + "num_input_tokens_seen": 48012480, + "step": 82690 + }, + { + "epoch": 12.316800714924039, + "grad_norm": 0.0286865234375, + "learning_rate": 0.011583271574219619, + "loss": 0.7896, + "num_input_tokens_seen": 48015296, + "step": 82695 + }, + { + "epoch": 12.317545427465, + "grad_norm": 0.020751953125, + "learning_rate": 0.011581373202337928, + "loss": 0.8, + "num_input_tokens_seen": 48018048, + "step": 82700 + }, + { + "epoch": 12.318290140005958, + "grad_norm": 0.04296875, + "learning_rate": 0.011579474888210468, + "loss": 0.7867, + "num_input_tokens_seen": 48020928, + "step": 82705 + }, + { + "epoch": 12.319034852546917, + "grad_norm": 0.0225830078125, + "learning_rate": 0.011577576631869299, + "loss": 0.7945, + "num_input_tokens_seen": 48023616, + "step": 82710 + }, + { + "epoch": 12.319779565087876, + "grad_norm": 0.03369140625, + "learning_rate": 0.011575678433346498, + "loss": 0.8221, + "num_input_tokens_seen": 48026656, + "step": 82715 + }, + { + "epoch": 12.320524277628834, + "grad_norm": 0.0322265625, + "learning_rate": 0.01157378029267413, + "loss": 0.7814, + "num_input_tokens_seen": 48029440, + "step": 82720 + }, + { + "epoch": 12.321268990169795, + "grad_norm": 0.035888671875, + "learning_rate": 0.011571882209884263, + "loss": 0.7877, + "num_input_tokens_seen": 48032512, + "step": 82725 + }, + { + "epoch": 12.322013702710754, + "grad_norm": 0.042724609375, + "learning_rate": 0.011569984185008961, + "loss": 0.7735, + "num_input_tokens_seen": 48035392, + "step": 82730 + }, + { + "epoch": 12.322758415251712, + "grad_norm": 0.0262451171875, + "learning_rate": 0.011568086218080283, + "loss": 0.8028, + "num_input_tokens_seen": 48038336, + "step": 82735 + }, + { + "epoch": 12.323503127792671, + "grad_norm": 0.03369140625, + "learning_rate": 0.011566188309130308, + "loss": 0.7925, + "num_input_tokens_seen": 48041600, + "step": 82740 + }, + { + "epoch": 12.324247840333632, + "grad_norm": 0.035400390625, + "learning_rate": 0.011564290458191085, + "loss": 0.8123, + "num_input_tokens_seen": 48044448, + "step": 82745 + }, + { + "epoch": 12.32499255287459, + "grad_norm": 0.0277099609375, + "learning_rate": 0.011562392665294692, + "loss": 0.805, + "num_input_tokens_seen": 48047168, + "step": 82750 + }, + { + "epoch": 12.32573726541555, + "grad_norm": 0.06494140625, + "learning_rate": 0.011560494930473173, + "loss": 0.8033, + "num_input_tokens_seen": 48050016, + "step": 82755 + }, + { + "epoch": 12.326481977956508, + "grad_norm": 0.042236328125, + "learning_rate": 0.011558597253758601, + "loss": 0.7877, + "num_input_tokens_seen": 48052992, + "step": 82760 + }, + { + "epoch": 12.327226690497469, + "grad_norm": 0.0208740234375, + "learning_rate": 0.011556699635183028, + "loss": 0.8173, + "num_input_tokens_seen": 48055776, + "step": 82765 + }, + { + "epoch": 12.327971403038427, + "grad_norm": 0.0234375, + "learning_rate": 0.011554802074778516, + "loss": 0.7917, + "num_input_tokens_seen": 48058720, + "step": 82770 + }, + { + "epoch": 12.328716115579386, + "grad_norm": 0.055908203125, + "learning_rate": 0.011552904572577126, + "loss": 0.8143, + "num_input_tokens_seen": 48061600, + "step": 82775 + }, + { + "epoch": 12.329460828120345, + "grad_norm": 0.03466796875, + "learning_rate": 0.011551007128610902, + "loss": 0.7915, + "num_input_tokens_seen": 48064352, + "step": 82780 + }, + { + "epoch": 12.330205540661305, + "grad_norm": 0.0186767578125, + "learning_rate": 0.011549109742911911, + "loss": 0.8085, + "num_input_tokens_seen": 48067040, + "step": 82785 + }, + { + "epoch": 12.330950253202264, + "grad_norm": 0.041748046875, + "learning_rate": 0.011547212415512204, + "loss": 0.7854, + "num_input_tokens_seen": 48070208, + "step": 82790 + }, + { + "epoch": 12.331694965743223, + "grad_norm": 0.0439453125, + "learning_rate": 0.011545315146443835, + "loss": 0.8039, + "num_input_tokens_seen": 48073280, + "step": 82795 + }, + { + "epoch": 12.332439678284182, + "grad_norm": 0.02685546875, + "learning_rate": 0.01154341793573885, + "loss": 0.7888, + "num_input_tokens_seen": 48076096, + "step": 82800 + }, + { + "epoch": 12.333184390825142, + "grad_norm": 0.039306640625, + "learning_rate": 0.011541520783429312, + "loss": 0.8384, + "num_input_tokens_seen": 48078944, + "step": 82805 + }, + { + "epoch": 12.333929103366101, + "grad_norm": 0.0286865234375, + "learning_rate": 0.011539623689547264, + "loss": 0.7921, + "num_input_tokens_seen": 48081952, + "step": 82810 + }, + { + "epoch": 12.33467381590706, + "grad_norm": 0.03173828125, + "learning_rate": 0.01153772665412475, + "loss": 0.8015, + "num_input_tokens_seen": 48084768, + "step": 82815 + }, + { + "epoch": 12.335418528448018, + "grad_norm": 0.053466796875, + "learning_rate": 0.011535829677193837, + "loss": 0.805, + "num_input_tokens_seen": 48087584, + "step": 82820 + }, + { + "epoch": 12.336163240988979, + "grad_norm": 0.045166015625, + "learning_rate": 0.011533932758786551, + "loss": 0.7665, + "num_input_tokens_seen": 48090656, + "step": 82825 + }, + { + "epoch": 12.336907953529938, + "grad_norm": 0.033447265625, + "learning_rate": 0.011532035898934955, + "loss": 0.821, + "num_input_tokens_seen": 48093696, + "step": 82830 + }, + { + "epoch": 12.337652666070897, + "grad_norm": 0.03125, + "learning_rate": 0.01153013909767108, + "loss": 0.7883, + "num_input_tokens_seen": 48096448, + "step": 82835 + }, + { + "epoch": 12.338397378611855, + "grad_norm": 0.0260009765625, + "learning_rate": 0.011528242355026986, + "loss": 0.7918, + "num_input_tokens_seen": 48099712, + "step": 82840 + }, + { + "epoch": 12.339142091152816, + "grad_norm": 0.049072265625, + "learning_rate": 0.011526345671034706, + "loss": 0.7926, + "num_input_tokens_seen": 48102592, + "step": 82845 + }, + { + "epoch": 12.339886803693775, + "grad_norm": 0.0322265625, + "learning_rate": 0.011524449045726286, + "loss": 0.8276, + "num_input_tokens_seen": 48105632, + "step": 82850 + }, + { + "epoch": 12.340631516234733, + "grad_norm": 0.036376953125, + "learning_rate": 0.011522552479133766, + "loss": 0.806, + "num_input_tokens_seen": 48108768, + "step": 82855 + }, + { + "epoch": 12.341376228775692, + "grad_norm": 0.027099609375, + "learning_rate": 0.011520655971289187, + "loss": 0.806, + "num_input_tokens_seen": 48111872, + "step": 82860 + }, + { + "epoch": 12.342120941316653, + "grad_norm": 0.048583984375, + "learning_rate": 0.011518759522224593, + "loss": 0.7716, + "num_input_tokens_seen": 48114976, + "step": 82865 + }, + { + "epoch": 12.342865653857611, + "grad_norm": 0.030029296875, + "learning_rate": 0.011516863131972012, + "loss": 0.7949, + "num_input_tokens_seen": 48117728, + "step": 82870 + }, + { + "epoch": 12.34361036639857, + "grad_norm": 0.033935546875, + "learning_rate": 0.011514966800563494, + "loss": 0.7849, + "num_input_tokens_seen": 48120576, + "step": 82875 + }, + { + "epoch": 12.344355078939529, + "grad_norm": 0.039306640625, + "learning_rate": 0.011513070528031065, + "loss": 0.7987, + "num_input_tokens_seen": 48123456, + "step": 82880 + }, + { + "epoch": 12.34509979148049, + "grad_norm": 0.03125, + "learning_rate": 0.011511174314406774, + "loss": 0.7973, + "num_input_tokens_seen": 48126400, + "step": 82885 + }, + { + "epoch": 12.345844504021448, + "grad_norm": 0.04638671875, + "learning_rate": 0.011509278159722636, + "loss": 0.7791, + "num_input_tokens_seen": 48129280, + "step": 82890 + }, + { + "epoch": 12.346589216562407, + "grad_norm": 0.038330078125, + "learning_rate": 0.011507382064010703, + "loss": 0.8131, + "num_input_tokens_seen": 48131968, + "step": 82895 + }, + { + "epoch": 12.347333929103366, + "grad_norm": 0.0419921875, + "learning_rate": 0.011505486027303002, + "loss": 0.7921, + "num_input_tokens_seen": 48134784, + "step": 82900 + }, + { + "epoch": 12.348078641644324, + "grad_norm": 0.03955078125, + "learning_rate": 0.011503590049631555, + "loss": 0.792, + "num_input_tokens_seen": 48137600, + "step": 82905 + }, + { + "epoch": 12.348823354185285, + "grad_norm": 0.04150390625, + "learning_rate": 0.011501694131028407, + "loss": 0.7716, + "num_input_tokens_seen": 48140288, + "step": 82910 + }, + { + "epoch": 12.349568066726244, + "grad_norm": 0.02587890625, + "learning_rate": 0.011499798271525577, + "loss": 0.786, + "num_input_tokens_seen": 48143424, + "step": 82915 + }, + { + "epoch": 12.350312779267203, + "grad_norm": 0.044189453125, + "learning_rate": 0.011497902471155102, + "loss": 0.7747, + "num_input_tokens_seen": 48146656, + "step": 82920 + }, + { + "epoch": 12.351057491808161, + "grad_norm": 0.044677734375, + "learning_rate": 0.011496006729949, + "loss": 0.796, + "num_input_tokens_seen": 48149472, + "step": 82925 + }, + { + "epoch": 12.351802204349122, + "grad_norm": 0.05126953125, + "learning_rate": 0.011494111047939308, + "loss": 0.8052, + "num_input_tokens_seen": 48152576, + "step": 82930 + }, + { + "epoch": 12.35254691689008, + "grad_norm": 0.0235595703125, + "learning_rate": 0.011492215425158046, + "loss": 0.7904, + "num_input_tokens_seen": 48155648, + "step": 82935 + }, + { + "epoch": 12.35329162943104, + "grad_norm": 0.027587890625, + "learning_rate": 0.011490319861637234, + "loss": 0.7622, + "num_input_tokens_seen": 48158752, + "step": 82940 + }, + { + "epoch": 12.354036341971998, + "grad_norm": 0.03369140625, + "learning_rate": 0.011488424357408909, + "loss": 0.7824, + "num_input_tokens_seen": 48161568, + "step": 82945 + }, + { + "epoch": 12.354781054512959, + "grad_norm": 0.0439453125, + "learning_rate": 0.01148652891250508, + "loss": 0.8091, + "num_input_tokens_seen": 48164352, + "step": 82950 + }, + { + "epoch": 12.355525767053917, + "grad_norm": 0.0281982421875, + "learning_rate": 0.011484633526957777, + "loss": 0.7952, + "num_input_tokens_seen": 48167328, + "step": 82955 + }, + { + "epoch": 12.356270479594876, + "grad_norm": 0.0269775390625, + "learning_rate": 0.011482738200799012, + "loss": 0.8001, + "num_input_tokens_seen": 48170432, + "step": 82960 + }, + { + "epoch": 12.357015192135835, + "grad_norm": 0.03857421875, + "learning_rate": 0.011480842934060816, + "loss": 0.7847, + "num_input_tokens_seen": 48173408, + "step": 82965 + }, + { + "epoch": 12.357759904676795, + "grad_norm": 0.042724609375, + "learning_rate": 0.011478947726775197, + "loss": 0.7758, + "num_input_tokens_seen": 48176512, + "step": 82970 + }, + { + "epoch": 12.358504617217754, + "grad_norm": 0.0277099609375, + "learning_rate": 0.011477052578974183, + "loss": 0.7995, + "num_input_tokens_seen": 48179552, + "step": 82975 + }, + { + "epoch": 12.359249329758713, + "grad_norm": 0.03173828125, + "learning_rate": 0.011475157490689785, + "loss": 0.8161, + "num_input_tokens_seen": 48182368, + "step": 82980 + }, + { + "epoch": 12.359994042299672, + "grad_norm": 0.03173828125, + "learning_rate": 0.011473262461954015, + "loss": 0.7948, + "num_input_tokens_seen": 48185504, + "step": 82985 + }, + { + "epoch": 12.360738754840632, + "grad_norm": 0.03125, + "learning_rate": 0.011471367492798897, + "loss": 0.7999, + "num_input_tokens_seen": 48188256, + "step": 82990 + }, + { + "epoch": 12.361483467381591, + "grad_norm": 0.0390625, + "learning_rate": 0.01146947258325643, + "loss": 0.7899, + "num_input_tokens_seen": 48190912, + "step": 82995 + }, + { + "epoch": 12.36222817992255, + "grad_norm": 0.0390625, + "learning_rate": 0.011467577733358643, + "loss": 0.7903, + "num_input_tokens_seen": 48193696, + "step": 83000 + }, + { + "epoch": 12.362972892463509, + "grad_norm": 0.0322265625, + "learning_rate": 0.011465682943137533, + "loss": 0.8121, + "num_input_tokens_seen": 48196544, + "step": 83005 + }, + { + "epoch": 12.363717605004469, + "grad_norm": 0.0458984375, + "learning_rate": 0.011463788212625125, + "loss": 0.8076, + "num_input_tokens_seen": 48199264, + "step": 83010 + }, + { + "epoch": 12.364462317545428, + "grad_norm": 0.03515625, + "learning_rate": 0.01146189354185342, + "loss": 0.8225, + "num_input_tokens_seen": 48202016, + "step": 83015 + }, + { + "epoch": 12.365207030086387, + "grad_norm": 0.03173828125, + "learning_rate": 0.011459998930854424, + "loss": 0.7927, + "num_input_tokens_seen": 48205024, + "step": 83020 + }, + { + "epoch": 12.365951742627345, + "grad_norm": 0.041748046875, + "learning_rate": 0.011458104379660152, + "loss": 0.7832, + "num_input_tokens_seen": 48208192, + "step": 83025 + }, + { + "epoch": 12.366696455168306, + "grad_norm": 0.039794921875, + "learning_rate": 0.0114562098883026, + "loss": 0.7816, + "num_input_tokens_seen": 48210720, + "step": 83030 + }, + { + "epoch": 12.367441167709265, + "grad_norm": 0.03759765625, + "learning_rate": 0.011454315456813788, + "loss": 0.8092, + "num_input_tokens_seen": 48213504, + "step": 83035 + }, + { + "epoch": 12.368185880250223, + "grad_norm": 0.032470703125, + "learning_rate": 0.011452421085225706, + "loss": 0.8025, + "num_input_tokens_seen": 48216320, + "step": 83040 + }, + { + "epoch": 12.368930592791182, + "grad_norm": 0.042724609375, + "learning_rate": 0.011450526773570368, + "loss": 0.7877, + "num_input_tokens_seen": 48219360, + "step": 83045 + }, + { + "epoch": 12.36967530533214, + "grad_norm": 0.043212890625, + "learning_rate": 0.01144863252187977, + "loss": 0.8184, + "num_input_tokens_seen": 48222240, + "step": 83050 + }, + { + "epoch": 12.370420017873101, + "grad_norm": 0.03759765625, + "learning_rate": 0.011446738330185922, + "loss": 0.7925, + "num_input_tokens_seen": 48225216, + "step": 83055 + }, + { + "epoch": 12.37116473041406, + "grad_norm": 0.029052734375, + "learning_rate": 0.011444844198520817, + "loss": 0.8314, + "num_input_tokens_seen": 48227840, + "step": 83060 + }, + { + "epoch": 12.371909442955019, + "grad_norm": 0.0281982421875, + "learning_rate": 0.011442950126916448, + "loss": 0.7939, + "num_input_tokens_seen": 48230656, + "step": 83065 + }, + { + "epoch": 12.37265415549598, + "grad_norm": 0.037841796875, + "learning_rate": 0.011441056115404831, + "loss": 0.7792, + "num_input_tokens_seen": 48233408, + "step": 83070 + }, + { + "epoch": 12.373398868036938, + "grad_norm": 0.020751953125, + "learning_rate": 0.011439162164017946, + "loss": 0.7897, + "num_input_tokens_seen": 48236416, + "step": 83075 + }, + { + "epoch": 12.374143580577897, + "grad_norm": 0.044921875, + "learning_rate": 0.011437268272787803, + "loss": 0.7892, + "num_input_tokens_seen": 48239200, + "step": 83080 + }, + { + "epoch": 12.374888293118856, + "grad_norm": 0.048828125, + "learning_rate": 0.011435374441746388, + "loss": 0.7814, + "num_input_tokens_seen": 48241952, + "step": 83085 + }, + { + "epoch": 12.375633005659815, + "grad_norm": 0.04150390625, + "learning_rate": 0.011433480670925703, + "loss": 0.7919, + "num_input_tokens_seen": 48244864, + "step": 83090 + }, + { + "epoch": 12.376377718200775, + "grad_norm": 0.02880859375, + "learning_rate": 0.011431586960357738, + "loss": 0.7837, + "num_input_tokens_seen": 48247680, + "step": 83095 + }, + { + "epoch": 12.377122430741734, + "grad_norm": 0.0380859375, + "learning_rate": 0.011429693310074476, + "loss": 0.8014, + "num_input_tokens_seen": 48250912, + "step": 83100 + }, + { + "epoch": 12.377867143282693, + "grad_norm": 0.0203857421875, + "learning_rate": 0.011427799720107925, + "loss": 0.7769, + "num_input_tokens_seen": 48253696, + "step": 83105 + }, + { + "epoch": 12.378611855823651, + "grad_norm": 0.03955078125, + "learning_rate": 0.01142590619049006, + "loss": 0.8167, + "num_input_tokens_seen": 48256320, + "step": 83110 + }, + { + "epoch": 12.379356568364612, + "grad_norm": 0.07177734375, + "learning_rate": 0.011424012721252883, + "loss": 0.8202, + "num_input_tokens_seen": 48259360, + "step": 83115 + }, + { + "epoch": 12.38010128090557, + "grad_norm": 0.04150390625, + "learning_rate": 0.011422119312428375, + "loss": 0.8441, + "num_input_tokens_seen": 48262336, + "step": 83120 + }, + { + "epoch": 12.38084599344653, + "grad_norm": 0.029541015625, + "learning_rate": 0.011420225964048526, + "loss": 0.7966, + "num_input_tokens_seen": 48265248, + "step": 83125 + }, + { + "epoch": 12.381590705987488, + "grad_norm": 0.04443359375, + "learning_rate": 0.011418332676145317, + "loss": 0.8106, + "num_input_tokens_seen": 48268128, + "step": 83130 + }, + { + "epoch": 12.382335418528449, + "grad_norm": 0.031494140625, + "learning_rate": 0.011416439448750744, + "loss": 0.7876, + "num_input_tokens_seen": 48271008, + "step": 83135 + }, + { + "epoch": 12.383080131069407, + "grad_norm": 0.0216064453125, + "learning_rate": 0.011414546281896781, + "loss": 0.7875, + "num_input_tokens_seen": 48274368, + "step": 83140 + }, + { + "epoch": 12.383824843610366, + "grad_norm": 0.033935546875, + "learning_rate": 0.01141265317561541, + "loss": 0.7905, + "num_input_tokens_seen": 48277088, + "step": 83145 + }, + { + "epoch": 12.384569556151325, + "grad_norm": 0.0250244140625, + "learning_rate": 0.011410760129938623, + "loss": 0.8035, + "num_input_tokens_seen": 48280000, + "step": 83150 + }, + { + "epoch": 12.385314268692285, + "grad_norm": 0.0277099609375, + "learning_rate": 0.011408867144898393, + "loss": 0.8016, + "num_input_tokens_seen": 48282752, + "step": 83155 + }, + { + "epoch": 12.386058981233244, + "grad_norm": 0.04443359375, + "learning_rate": 0.011406974220526707, + "loss": 0.8019, + "num_input_tokens_seen": 48285728, + "step": 83160 + }, + { + "epoch": 12.386803693774203, + "grad_norm": 0.02001953125, + "learning_rate": 0.011405081356855534, + "loss": 0.7989, + "num_input_tokens_seen": 48289408, + "step": 83165 + }, + { + "epoch": 12.387548406315162, + "grad_norm": 0.0400390625, + "learning_rate": 0.011403188553916862, + "loss": 0.8077, + "num_input_tokens_seen": 48292576, + "step": 83170 + }, + { + "epoch": 12.388293118856122, + "grad_norm": 0.03076171875, + "learning_rate": 0.011401295811742663, + "loss": 0.8115, + "num_input_tokens_seen": 48295872, + "step": 83175 + }, + { + "epoch": 12.389037831397081, + "grad_norm": 0.041748046875, + "learning_rate": 0.011399403130364912, + "loss": 0.779, + "num_input_tokens_seen": 48298336, + "step": 83180 + }, + { + "epoch": 12.38978254393804, + "grad_norm": 0.043212890625, + "learning_rate": 0.011397510509815587, + "loss": 0.7988, + "num_input_tokens_seen": 48300960, + "step": 83185 + }, + { + "epoch": 12.390527256478999, + "grad_norm": 0.038818359375, + "learning_rate": 0.01139561795012666, + "loss": 0.7874, + "num_input_tokens_seen": 48303616, + "step": 83190 + }, + { + "epoch": 12.391271969019959, + "grad_norm": 0.03173828125, + "learning_rate": 0.011393725451330106, + "loss": 0.8122, + "num_input_tokens_seen": 48306656, + "step": 83195 + }, + { + "epoch": 12.392016681560918, + "grad_norm": 0.03076171875, + "learning_rate": 0.011391833013457888, + "loss": 0.8118, + "num_input_tokens_seen": 48309696, + "step": 83200 + }, + { + "epoch": 12.392761394101877, + "grad_norm": 0.0361328125, + "learning_rate": 0.011389940636541994, + "loss": 0.8117, + "num_input_tokens_seen": 48312608, + "step": 83205 + }, + { + "epoch": 12.393506106642835, + "grad_norm": 0.033935546875, + "learning_rate": 0.011388048320614377, + "loss": 0.7941, + "num_input_tokens_seen": 48315456, + "step": 83210 + }, + { + "epoch": 12.394250819183796, + "grad_norm": 0.0341796875, + "learning_rate": 0.01138615606570702, + "loss": 0.8059, + "num_input_tokens_seen": 48318720, + "step": 83215 + }, + { + "epoch": 12.394995531724755, + "grad_norm": 0.03955078125, + "learning_rate": 0.01138426387185188, + "loss": 0.8014, + "num_input_tokens_seen": 48321792, + "step": 83220 + }, + { + "epoch": 12.395740244265713, + "grad_norm": 0.037109375, + "learning_rate": 0.01138237173908092, + "loss": 0.7861, + "num_input_tokens_seen": 48324544, + "step": 83225 + }, + { + "epoch": 12.396484956806672, + "grad_norm": 0.022216796875, + "learning_rate": 0.01138047966742612, + "loss": 0.8009, + "num_input_tokens_seen": 48327520, + "step": 83230 + }, + { + "epoch": 12.397229669347631, + "grad_norm": 0.0380859375, + "learning_rate": 0.011378587656919431, + "loss": 0.8191, + "num_input_tokens_seen": 48330080, + "step": 83235 + }, + { + "epoch": 12.397974381888591, + "grad_norm": 0.0322265625, + "learning_rate": 0.01137669570759283, + "loss": 0.7823, + "num_input_tokens_seen": 48333184, + "step": 83240 + }, + { + "epoch": 12.39871909442955, + "grad_norm": 0.0322265625, + "learning_rate": 0.01137480381947827, + "loss": 0.8195, + "num_input_tokens_seen": 48336160, + "step": 83245 + }, + { + "epoch": 12.399463806970509, + "grad_norm": 0.03125, + "learning_rate": 0.011372911992607718, + "loss": 0.7979, + "num_input_tokens_seen": 48339616, + "step": 83250 + }, + { + "epoch": 12.400208519511468, + "grad_norm": 0.042724609375, + "learning_rate": 0.011371020227013131, + "loss": 0.7992, + "num_input_tokens_seen": 48342592, + "step": 83255 + }, + { + "epoch": 12.400953232052428, + "grad_norm": 0.034912109375, + "learning_rate": 0.011369128522726462, + "loss": 0.8078, + "num_input_tokens_seen": 48345440, + "step": 83260 + }, + { + "epoch": 12.401697944593387, + "grad_norm": 0.031982421875, + "learning_rate": 0.01136723687977968, + "loss": 0.7932, + "num_input_tokens_seen": 48348448, + "step": 83265 + }, + { + "epoch": 12.402442657134346, + "grad_norm": 0.05078125, + "learning_rate": 0.011365345298204736, + "loss": 0.8078, + "num_input_tokens_seen": 48351072, + "step": 83270 + }, + { + "epoch": 12.403187369675305, + "grad_norm": 0.047119140625, + "learning_rate": 0.011363453778033597, + "loss": 0.7717, + "num_input_tokens_seen": 48353760, + "step": 83275 + }, + { + "epoch": 12.403932082216265, + "grad_norm": 0.03857421875, + "learning_rate": 0.011361562319298205, + "loss": 0.7908, + "num_input_tokens_seen": 48356768, + "step": 83280 + }, + { + "epoch": 12.404676794757224, + "grad_norm": 0.052490234375, + "learning_rate": 0.011359670922030522, + "loss": 0.7964, + "num_input_tokens_seen": 48359744, + "step": 83285 + }, + { + "epoch": 12.405421507298183, + "grad_norm": 0.042236328125, + "learning_rate": 0.011357779586262492, + "loss": 0.8004, + "num_input_tokens_seen": 48362720, + "step": 83290 + }, + { + "epoch": 12.406166219839141, + "grad_norm": 0.04248046875, + "learning_rate": 0.011355888312026082, + "loss": 0.7896, + "num_input_tokens_seen": 48366016, + "step": 83295 + }, + { + "epoch": 12.406910932380102, + "grad_norm": 0.0299072265625, + "learning_rate": 0.011353997099353235, + "loss": 0.7976, + "num_input_tokens_seen": 48369056, + "step": 83300 + }, + { + "epoch": 12.40765564492106, + "grad_norm": 0.035888671875, + "learning_rate": 0.011352105948275897, + "loss": 0.7938, + "num_input_tokens_seen": 48371968, + "step": 83305 + }, + { + "epoch": 12.40840035746202, + "grad_norm": 0.01953125, + "learning_rate": 0.011350214858826025, + "loss": 0.8035, + "num_input_tokens_seen": 48375008, + "step": 83310 + }, + { + "epoch": 12.409145070002978, + "grad_norm": 0.0208740234375, + "learning_rate": 0.01134832383103556, + "loss": 0.8043, + "num_input_tokens_seen": 48377792, + "step": 83315 + }, + { + "epoch": 12.409889782543939, + "grad_norm": 0.0478515625, + "learning_rate": 0.011346432864936458, + "loss": 0.7557, + "num_input_tokens_seen": 48380832, + "step": 83320 + }, + { + "epoch": 12.410634495084897, + "grad_norm": 0.04345703125, + "learning_rate": 0.011344541960560652, + "loss": 0.7968, + "num_input_tokens_seen": 48383616, + "step": 83325 + }, + { + "epoch": 12.411379207625856, + "grad_norm": 0.03466796875, + "learning_rate": 0.0113426511179401, + "loss": 0.8273, + "num_input_tokens_seen": 48386112, + "step": 83330 + }, + { + "epoch": 12.412123920166815, + "grad_norm": 0.0308837890625, + "learning_rate": 0.011340760337106742, + "loss": 0.8027, + "num_input_tokens_seen": 48388896, + "step": 83335 + }, + { + "epoch": 12.412868632707776, + "grad_norm": 0.0205078125, + "learning_rate": 0.011338869618092511, + "loss": 0.8269, + "num_input_tokens_seen": 48391616, + "step": 83340 + }, + { + "epoch": 12.413613345248734, + "grad_norm": 0.0262451171875, + "learning_rate": 0.011336978960929363, + "loss": 0.7871, + "num_input_tokens_seen": 48394400, + "step": 83345 + }, + { + "epoch": 12.414358057789693, + "grad_norm": 0.04345703125, + "learning_rate": 0.01133508836564923, + "loss": 0.8312, + "num_input_tokens_seen": 48397152, + "step": 83350 + }, + { + "epoch": 12.415102770330652, + "grad_norm": 0.03515625, + "learning_rate": 0.011333197832284059, + "loss": 0.7966, + "num_input_tokens_seen": 48400000, + "step": 83355 + }, + { + "epoch": 12.415847482871612, + "grad_norm": 0.0274658203125, + "learning_rate": 0.011331307360865775, + "loss": 0.7929, + "num_input_tokens_seen": 48402688, + "step": 83360 + }, + { + "epoch": 12.416592195412571, + "grad_norm": 0.054443359375, + "learning_rate": 0.011329416951426332, + "loss": 0.7711, + "num_input_tokens_seen": 48405536, + "step": 83365 + }, + { + "epoch": 12.41733690795353, + "grad_norm": 0.040283203125, + "learning_rate": 0.011327526603997651, + "loss": 0.8134, + "num_input_tokens_seen": 48408448, + "step": 83370 + }, + { + "epoch": 12.418081620494489, + "grad_norm": 0.038330078125, + "learning_rate": 0.011325636318611684, + "loss": 0.8082, + "num_input_tokens_seen": 48411264, + "step": 83375 + }, + { + "epoch": 12.41882633303545, + "grad_norm": 0.0208740234375, + "learning_rate": 0.011323746095300357, + "loss": 0.7804, + "num_input_tokens_seen": 48414304, + "step": 83380 + }, + { + "epoch": 12.419571045576408, + "grad_norm": 0.0311279296875, + "learning_rate": 0.011321855934095598, + "loss": 0.7966, + "num_input_tokens_seen": 48417376, + "step": 83385 + }, + { + "epoch": 12.420315758117367, + "grad_norm": 0.0517578125, + "learning_rate": 0.011319965835029349, + "loss": 0.8005, + "num_input_tokens_seen": 48420288, + "step": 83390 + }, + { + "epoch": 12.421060470658325, + "grad_norm": 0.033935546875, + "learning_rate": 0.01131807579813353, + "loss": 0.7871, + "num_input_tokens_seen": 48423296, + "step": 83395 + }, + { + "epoch": 12.421805183199286, + "grad_norm": 0.0654296875, + "learning_rate": 0.011316185823440086, + "loss": 0.8154, + "num_input_tokens_seen": 48426560, + "step": 83400 + }, + { + "epoch": 12.422549895740245, + "grad_norm": 0.048583984375, + "learning_rate": 0.011314295910980932, + "loss": 0.8127, + "num_input_tokens_seen": 48429504, + "step": 83405 + }, + { + "epoch": 12.423294608281203, + "grad_norm": 0.0537109375, + "learning_rate": 0.011312406060788009, + "loss": 0.7972, + "num_input_tokens_seen": 48432288, + "step": 83410 + }, + { + "epoch": 12.424039320822162, + "grad_norm": 0.05517578125, + "learning_rate": 0.011310516272893232, + "loss": 0.8119, + "num_input_tokens_seen": 48435072, + "step": 83415 + }, + { + "epoch": 12.424784033363121, + "grad_norm": 0.04541015625, + "learning_rate": 0.011308626547328539, + "loss": 0.7812, + "num_input_tokens_seen": 48437728, + "step": 83420 + }, + { + "epoch": 12.425528745904082, + "grad_norm": 0.0228271484375, + "learning_rate": 0.011306736884125848, + "loss": 0.7776, + "num_input_tokens_seen": 48440672, + "step": 83425 + }, + { + "epoch": 12.42627345844504, + "grad_norm": 0.357421875, + "learning_rate": 0.011304847283317078, + "loss": 0.8093, + "num_input_tokens_seen": 48443552, + "step": 83430 + }, + { + "epoch": 12.427018170985999, + "grad_norm": 0.035888671875, + "learning_rate": 0.01130295774493416, + "loss": 0.8095, + "num_input_tokens_seen": 48446368, + "step": 83435 + }, + { + "epoch": 12.427762883526958, + "grad_norm": 0.04296875, + "learning_rate": 0.011301068269009012, + "loss": 0.7781, + "num_input_tokens_seen": 48449248, + "step": 83440 + }, + { + "epoch": 12.428507596067918, + "grad_norm": 0.03466796875, + "learning_rate": 0.01129917885557356, + "loss": 0.807, + "num_input_tokens_seen": 48452320, + "step": 83445 + }, + { + "epoch": 12.429252308608877, + "grad_norm": 0.0673828125, + "learning_rate": 0.011297289504659715, + "loss": 0.8204, + "num_input_tokens_seen": 48455008, + "step": 83450 + }, + { + "epoch": 12.429997021149836, + "grad_norm": 0.038818359375, + "learning_rate": 0.011295400216299408, + "loss": 0.8123, + "num_input_tokens_seen": 48458080, + "step": 83455 + }, + { + "epoch": 12.430741733690795, + "grad_norm": 0.027587890625, + "learning_rate": 0.011293510990524547, + "loss": 0.834, + "num_input_tokens_seen": 48460928, + "step": 83460 + }, + { + "epoch": 12.431486446231755, + "grad_norm": 0.0277099609375, + "learning_rate": 0.011291621827367045, + "loss": 0.8056, + "num_input_tokens_seen": 48464448, + "step": 83465 + }, + { + "epoch": 12.432231158772714, + "grad_norm": 0.0281982421875, + "learning_rate": 0.011289732726858829, + "loss": 0.827, + "num_input_tokens_seen": 48467264, + "step": 83470 + }, + { + "epoch": 12.432975871313673, + "grad_norm": 0.043212890625, + "learning_rate": 0.0112878436890318, + "loss": 0.8008, + "num_input_tokens_seen": 48470080, + "step": 83475 + }, + { + "epoch": 12.433720583854631, + "grad_norm": 0.030517578125, + "learning_rate": 0.011285954713917886, + "loss": 0.8097, + "num_input_tokens_seen": 48473152, + "step": 83480 + }, + { + "epoch": 12.434465296395592, + "grad_norm": 0.046630859375, + "learning_rate": 0.01128406580154899, + "loss": 0.8204, + "num_input_tokens_seen": 48475840, + "step": 83485 + }, + { + "epoch": 12.43521000893655, + "grad_norm": 0.04736328125, + "learning_rate": 0.011282176951957028, + "loss": 0.8264, + "num_input_tokens_seen": 48479104, + "step": 83490 + }, + { + "epoch": 12.43595472147751, + "grad_norm": 0.056884765625, + "learning_rate": 0.011280288165173902, + "loss": 0.797, + "num_input_tokens_seen": 48482144, + "step": 83495 + }, + { + "epoch": 12.436699434018468, + "grad_norm": 0.04296875, + "learning_rate": 0.011278399441231534, + "loss": 0.8003, + "num_input_tokens_seen": 48484992, + "step": 83500 + }, + { + "epoch": 12.437444146559429, + "grad_norm": 0.03369140625, + "learning_rate": 0.011276510780161824, + "loss": 0.8127, + "num_input_tokens_seen": 48488032, + "step": 83505 + }, + { + "epoch": 12.438188859100388, + "grad_norm": 0.044677734375, + "learning_rate": 0.011274622181996676, + "loss": 0.8055, + "num_input_tokens_seen": 48491040, + "step": 83510 + }, + { + "epoch": 12.438933571641346, + "grad_norm": 0.033447265625, + "learning_rate": 0.011272733646768002, + "loss": 0.8116, + "num_input_tokens_seen": 48493792, + "step": 83515 + }, + { + "epoch": 12.439678284182305, + "grad_norm": 0.042236328125, + "learning_rate": 0.011270845174507703, + "loss": 0.8058, + "num_input_tokens_seen": 48496576, + "step": 83520 + }, + { + "epoch": 12.440422996723266, + "grad_norm": 0.0289306640625, + "learning_rate": 0.01126895676524769, + "loss": 0.7854, + "num_input_tokens_seen": 48499296, + "step": 83525 + }, + { + "epoch": 12.441167709264224, + "grad_norm": 0.0289306640625, + "learning_rate": 0.01126706841901985, + "loss": 0.8, + "num_input_tokens_seen": 48501984, + "step": 83530 + }, + { + "epoch": 12.441912421805183, + "grad_norm": 0.033447265625, + "learning_rate": 0.011265180135856104, + "loss": 0.813, + "num_input_tokens_seen": 48504800, + "step": 83535 + }, + { + "epoch": 12.442657134346142, + "grad_norm": 0.033935546875, + "learning_rate": 0.011263291915788344, + "loss": 0.8148, + "num_input_tokens_seen": 48507520, + "step": 83540 + }, + { + "epoch": 12.443401846887102, + "grad_norm": 0.03955078125, + "learning_rate": 0.011261403758848466, + "loss": 0.8186, + "num_input_tokens_seen": 48510560, + "step": 83545 + }, + { + "epoch": 12.444146559428061, + "grad_norm": 0.0225830078125, + "learning_rate": 0.011259515665068375, + "loss": 0.8074, + "num_input_tokens_seen": 48513440, + "step": 83550 + }, + { + "epoch": 12.44489127196902, + "grad_norm": 0.03857421875, + "learning_rate": 0.011257627634479959, + "loss": 0.8097, + "num_input_tokens_seen": 48516352, + "step": 83555 + }, + { + "epoch": 12.445635984509979, + "grad_norm": 0.036376953125, + "learning_rate": 0.011255739667115126, + "loss": 0.8023, + "num_input_tokens_seen": 48519584, + "step": 83560 + }, + { + "epoch": 12.44638069705094, + "grad_norm": 0.0216064453125, + "learning_rate": 0.01125385176300576, + "loss": 0.7865, + "num_input_tokens_seen": 48522240, + "step": 83565 + }, + { + "epoch": 12.447125409591898, + "grad_norm": 0.04443359375, + "learning_rate": 0.011251963922183767, + "loss": 0.7819, + "num_input_tokens_seen": 48525248, + "step": 83570 + }, + { + "epoch": 12.447870122132857, + "grad_norm": 0.04248046875, + "learning_rate": 0.011250076144681032, + "loss": 0.782, + "num_input_tokens_seen": 48528032, + "step": 83575 + }, + { + "epoch": 12.448614834673815, + "grad_norm": 0.0228271484375, + "learning_rate": 0.011248188430529452, + "loss": 0.8004, + "num_input_tokens_seen": 48530848, + "step": 83580 + }, + { + "epoch": 12.449359547214776, + "grad_norm": 0.040771484375, + "learning_rate": 0.011246300779760915, + "loss": 0.7779, + "num_input_tokens_seen": 48533632, + "step": 83585 + }, + { + "epoch": 12.450104259755735, + "grad_norm": 0.02392578125, + "learning_rate": 0.011244413192407304, + "loss": 0.7865, + "num_input_tokens_seen": 48536608, + "step": 83590 + }, + { + "epoch": 12.450848972296694, + "grad_norm": 0.05224609375, + "learning_rate": 0.011242525668500522, + "loss": 0.805, + "num_input_tokens_seen": 48539584, + "step": 83595 + }, + { + "epoch": 12.451593684837652, + "grad_norm": 0.03955078125, + "learning_rate": 0.011240638208072444, + "loss": 0.8099, + "num_input_tokens_seen": 48542688, + "step": 83600 + }, + { + "epoch": 12.452338397378611, + "grad_norm": 0.047607421875, + "learning_rate": 0.011238750811154968, + "loss": 0.7942, + "num_input_tokens_seen": 48545760, + "step": 83605 + }, + { + "epoch": 12.453083109919572, + "grad_norm": 0.03759765625, + "learning_rate": 0.011236863477779972, + "loss": 0.8037, + "num_input_tokens_seen": 48548608, + "step": 83610 + }, + { + "epoch": 12.45382782246053, + "grad_norm": 0.02490234375, + "learning_rate": 0.011234976207979348, + "loss": 0.7838, + "num_input_tokens_seen": 48551424, + "step": 83615 + }, + { + "epoch": 12.454572535001489, + "grad_norm": 0.02685546875, + "learning_rate": 0.011233089001784971, + "loss": 0.81, + "num_input_tokens_seen": 48554176, + "step": 83620 + }, + { + "epoch": 12.455317247542448, + "grad_norm": 0.02099609375, + "learning_rate": 0.011231201859228723, + "loss": 0.8062, + "num_input_tokens_seen": 48556672, + "step": 83625 + }, + { + "epoch": 12.456061960083408, + "grad_norm": 0.030029296875, + "learning_rate": 0.011229314780342494, + "loss": 0.7971, + "num_input_tokens_seen": 48559520, + "step": 83630 + }, + { + "epoch": 12.456806672624367, + "grad_norm": 0.0322265625, + "learning_rate": 0.011227427765158154, + "loss": 0.7909, + "num_input_tokens_seen": 48562592, + "step": 83635 + }, + { + "epoch": 12.457551385165326, + "grad_norm": 0.034423828125, + "learning_rate": 0.011225540813707592, + "loss": 0.8072, + "num_input_tokens_seen": 48565376, + "step": 83640 + }, + { + "epoch": 12.458296097706285, + "grad_norm": 0.0595703125, + "learning_rate": 0.01122365392602268, + "loss": 0.8107, + "num_input_tokens_seen": 48568192, + "step": 83645 + }, + { + "epoch": 12.459040810247245, + "grad_norm": 0.03173828125, + "learning_rate": 0.0112217671021353, + "loss": 0.8026, + "num_input_tokens_seen": 48571072, + "step": 83650 + }, + { + "epoch": 12.459785522788204, + "grad_norm": 0.02685546875, + "learning_rate": 0.01121988034207732, + "loss": 0.8245, + "num_input_tokens_seen": 48573888, + "step": 83655 + }, + { + "epoch": 12.460530235329163, + "grad_norm": 0.040771484375, + "learning_rate": 0.011217993645880622, + "loss": 0.8096, + "num_input_tokens_seen": 48576768, + "step": 83660 + }, + { + "epoch": 12.461274947870121, + "grad_norm": 0.03564453125, + "learning_rate": 0.011216107013577082, + "loss": 0.7831, + "num_input_tokens_seen": 48579712, + "step": 83665 + }, + { + "epoch": 12.462019660411082, + "grad_norm": 0.032470703125, + "learning_rate": 0.011214220445198559, + "loss": 0.7814, + "num_input_tokens_seen": 48583072, + "step": 83670 + }, + { + "epoch": 12.46276437295204, + "grad_norm": 0.02978515625, + "learning_rate": 0.011212333940776942, + "loss": 0.8117, + "num_input_tokens_seen": 48585728, + "step": 83675 + }, + { + "epoch": 12.463509085493, + "grad_norm": 0.03515625, + "learning_rate": 0.011210447500344087, + "loss": 0.7876, + "num_input_tokens_seen": 48588256, + "step": 83680 + }, + { + "epoch": 12.464253798033958, + "grad_norm": 0.0303955078125, + "learning_rate": 0.011208561123931877, + "loss": 0.8254, + "num_input_tokens_seen": 48591168, + "step": 83685 + }, + { + "epoch": 12.464998510574919, + "grad_norm": 0.03662109375, + "learning_rate": 0.011206674811572165, + "loss": 0.8157, + "num_input_tokens_seen": 48594112, + "step": 83690 + }, + { + "epoch": 12.465743223115878, + "grad_norm": 0.0341796875, + "learning_rate": 0.011204788563296835, + "loss": 0.7974, + "num_input_tokens_seen": 48597024, + "step": 83695 + }, + { + "epoch": 12.466487935656836, + "grad_norm": 0.035400390625, + "learning_rate": 0.011202902379137743, + "loss": 0.8037, + "num_input_tokens_seen": 48600288, + "step": 83700 + }, + { + "epoch": 12.467232648197795, + "grad_norm": 0.045166015625, + "learning_rate": 0.01120101625912675, + "loss": 0.8099, + "num_input_tokens_seen": 48603648, + "step": 83705 + }, + { + "epoch": 12.467977360738756, + "grad_norm": 0.05712890625, + "learning_rate": 0.011199130203295734, + "loss": 0.8182, + "num_input_tokens_seen": 48606528, + "step": 83710 + }, + { + "epoch": 12.468722073279714, + "grad_norm": 0.049560546875, + "learning_rate": 0.011197244211676545, + "loss": 0.8093, + "num_input_tokens_seen": 48609376, + "step": 83715 + }, + { + "epoch": 12.469466785820673, + "grad_norm": 0.0244140625, + "learning_rate": 0.011195358284301054, + "loss": 0.8005, + "num_input_tokens_seen": 48612320, + "step": 83720 + }, + { + "epoch": 12.470211498361632, + "grad_norm": 0.03857421875, + "learning_rate": 0.011193472421201112, + "loss": 0.8081, + "num_input_tokens_seen": 48615072, + "step": 83725 + }, + { + "epoch": 12.470956210902592, + "grad_norm": 0.050537109375, + "learning_rate": 0.01119158662240859, + "loss": 0.8155, + "num_input_tokens_seen": 48618304, + "step": 83730 + }, + { + "epoch": 12.471700923443551, + "grad_norm": 0.0294189453125, + "learning_rate": 0.011189700887955336, + "loss": 0.7817, + "num_input_tokens_seen": 48621152, + "step": 83735 + }, + { + "epoch": 12.47244563598451, + "grad_norm": 0.044189453125, + "learning_rate": 0.011187815217873218, + "loss": 0.7907, + "num_input_tokens_seen": 48624224, + "step": 83740 + }, + { + "epoch": 12.473190348525469, + "grad_norm": 0.0272216796875, + "learning_rate": 0.01118592961219409, + "loss": 0.8045, + "num_input_tokens_seen": 48627296, + "step": 83745 + }, + { + "epoch": 12.473935061066427, + "grad_norm": 0.03173828125, + "learning_rate": 0.0111840440709498, + "loss": 0.8148, + "num_input_tokens_seen": 48630336, + "step": 83750 + }, + { + "epoch": 12.474679773607388, + "grad_norm": 0.04052734375, + "learning_rate": 0.011182158594172209, + "loss": 0.7945, + "num_input_tokens_seen": 48633120, + "step": 83755 + }, + { + "epoch": 12.475424486148347, + "grad_norm": 0.033935546875, + "learning_rate": 0.011180273181893163, + "loss": 0.7965, + "num_input_tokens_seen": 48635872, + "step": 83760 + }, + { + "epoch": 12.476169198689306, + "grad_norm": 0.042724609375, + "learning_rate": 0.011178387834144526, + "loss": 0.7971, + "num_input_tokens_seen": 48638720, + "step": 83765 + }, + { + "epoch": 12.476913911230264, + "grad_norm": 0.04541015625, + "learning_rate": 0.011176502550958136, + "loss": 0.7828, + "num_input_tokens_seen": 48641600, + "step": 83770 + }, + { + "epoch": 12.477658623771225, + "grad_norm": 0.03955078125, + "learning_rate": 0.011174617332365855, + "loss": 0.8224, + "num_input_tokens_seen": 48644480, + "step": 83775 + }, + { + "epoch": 12.478403336312184, + "grad_norm": 0.032958984375, + "learning_rate": 0.011172732178399526, + "loss": 0.8118, + "num_input_tokens_seen": 48647328, + "step": 83780 + }, + { + "epoch": 12.479148048853142, + "grad_norm": 0.033935546875, + "learning_rate": 0.011170847089090993, + "loss": 0.8057, + "num_input_tokens_seen": 48650368, + "step": 83785 + }, + { + "epoch": 12.479892761394101, + "grad_norm": 0.035400390625, + "learning_rate": 0.011168962064472113, + "loss": 0.8129, + "num_input_tokens_seen": 48653088, + "step": 83790 + }, + { + "epoch": 12.480637473935062, + "grad_norm": 0.0361328125, + "learning_rate": 0.011167077104574718, + "loss": 0.807, + "num_input_tokens_seen": 48656096, + "step": 83795 + }, + { + "epoch": 12.48138218647602, + "grad_norm": 0.032470703125, + "learning_rate": 0.011165192209430665, + "loss": 0.8058, + "num_input_tokens_seen": 48658976, + "step": 83800 + }, + { + "epoch": 12.48212689901698, + "grad_norm": 0.033935546875, + "learning_rate": 0.011163307379071785, + "loss": 0.8099, + "num_input_tokens_seen": 48661920, + "step": 83805 + }, + { + "epoch": 12.482871611557938, + "grad_norm": 0.048095703125, + "learning_rate": 0.011161422613529936, + "loss": 0.8072, + "num_input_tokens_seen": 48664864, + "step": 83810 + }, + { + "epoch": 12.483616324098898, + "grad_norm": 0.034912109375, + "learning_rate": 0.011159537912836946, + "loss": 0.8024, + "num_input_tokens_seen": 48667520, + "step": 83815 + }, + { + "epoch": 12.484361036639857, + "grad_norm": 0.029296875, + "learning_rate": 0.011157653277024662, + "loss": 0.8083, + "num_input_tokens_seen": 48670336, + "step": 83820 + }, + { + "epoch": 12.485105749180816, + "grad_norm": 0.037109375, + "learning_rate": 0.011155768706124922, + "loss": 0.8143, + "num_input_tokens_seen": 48673184, + "step": 83825 + }, + { + "epoch": 12.485850461721775, + "grad_norm": 0.032958984375, + "learning_rate": 0.011153884200169556, + "loss": 0.8112, + "num_input_tokens_seen": 48676000, + "step": 83830 + }, + { + "epoch": 12.486595174262735, + "grad_norm": 0.048828125, + "learning_rate": 0.011151999759190415, + "loss": 0.7937, + "num_input_tokens_seen": 48678880, + "step": 83835 + }, + { + "epoch": 12.487339886803694, + "grad_norm": 0.04541015625, + "learning_rate": 0.011150115383219323, + "loss": 0.8133, + "num_input_tokens_seen": 48681664, + "step": 83840 + }, + { + "epoch": 12.488084599344653, + "grad_norm": 0.05029296875, + "learning_rate": 0.011148231072288118, + "loss": 0.8097, + "num_input_tokens_seen": 48684320, + "step": 83845 + }, + { + "epoch": 12.488829311885612, + "grad_norm": 0.0306396484375, + "learning_rate": 0.011146346826428634, + "loss": 0.8019, + "num_input_tokens_seen": 48687104, + "step": 83850 + }, + { + "epoch": 12.489574024426572, + "grad_norm": 0.043212890625, + "learning_rate": 0.011144462645672707, + "loss": 0.8009, + "num_input_tokens_seen": 48690048, + "step": 83855 + }, + { + "epoch": 12.49031873696753, + "grad_norm": 0.032958984375, + "learning_rate": 0.011142578530052166, + "loss": 0.808, + "num_input_tokens_seen": 48692992, + "step": 83860 + }, + { + "epoch": 12.49106344950849, + "grad_norm": 0.06396484375, + "learning_rate": 0.011140694479598834, + "loss": 0.8229, + "num_input_tokens_seen": 48695968, + "step": 83865 + }, + { + "epoch": 12.491808162049448, + "grad_norm": 0.046875, + "learning_rate": 0.011138810494344552, + "loss": 0.8096, + "num_input_tokens_seen": 48699200, + "step": 83870 + }, + { + "epoch": 12.492552874590409, + "grad_norm": 0.046142578125, + "learning_rate": 0.011136926574321138, + "loss": 0.8066, + "num_input_tokens_seen": 48702144, + "step": 83875 + }, + { + "epoch": 12.493297587131368, + "grad_norm": 0.032470703125, + "learning_rate": 0.011135042719560428, + "loss": 0.8205, + "num_input_tokens_seen": 48705120, + "step": 83880 + }, + { + "epoch": 12.494042299672326, + "grad_norm": 0.034912109375, + "learning_rate": 0.011133158930094238, + "loss": 0.8001, + "num_input_tokens_seen": 48707840, + "step": 83885 + }, + { + "epoch": 12.494787012213285, + "grad_norm": 0.05615234375, + "learning_rate": 0.0111312752059544, + "loss": 0.8068, + "num_input_tokens_seen": 48710848, + "step": 83890 + }, + { + "epoch": 12.495531724754246, + "grad_norm": 0.049072265625, + "learning_rate": 0.011129391547172735, + "loss": 0.8046, + "num_input_tokens_seen": 48713792, + "step": 83895 + }, + { + "epoch": 12.496276437295204, + "grad_norm": 0.06689453125, + "learning_rate": 0.011127507953781068, + "loss": 0.804, + "num_input_tokens_seen": 48716896, + "step": 83900 + }, + { + "epoch": 12.497021149836163, + "grad_norm": 0.038818359375, + "learning_rate": 0.011125624425811219, + "loss": 0.8104, + "num_input_tokens_seen": 48719552, + "step": 83905 + }, + { + "epoch": 12.497765862377122, + "grad_norm": 0.0390625, + "learning_rate": 0.011123740963295005, + "loss": 0.7924, + "num_input_tokens_seen": 48722144, + "step": 83910 + }, + { + "epoch": 12.498510574918082, + "grad_norm": 0.03466796875, + "learning_rate": 0.011121857566264251, + "loss": 0.7875, + "num_input_tokens_seen": 48725024, + "step": 83915 + }, + { + "epoch": 12.499255287459041, + "grad_norm": 0.035888671875, + "learning_rate": 0.011119974234750767, + "loss": 0.813, + "num_input_tokens_seen": 48728192, + "step": 83920 + }, + { + "epoch": 12.5, + "grad_norm": 0.04443359375, + "learning_rate": 0.011118090968786378, + "loss": 0.7933, + "num_input_tokens_seen": 48731296, + "step": 83925 + }, + { + "epoch": 12.500744712540959, + "grad_norm": 0.052001953125, + "learning_rate": 0.011116207768402891, + "loss": 0.8013, + "num_input_tokens_seen": 48734080, + "step": 83930 + }, + { + "epoch": 12.501489425081918, + "grad_norm": 0.04541015625, + "learning_rate": 0.011114324633632135, + "loss": 0.8086, + "num_input_tokens_seen": 48736736, + "step": 83935 + }, + { + "epoch": 12.502234137622878, + "grad_norm": 0.04541015625, + "learning_rate": 0.011112441564505911, + "loss": 0.8055, + "num_input_tokens_seen": 48739584, + "step": 83940 + }, + { + "epoch": 12.502978850163837, + "grad_norm": 0.06689453125, + "learning_rate": 0.011110558561056036, + "loss": 0.7881, + "num_input_tokens_seen": 48742720, + "step": 83945 + }, + { + "epoch": 12.503723562704796, + "grad_norm": 0.06494140625, + "learning_rate": 0.011108675623314324, + "loss": 0.8053, + "num_input_tokens_seen": 48745696, + "step": 83950 + }, + { + "epoch": 12.504468275245754, + "grad_norm": 0.037841796875, + "learning_rate": 0.011106792751312574, + "loss": 0.781, + "num_input_tokens_seen": 48748416, + "step": 83955 + }, + { + "epoch": 12.505212987786715, + "grad_norm": 0.0361328125, + "learning_rate": 0.011104909945082612, + "loss": 0.8036, + "num_input_tokens_seen": 48751296, + "step": 83960 + }, + { + "epoch": 12.505957700327674, + "grad_norm": 0.038330078125, + "learning_rate": 0.011103027204656231, + "loss": 0.8077, + "num_input_tokens_seen": 48754336, + "step": 83965 + }, + { + "epoch": 12.506702412868632, + "grad_norm": 0.0537109375, + "learning_rate": 0.01110114453006525, + "loss": 0.7889, + "num_input_tokens_seen": 48757440, + "step": 83970 + }, + { + "epoch": 12.507447125409591, + "grad_norm": 0.04052734375, + "learning_rate": 0.011099261921341468, + "loss": 0.8183, + "num_input_tokens_seen": 48760384, + "step": 83975 + }, + { + "epoch": 12.508191837950552, + "grad_norm": 0.0625, + "learning_rate": 0.011097379378516691, + "loss": 0.8005, + "num_input_tokens_seen": 48763296, + "step": 83980 + }, + { + "epoch": 12.50893655049151, + "grad_norm": 0.044189453125, + "learning_rate": 0.011095496901622728, + "loss": 0.7915, + "num_input_tokens_seen": 48766208, + "step": 83985 + }, + { + "epoch": 12.50968126303247, + "grad_norm": 0.057373046875, + "learning_rate": 0.011093614490691368, + "loss": 0.7957, + "num_input_tokens_seen": 48769280, + "step": 83990 + }, + { + "epoch": 12.510425975573428, + "grad_norm": 0.1279296875, + "learning_rate": 0.011091732145754426, + "loss": 0.8024, + "num_input_tokens_seen": 48772128, + "step": 83995 + }, + { + "epoch": 12.511170688114388, + "grad_norm": 0.037353515625, + "learning_rate": 0.01108984986684369, + "loss": 0.8086, + "num_input_tokens_seen": 48775008, + "step": 84000 + }, + { + "epoch": 12.511915400655347, + "grad_norm": 0.068359375, + "learning_rate": 0.011087967653990977, + "loss": 0.7942, + "num_input_tokens_seen": 48777760, + "step": 84005 + }, + { + "epoch": 12.512660113196306, + "grad_norm": 0.041748046875, + "learning_rate": 0.011086085507228066, + "loss": 0.7961, + "num_input_tokens_seen": 48780288, + "step": 84010 + }, + { + "epoch": 12.513404825737265, + "grad_norm": 0.047607421875, + "learning_rate": 0.01108420342658677, + "loss": 0.7965, + "num_input_tokens_seen": 48783168, + "step": 84015 + }, + { + "epoch": 12.514149538278225, + "grad_norm": 0.050537109375, + "learning_rate": 0.011082321412098868, + "loss": 0.8092, + "num_input_tokens_seen": 48786816, + "step": 84020 + }, + { + "epoch": 12.514894250819184, + "grad_norm": 0.08642578125, + "learning_rate": 0.011080439463796173, + "loss": 0.7867, + "num_input_tokens_seen": 48789696, + "step": 84025 + }, + { + "epoch": 12.515638963360143, + "grad_norm": 0.055908203125, + "learning_rate": 0.011078557581710469, + "loss": 0.7953, + "num_input_tokens_seen": 48792576, + "step": 84030 + }, + { + "epoch": 12.516383675901102, + "grad_norm": 0.076171875, + "learning_rate": 0.011076675765873541, + "loss": 0.8104, + "num_input_tokens_seen": 48795168, + "step": 84035 + }, + { + "epoch": 12.517128388442062, + "grad_norm": 0.035888671875, + "learning_rate": 0.011074794016317195, + "loss": 0.8017, + "num_input_tokens_seen": 48798080, + "step": 84040 + }, + { + "epoch": 12.51787310098302, + "grad_norm": 0.033203125, + "learning_rate": 0.01107291233307321, + "loss": 0.7991, + "num_input_tokens_seen": 48801120, + "step": 84045 + }, + { + "epoch": 12.51861781352398, + "grad_norm": 0.056884765625, + "learning_rate": 0.011071030716173387, + "loss": 0.7911, + "num_input_tokens_seen": 48804064, + "step": 84050 + }, + { + "epoch": 12.519362526064938, + "grad_norm": 0.036376953125, + "learning_rate": 0.011069149165649498, + "loss": 0.809, + "num_input_tokens_seen": 48806848, + "step": 84055 + }, + { + "epoch": 12.520107238605899, + "grad_norm": 0.048583984375, + "learning_rate": 0.011067267681533346, + "loss": 0.7942, + "num_input_tokens_seen": 48809888, + "step": 84060 + }, + { + "epoch": 12.520851951146858, + "grad_norm": 0.057861328125, + "learning_rate": 0.011065386263856709, + "loss": 0.8068, + "num_input_tokens_seen": 48812864, + "step": 84065 + }, + { + "epoch": 12.521596663687816, + "grad_norm": 0.0673828125, + "learning_rate": 0.011063504912651366, + "loss": 0.7858, + "num_input_tokens_seen": 48815808, + "step": 84070 + }, + { + "epoch": 12.522341376228775, + "grad_norm": 0.07421875, + "learning_rate": 0.011061623627949112, + "loss": 0.7904, + "num_input_tokens_seen": 48818592, + "step": 84075 + }, + { + "epoch": 12.523086088769734, + "grad_norm": 0.046875, + "learning_rate": 0.011059742409781723, + "loss": 0.8038, + "num_input_tokens_seen": 48821376, + "step": 84080 + }, + { + "epoch": 12.523830801310694, + "grad_norm": 0.07666015625, + "learning_rate": 0.011057861258180984, + "loss": 0.7983, + "num_input_tokens_seen": 48824224, + "step": 84085 + }, + { + "epoch": 12.524575513851653, + "grad_norm": 0.0634765625, + "learning_rate": 0.011055980173178664, + "loss": 0.7838, + "num_input_tokens_seen": 48827040, + "step": 84090 + }, + { + "epoch": 12.525320226392612, + "grad_norm": 0.037841796875, + "learning_rate": 0.01105409915480656, + "loss": 0.7955, + "num_input_tokens_seen": 48829696, + "step": 84095 + }, + { + "epoch": 12.526064938933573, + "grad_norm": 0.0458984375, + "learning_rate": 0.011052218203096431, + "loss": 0.866, + "num_input_tokens_seen": 48832864, + "step": 84100 + }, + { + "epoch": 12.526809651474531, + "grad_norm": 0.06298828125, + "learning_rate": 0.01105033731808007, + "loss": 0.7834, + "num_input_tokens_seen": 48835584, + "step": 84105 + }, + { + "epoch": 12.52755436401549, + "grad_norm": 0.05224609375, + "learning_rate": 0.011048456499789246, + "loss": 0.7902, + "num_input_tokens_seen": 48838432, + "step": 84110 + }, + { + "epoch": 12.528299076556449, + "grad_norm": 0.2099609375, + "learning_rate": 0.011046575748255731, + "loss": 0.8539, + "num_input_tokens_seen": 48841664, + "step": 84115 + }, + { + "epoch": 12.529043789097408, + "grad_norm": 0.054931640625, + "learning_rate": 0.011044695063511304, + "loss": 0.795, + "num_input_tokens_seen": 48844448, + "step": 84120 + }, + { + "epoch": 12.529788501638368, + "grad_norm": 0.036865234375, + "learning_rate": 0.011042814445587728, + "loss": 0.7987, + "num_input_tokens_seen": 48847456, + "step": 84125 + }, + { + "epoch": 12.530533214179327, + "grad_norm": 0.051513671875, + "learning_rate": 0.011040933894516783, + "loss": 0.7979, + "num_input_tokens_seen": 48850464, + "step": 84130 + }, + { + "epoch": 12.531277926720286, + "grad_norm": 0.02734375, + "learning_rate": 0.011039053410330232, + "loss": 0.7932, + "num_input_tokens_seen": 48853248, + "step": 84135 + }, + { + "epoch": 12.532022639261244, + "grad_norm": 0.035888671875, + "learning_rate": 0.011037172993059853, + "loss": 0.7866, + "num_input_tokens_seen": 48856032, + "step": 84140 + }, + { + "epoch": 12.532767351802205, + "grad_norm": 0.029052734375, + "learning_rate": 0.01103529264273741, + "loss": 0.8069, + "num_input_tokens_seen": 48859072, + "step": 84145 + }, + { + "epoch": 12.533512064343164, + "grad_norm": 0.060791015625, + "learning_rate": 0.011033412359394668, + "loss": 0.8076, + "num_input_tokens_seen": 48862176, + "step": 84150 + }, + { + "epoch": 12.534256776884122, + "grad_norm": 0.034912109375, + "learning_rate": 0.011031532143063391, + "loss": 0.8068, + "num_input_tokens_seen": 48865184, + "step": 84155 + }, + { + "epoch": 12.535001489425081, + "grad_norm": 0.033203125, + "learning_rate": 0.01102965199377534, + "loss": 0.7965, + "num_input_tokens_seen": 48868096, + "step": 84160 + }, + { + "epoch": 12.535746201966042, + "grad_norm": 0.1552734375, + "learning_rate": 0.01102777191156229, + "loss": 0.8527, + "num_input_tokens_seen": 48870784, + "step": 84165 + }, + { + "epoch": 12.536490914507, + "grad_norm": 0.03564453125, + "learning_rate": 0.011025891896455994, + "loss": 0.7919, + "num_input_tokens_seen": 48873408, + "step": 84170 + }, + { + "epoch": 12.53723562704796, + "grad_norm": 0.033203125, + "learning_rate": 0.011024011948488215, + "loss": 0.7917, + "num_input_tokens_seen": 48876256, + "step": 84175 + }, + { + "epoch": 12.537980339588918, + "grad_norm": 0.04296875, + "learning_rate": 0.011022132067690711, + "loss": 0.7972, + "num_input_tokens_seen": 48878944, + "step": 84180 + }, + { + "epoch": 12.538725052129879, + "grad_norm": 0.054931640625, + "learning_rate": 0.01102025225409525, + "loss": 0.7918, + "num_input_tokens_seen": 48882080, + "step": 84185 + }, + { + "epoch": 12.539469764670837, + "grad_norm": 0.0238037109375, + "learning_rate": 0.01101837250773358, + "loss": 0.8067, + "num_input_tokens_seen": 48884896, + "step": 84190 + }, + { + "epoch": 12.540214477211796, + "grad_norm": 0.036865234375, + "learning_rate": 0.011016492828637451, + "loss": 0.8065, + "num_input_tokens_seen": 48887840, + "step": 84195 + }, + { + "epoch": 12.540959189752755, + "grad_norm": 0.02880859375, + "learning_rate": 0.011014613216838637, + "loss": 0.7942, + "num_input_tokens_seen": 48890560, + "step": 84200 + }, + { + "epoch": 12.541703902293715, + "grad_norm": 0.06494140625, + "learning_rate": 0.011012733672368876, + "loss": 0.7947, + "num_input_tokens_seen": 48893536, + "step": 84205 + }, + { + "epoch": 12.542448614834674, + "grad_norm": 0.04736328125, + "learning_rate": 0.011010854195259934, + "loss": 0.799, + "num_input_tokens_seen": 48896704, + "step": 84210 + }, + { + "epoch": 12.543193327375633, + "grad_norm": 0.03466796875, + "learning_rate": 0.011008974785543546, + "loss": 0.781, + "num_input_tokens_seen": 48899616, + "step": 84215 + }, + { + "epoch": 12.543938039916592, + "grad_norm": 0.03173828125, + "learning_rate": 0.011007095443251482, + "loss": 0.8006, + "num_input_tokens_seen": 48902496, + "step": 84220 + }, + { + "epoch": 12.544682752457552, + "grad_norm": 0.04248046875, + "learning_rate": 0.011005216168415481, + "loss": 0.7841, + "num_input_tokens_seen": 48905472, + "step": 84225 + }, + { + "epoch": 12.545427464998511, + "grad_norm": 0.040771484375, + "learning_rate": 0.011003336961067285, + "loss": 0.7968, + "num_input_tokens_seen": 48908704, + "step": 84230 + }, + { + "epoch": 12.54617217753947, + "grad_norm": 0.048828125, + "learning_rate": 0.011001457821238658, + "loss": 0.7884, + "num_input_tokens_seen": 48911584, + "step": 84235 + }, + { + "epoch": 12.546916890080428, + "grad_norm": 0.032470703125, + "learning_rate": 0.010999578748961332, + "loss": 0.7869, + "num_input_tokens_seen": 48914144, + "step": 84240 + }, + { + "epoch": 12.547661602621389, + "grad_norm": 0.0537109375, + "learning_rate": 0.010997699744267063, + "loss": 0.8032, + "num_input_tokens_seen": 48916832, + "step": 84245 + }, + { + "epoch": 12.548406315162348, + "grad_norm": 0.043212890625, + "learning_rate": 0.010995820807187579, + "loss": 0.7969, + "num_input_tokens_seen": 48919552, + "step": 84250 + }, + { + "epoch": 12.549151027703306, + "grad_norm": 0.04736328125, + "learning_rate": 0.010993941937754643, + "loss": 0.7842, + "num_input_tokens_seen": 48922560, + "step": 84255 + }, + { + "epoch": 12.549895740244265, + "grad_norm": 0.05517578125, + "learning_rate": 0.010992063135999977, + "loss": 0.8074, + "num_input_tokens_seen": 48925312, + "step": 84260 + }, + { + "epoch": 12.550640452785224, + "grad_norm": 0.0625, + "learning_rate": 0.010990184401955339, + "loss": 0.804, + "num_input_tokens_seen": 48927840, + "step": 84265 + }, + { + "epoch": 12.551385165326185, + "grad_norm": 0.061279296875, + "learning_rate": 0.010988305735652458, + "loss": 0.7994, + "num_input_tokens_seen": 48930688, + "step": 84270 + }, + { + "epoch": 12.552129877867143, + "grad_norm": 0.07763671875, + "learning_rate": 0.010986427137123073, + "loss": 0.7866, + "num_input_tokens_seen": 48933888, + "step": 84275 + }, + { + "epoch": 12.552874590408102, + "grad_norm": 0.0361328125, + "learning_rate": 0.010984548606398924, + "loss": 0.8102, + "num_input_tokens_seen": 48936640, + "step": 84280 + }, + { + "epoch": 12.553619302949063, + "grad_norm": 0.03369140625, + "learning_rate": 0.01098267014351174, + "loss": 0.7971, + "num_input_tokens_seen": 48939776, + "step": 84285 + }, + { + "epoch": 12.554364015490021, + "grad_norm": 0.064453125, + "learning_rate": 0.010980791748493268, + "loss": 0.8227, + "num_input_tokens_seen": 48942624, + "step": 84290 + }, + { + "epoch": 12.55510872803098, + "grad_norm": 0.052001953125, + "learning_rate": 0.010978913421375226, + "loss": 0.779, + "num_input_tokens_seen": 48945440, + "step": 84295 + }, + { + "epoch": 12.555853440571939, + "grad_norm": 0.08251953125, + "learning_rate": 0.010977035162189364, + "loss": 0.8083, + "num_input_tokens_seen": 48948224, + "step": 84300 + }, + { + "epoch": 12.556598153112898, + "grad_norm": 0.039306640625, + "learning_rate": 0.0109751569709674, + "loss": 0.788, + "num_input_tokens_seen": 48950944, + "step": 84305 + }, + { + "epoch": 12.557342865653858, + "grad_norm": 0.042236328125, + "learning_rate": 0.01097327884774107, + "loss": 0.7938, + "num_input_tokens_seen": 48953568, + "step": 84310 + }, + { + "epoch": 12.558087578194817, + "grad_norm": 0.06787109375, + "learning_rate": 0.010971400792542102, + "loss": 0.812, + "num_input_tokens_seen": 48956256, + "step": 84315 + }, + { + "epoch": 12.558832290735776, + "grad_norm": 0.021240234375, + "learning_rate": 0.010969522805402218, + "loss": 0.8021, + "num_input_tokens_seen": 48959072, + "step": 84320 + }, + { + "epoch": 12.559577003276734, + "grad_norm": 0.031982421875, + "learning_rate": 0.010967644886353153, + "loss": 0.7855, + "num_input_tokens_seen": 48961792, + "step": 84325 + }, + { + "epoch": 12.560321715817695, + "grad_norm": 0.04541015625, + "learning_rate": 0.010965767035426625, + "loss": 0.797, + "num_input_tokens_seen": 48964768, + "step": 84330 + }, + { + "epoch": 12.561066428358654, + "grad_norm": 0.052734375, + "learning_rate": 0.01096388925265437, + "loss": 0.793, + "num_input_tokens_seen": 48967872, + "step": 84335 + }, + { + "epoch": 12.561811140899612, + "grad_norm": 0.046630859375, + "learning_rate": 0.0109620115380681, + "loss": 0.7954, + "num_input_tokens_seen": 48970816, + "step": 84340 + }, + { + "epoch": 12.562555853440571, + "grad_norm": 0.0235595703125, + "learning_rate": 0.010960133891699543, + "loss": 0.8, + "num_input_tokens_seen": 48973760, + "step": 84345 + }, + { + "epoch": 12.563300565981532, + "grad_norm": 0.0299072265625, + "learning_rate": 0.010958256313580418, + "loss": 0.8084, + "num_input_tokens_seen": 48976576, + "step": 84350 + }, + { + "epoch": 12.56404527852249, + "grad_norm": 0.036376953125, + "learning_rate": 0.010956378803742442, + "loss": 0.7976, + "num_input_tokens_seen": 48979520, + "step": 84355 + }, + { + "epoch": 12.56478999106345, + "grad_norm": 0.043212890625, + "learning_rate": 0.01095450136221734, + "loss": 0.8051, + "num_input_tokens_seen": 48982208, + "step": 84360 + }, + { + "epoch": 12.565534703604408, + "grad_norm": 0.038818359375, + "learning_rate": 0.010952623989036818, + "loss": 0.7985, + "num_input_tokens_seen": 48985024, + "step": 84365 + }, + { + "epoch": 12.566279416145369, + "grad_norm": 0.0213623046875, + "learning_rate": 0.010950746684232609, + "loss": 0.7914, + "num_input_tokens_seen": 48987904, + "step": 84370 + }, + { + "epoch": 12.567024128686327, + "grad_norm": 0.04345703125, + "learning_rate": 0.010948869447836415, + "loss": 0.8088, + "num_input_tokens_seen": 48990976, + "step": 84375 + }, + { + "epoch": 12.567768841227286, + "grad_norm": 0.046630859375, + "learning_rate": 0.010946992279879959, + "loss": 0.7853, + "num_input_tokens_seen": 48994048, + "step": 84380 + }, + { + "epoch": 12.568513553768245, + "grad_norm": 0.041748046875, + "learning_rate": 0.010945115180394947, + "loss": 0.776, + "num_input_tokens_seen": 48996896, + "step": 84385 + }, + { + "epoch": 12.569258266309205, + "grad_norm": 0.02587890625, + "learning_rate": 0.010943238149413087, + "loss": 0.7846, + "num_input_tokens_seen": 48999552, + "step": 84390 + }, + { + "epoch": 12.570002978850164, + "grad_norm": 0.0458984375, + "learning_rate": 0.010941361186966102, + "loss": 0.785, + "num_input_tokens_seen": 49002400, + "step": 84395 + }, + { + "epoch": 12.570747691391123, + "grad_norm": 0.04638671875, + "learning_rate": 0.010939484293085689, + "loss": 0.7924, + "num_input_tokens_seen": 49005056, + "step": 84400 + }, + { + "epoch": 12.571492403932082, + "grad_norm": 0.0341796875, + "learning_rate": 0.010937607467803567, + "loss": 0.7893, + "num_input_tokens_seen": 49007968, + "step": 84405 + }, + { + "epoch": 12.572237116473042, + "grad_norm": 0.0296630859375, + "learning_rate": 0.010935730711151436, + "loss": 0.793, + "num_input_tokens_seen": 49010720, + "step": 84410 + }, + { + "epoch": 12.572981829014001, + "grad_norm": 0.044189453125, + "learning_rate": 0.010933854023161006, + "loss": 0.7789, + "num_input_tokens_seen": 49013376, + "step": 84415 + }, + { + "epoch": 12.57372654155496, + "grad_norm": 0.0264892578125, + "learning_rate": 0.010931977403863974, + "loss": 0.8128, + "num_input_tokens_seen": 49015904, + "step": 84420 + }, + { + "epoch": 12.574471254095918, + "grad_norm": 0.042236328125, + "learning_rate": 0.010930100853292057, + "loss": 0.8033, + "num_input_tokens_seen": 49018592, + "step": 84425 + }, + { + "epoch": 12.575215966636879, + "grad_norm": 0.0302734375, + "learning_rate": 0.010928224371476947, + "loss": 0.789, + "num_input_tokens_seen": 49021376, + "step": 84430 + }, + { + "epoch": 12.575960679177838, + "grad_norm": 0.03857421875, + "learning_rate": 0.010926347958450341, + "loss": 0.789, + "num_input_tokens_seen": 49024192, + "step": 84435 + }, + { + "epoch": 12.576705391718797, + "grad_norm": 0.0269775390625, + "learning_rate": 0.010924471614243954, + "loss": 0.7806, + "num_input_tokens_seen": 49027296, + "step": 84440 + }, + { + "epoch": 12.577450104259755, + "grad_norm": 0.033447265625, + "learning_rate": 0.010922595338889476, + "loss": 0.782, + "num_input_tokens_seen": 49030112, + "step": 84445 + }, + { + "epoch": 12.578194816800714, + "grad_norm": 0.035400390625, + "learning_rate": 0.010920719132418604, + "loss": 0.7849, + "num_input_tokens_seen": 49032832, + "step": 84450 + }, + { + "epoch": 12.578939529341675, + "grad_norm": 0.0634765625, + "learning_rate": 0.010918842994863034, + "loss": 0.7709, + "num_input_tokens_seen": 49036064, + "step": 84455 + }, + { + "epoch": 12.579684241882633, + "grad_norm": 0.053955078125, + "learning_rate": 0.010916966926254467, + "loss": 0.7859, + "num_input_tokens_seen": 49038944, + "step": 84460 + }, + { + "epoch": 12.580428954423592, + "grad_norm": 0.08837890625, + "learning_rate": 0.010915090926624595, + "loss": 0.8235, + "num_input_tokens_seen": 49041856, + "step": 84465 + }, + { + "epoch": 12.58117366696455, + "grad_norm": 0.05078125, + "learning_rate": 0.010913214996005106, + "loss": 0.8058, + "num_input_tokens_seen": 49044832, + "step": 84470 + }, + { + "epoch": 12.581918379505511, + "grad_norm": 0.03271484375, + "learning_rate": 0.0109113391344277, + "loss": 0.7756, + "num_input_tokens_seen": 49047872, + "step": 84475 + }, + { + "epoch": 12.58266309204647, + "grad_norm": 0.043212890625, + "learning_rate": 0.010909463341924063, + "loss": 0.8146, + "num_input_tokens_seen": 49050688, + "step": 84480 + }, + { + "epoch": 12.583407804587429, + "grad_norm": 0.0517578125, + "learning_rate": 0.010907587618525885, + "loss": 0.7943, + "num_input_tokens_seen": 49053376, + "step": 84485 + }, + { + "epoch": 12.584152517128388, + "grad_norm": 0.051025390625, + "learning_rate": 0.01090571196426485, + "loss": 0.8025, + "num_input_tokens_seen": 49056096, + "step": 84490 + }, + { + "epoch": 12.584897229669348, + "grad_norm": 0.044921875, + "learning_rate": 0.010903836379172657, + "loss": 0.7897, + "num_input_tokens_seen": 49058944, + "step": 84495 + }, + { + "epoch": 12.585641942210307, + "grad_norm": 0.030517578125, + "learning_rate": 0.010901960863280982, + "loss": 0.8101, + "num_input_tokens_seen": 49062080, + "step": 84500 + }, + { + "epoch": 12.586386654751266, + "grad_norm": 0.0458984375, + "learning_rate": 0.010900085416621512, + "loss": 0.8065, + "num_input_tokens_seen": 49064736, + "step": 84505 + }, + { + "epoch": 12.587131367292224, + "grad_norm": 0.03369140625, + "learning_rate": 0.010898210039225937, + "loss": 0.8177, + "num_input_tokens_seen": 49067584, + "step": 84510 + }, + { + "epoch": 12.587876079833185, + "grad_norm": 0.0284423828125, + "learning_rate": 0.01089633473112593, + "loss": 0.8051, + "num_input_tokens_seen": 49070400, + "step": 84515 + }, + { + "epoch": 12.588620792374144, + "grad_norm": 0.041748046875, + "learning_rate": 0.01089445949235318, + "loss": 0.7864, + "num_input_tokens_seen": 49073312, + "step": 84520 + }, + { + "epoch": 12.589365504915103, + "grad_norm": 0.041015625, + "learning_rate": 0.010892584322939358, + "loss": 0.8627, + "num_input_tokens_seen": 49076224, + "step": 84525 + }, + { + "epoch": 12.590110217456061, + "grad_norm": 0.04345703125, + "learning_rate": 0.010890709222916156, + "loss": 0.7953, + "num_input_tokens_seen": 49079072, + "step": 84530 + }, + { + "epoch": 12.590854929997022, + "grad_norm": 0.041259765625, + "learning_rate": 0.010888834192315242, + "loss": 0.7931, + "num_input_tokens_seen": 49082016, + "step": 84535 + }, + { + "epoch": 12.59159964253798, + "grad_norm": 0.042236328125, + "learning_rate": 0.0108869592311683, + "loss": 0.7874, + "num_input_tokens_seen": 49085216, + "step": 84540 + }, + { + "epoch": 12.59234435507894, + "grad_norm": 0.043212890625, + "learning_rate": 0.010885084339506997, + "loss": 0.7939, + "num_input_tokens_seen": 49087968, + "step": 84545 + }, + { + "epoch": 12.593089067619898, + "grad_norm": 0.040771484375, + "learning_rate": 0.010883209517363013, + "loss": 0.8191, + "num_input_tokens_seen": 49091232, + "step": 84550 + }, + { + "epoch": 12.593833780160859, + "grad_norm": 0.0235595703125, + "learning_rate": 0.01088133476476802, + "loss": 0.8146, + "num_input_tokens_seen": 49094368, + "step": 84555 + }, + { + "epoch": 12.594578492701817, + "grad_norm": 0.03125, + "learning_rate": 0.010879460081753688, + "loss": 0.7991, + "num_input_tokens_seen": 49097184, + "step": 84560 + }, + { + "epoch": 12.595323205242776, + "grad_norm": 0.033447265625, + "learning_rate": 0.010877585468351693, + "loss": 0.7902, + "num_input_tokens_seen": 49100032, + "step": 84565 + }, + { + "epoch": 12.596067917783735, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0108757109245937, + "loss": 0.8008, + "num_input_tokens_seen": 49102848, + "step": 84570 + }, + { + "epoch": 12.596812630324695, + "grad_norm": 0.035400390625, + "learning_rate": 0.010873836450511383, + "loss": 0.7928, + "num_input_tokens_seen": 49105632, + "step": 84575 + }, + { + "epoch": 12.597557342865654, + "grad_norm": 0.042724609375, + "learning_rate": 0.0108719620461364, + "loss": 0.789, + "num_input_tokens_seen": 49108352, + "step": 84580 + }, + { + "epoch": 12.598302055406613, + "grad_norm": 0.047119140625, + "learning_rate": 0.010870087711500425, + "loss": 0.8012, + "num_input_tokens_seen": 49111264, + "step": 84585 + }, + { + "epoch": 12.599046767947572, + "grad_norm": 0.03759765625, + "learning_rate": 0.010868213446635125, + "loss": 0.7969, + "num_input_tokens_seen": 49114144, + "step": 84590 + }, + { + "epoch": 12.599791480488532, + "grad_norm": 0.020751953125, + "learning_rate": 0.01086633925157215, + "loss": 0.818, + "num_input_tokens_seen": 49117120, + "step": 84595 + }, + { + "epoch": 12.600536193029491, + "grad_norm": 0.0234375, + "learning_rate": 0.01086446512634318, + "loss": 0.7997, + "num_input_tokens_seen": 49119904, + "step": 84600 + }, + { + "epoch": 12.60128090557045, + "grad_norm": 0.038818359375, + "learning_rate": 0.010862591070979866, + "loss": 0.7926, + "num_input_tokens_seen": 49123008, + "step": 84605 + }, + { + "epoch": 12.602025618111409, + "grad_norm": 0.04541015625, + "learning_rate": 0.010860717085513873, + "loss": 0.7846, + "num_input_tokens_seen": 49125920, + "step": 84610 + }, + { + "epoch": 12.602770330652369, + "grad_norm": 0.0201416015625, + "learning_rate": 0.010858843169976852, + "loss": 0.7814, + "num_input_tokens_seen": 49128608, + "step": 84615 + }, + { + "epoch": 12.603515043193328, + "grad_norm": 0.047119140625, + "learning_rate": 0.010856969324400473, + "loss": 0.7969, + "num_input_tokens_seen": 49131456, + "step": 84620 + }, + { + "epoch": 12.604259755734287, + "grad_norm": 0.037841796875, + "learning_rate": 0.010855095548816378, + "loss": 0.7891, + "num_input_tokens_seen": 49134144, + "step": 84625 + }, + { + "epoch": 12.605004468275245, + "grad_norm": 0.0294189453125, + "learning_rate": 0.01085322184325624, + "loss": 0.7925, + "num_input_tokens_seen": 49136832, + "step": 84630 + }, + { + "epoch": 12.605749180816204, + "grad_norm": 0.06591796875, + "learning_rate": 0.010851348207751703, + "loss": 0.8018, + "num_input_tokens_seen": 49139488, + "step": 84635 + }, + { + "epoch": 12.606493893357165, + "grad_norm": 0.0419921875, + "learning_rate": 0.01084947464233442, + "loss": 0.8052, + "num_input_tokens_seen": 49142432, + "step": 84640 + }, + { + "epoch": 12.607238605898123, + "grad_norm": 0.0245361328125, + "learning_rate": 0.010847601147036045, + "loss": 0.7969, + "num_input_tokens_seen": 49145568, + "step": 84645 + }, + { + "epoch": 12.607983318439082, + "grad_norm": 0.0302734375, + "learning_rate": 0.010845727721888224, + "loss": 0.813, + "num_input_tokens_seen": 49148416, + "step": 84650 + }, + { + "epoch": 12.608728030980041, + "grad_norm": 0.0216064453125, + "learning_rate": 0.010843854366922617, + "loss": 0.7947, + "num_input_tokens_seen": 49151200, + "step": 84655 + }, + { + "epoch": 12.609472743521001, + "grad_norm": 0.037353515625, + "learning_rate": 0.01084198108217086, + "loss": 0.7725, + "num_input_tokens_seen": 49154048, + "step": 84660 + }, + { + "epoch": 12.61021745606196, + "grad_norm": 0.13671875, + "learning_rate": 0.010840107867664612, + "loss": 0.853, + "num_input_tokens_seen": 49156768, + "step": 84665 + }, + { + "epoch": 12.610962168602919, + "grad_norm": 0.0291748046875, + "learning_rate": 0.010838234723435515, + "loss": 0.7957, + "num_input_tokens_seen": 49159328, + "step": 84670 + }, + { + "epoch": 12.611706881143878, + "grad_norm": 0.02099609375, + "learning_rate": 0.01083636164951521, + "loss": 0.7981, + "num_input_tokens_seen": 49162368, + "step": 84675 + }, + { + "epoch": 12.612451593684838, + "grad_norm": 0.031005859375, + "learning_rate": 0.010834488645935343, + "loss": 0.8172, + "num_input_tokens_seen": 49165056, + "step": 84680 + }, + { + "epoch": 12.613196306225797, + "grad_norm": 0.03759765625, + "learning_rate": 0.010832615712727554, + "loss": 0.7824, + "num_input_tokens_seen": 49167840, + "step": 84685 + }, + { + "epoch": 12.613941018766756, + "grad_norm": 0.0311279296875, + "learning_rate": 0.010830742849923492, + "loss": 0.8016, + "num_input_tokens_seen": 49170720, + "step": 84690 + }, + { + "epoch": 12.614685731307715, + "grad_norm": 0.0361328125, + "learning_rate": 0.010828870057554785, + "loss": 0.7895, + "num_input_tokens_seen": 49173600, + "step": 84695 + }, + { + "epoch": 12.615430443848675, + "grad_norm": 0.036376953125, + "learning_rate": 0.010826997335653087, + "loss": 0.8075, + "num_input_tokens_seen": 49176512, + "step": 84700 + }, + { + "epoch": 12.616175156389634, + "grad_norm": 0.03076171875, + "learning_rate": 0.010825124684250023, + "loss": 0.7801, + "num_input_tokens_seen": 49179360, + "step": 84705 + }, + { + "epoch": 12.616919868930593, + "grad_norm": 0.041748046875, + "learning_rate": 0.010823252103377238, + "loss": 0.8038, + "num_input_tokens_seen": 49182528, + "step": 84710 + }, + { + "epoch": 12.617664581471551, + "grad_norm": 0.044189453125, + "learning_rate": 0.010821379593066362, + "loss": 0.7916, + "num_input_tokens_seen": 49185312, + "step": 84715 + }, + { + "epoch": 12.618409294012512, + "grad_norm": 0.050048828125, + "learning_rate": 0.010819507153349027, + "loss": 0.773, + "num_input_tokens_seen": 49188192, + "step": 84720 + }, + { + "epoch": 12.61915400655347, + "grad_norm": 0.02294921875, + "learning_rate": 0.010817634784256876, + "loss": 0.7906, + "num_input_tokens_seen": 49191232, + "step": 84725 + }, + { + "epoch": 12.61989871909443, + "grad_norm": 0.02880859375, + "learning_rate": 0.010815762485821527, + "loss": 0.854, + "num_input_tokens_seen": 49194112, + "step": 84730 + }, + { + "epoch": 12.620643431635388, + "grad_norm": 0.041015625, + "learning_rate": 0.010813890258074624, + "loss": 0.7834, + "num_input_tokens_seen": 49196960, + "step": 84735 + }, + { + "epoch": 12.621388144176349, + "grad_norm": 0.02734375, + "learning_rate": 0.010812018101047788, + "loss": 0.8044, + "num_input_tokens_seen": 49199808, + "step": 84740 + }, + { + "epoch": 12.622132856717307, + "grad_norm": 0.021240234375, + "learning_rate": 0.010810146014772655, + "loss": 0.8074, + "num_input_tokens_seen": 49202976, + "step": 84745 + }, + { + "epoch": 12.622877569258266, + "grad_norm": 0.0576171875, + "learning_rate": 0.010808273999280844, + "loss": 0.7901, + "num_input_tokens_seen": 49205888, + "step": 84750 + }, + { + "epoch": 12.623622281799225, + "grad_norm": 0.03955078125, + "learning_rate": 0.010806402054603977, + "loss": 0.8034, + "num_input_tokens_seen": 49208672, + "step": 84755 + }, + { + "epoch": 12.624366994340185, + "grad_norm": 0.04736328125, + "learning_rate": 0.010804530180773694, + "loss": 0.7946, + "num_input_tokens_seen": 49211360, + "step": 84760 + }, + { + "epoch": 12.625111706881144, + "grad_norm": 0.035400390625, + "learning_rate": 0.010802658377821602, + "loss": 0.8049, + "num_input_tokens_seen": 49214144, + "step": 84765 + }, + { + "epoch": 12.625856419422103, + "grad_norm": 0.0302734375, + "learning_rate": 0.010800786645779336, + "loss": 0.7959, + "num_input_tokens_seen": 49217216, + "step": 84770 + }, + { + "epoch": 12.626601131963062, + "grad_norm": 0.0303955078125, + "learning_rate": 0.01079891498467851, + "loss": 0.7878, + "num_input_tokens_seen": 49219936, + "step": 84775 + }, + { + "epoch": 12.62734584450402, + "grad_norm": 0.0390625, + "learning_rate": 0.010797043394550749, + "loss": 0.7916, + "num_input_tokens_seen": 49222912, + "step": 84780 + }, + { + "epoch": 12.628090557044981, + "grad_norm": 0.027587890625, + "learning_rate": 0.01079517187542766, + "loss": 0.7947, + "num_input_tokens_seen": 49225632, + "step": 84785 + }, + { + "epoch": 12.62883526958594, + "grad_norm": 0.076171875, + "learning_rate": 0.010793300427340875, + "loss": 0.8144, + "num_input_tokens_seen": 49228416, + "step": 84790 + }, + { + "epoch": 12.629579982126899, + "grad_norm": 0.03759765625, + "learning_rate": 0.010791429050322003, + "loss": 0.8061, + "num_input_tokens_seen": 49231392, + "step": 84795 + }, + { + "epoch": 12.63032469466786, + "grad_norm": 0.0286865234375, + "learning_rate": 0.010789557744402656, + "loss": 0.7969, + "num_input_tokens_seen": 49234080, + "step": 84800 + }, + { + "epoch": 12.631069407208818, + "grad_norm": 0.03125, + "learning_rate": 0.010787686509614456, + "loss": 0.7865, + "num_input_tokens_seen": 49237024, + "step": 84805 + }, + { + "epoch": 12.631814119749777, + "grad_norm": 0.0439453125, + "learning_rate": 0.01078581534598901, + "loss": 0.7954, + "num_input_tokens_seen": 49240000, + "step": 84810 + }, + { + "epoch": 12.632558832290735, + "grad_norm": 0.05908203125, + "learning_rate": 0.010783944253557933, + "loss": 0.7997, + "num_input_tokens_seen": 49242752, + "step": 84815 + }, + { + "epoch": 12.633303544831694, + "grad_norm": 0.04150390625, + "learning_rate": 0.010782073232352826, + "loss": 0.8036, + "num_input_tokens_seen": 49245632, + "step": 84820 + }, + { + "epoch": 12.634048257372655, + "grad_norm": 0.043212890625, + "learning_rate": 0.010780202282405312, + "loss": 0.7985, + "num_input_tokens_seen": 49248480, + "step": 84825 + }, + { + "epoch": 12.634792969913613, + "grad_norm": 0.039794921875, + "learning_rate": 0.010778331403746991, + "loss": 0.7918, + "num_input_tokens_seen": 49251584, + "step": 84830 + }, + { + "epoch": 12.635537682454572, + "grad_norm": 0.03369140625, + "learning_rate": 0.010776460596409465, + "loss": 0.7904, + "num_input_tokens_seen": 49254432, + "step": 84835 + }, + { + "epoch": 12.636282394995531, + "grad_norm": 0.04248046875, + "learning_rate": 0.01077458986042435, + "loss": 0.7997, + "num_input_tokens_seen": 49257280, + "step": 84840 + }, + { + "epoch": 12.637027107536491, + "grad_norm": 0.042724609375, + "learning_rate": 0.010772719195823241, + "loss": 0.8117, + "num_input_tokens_seen": 49260544, + "step": 84845 + }, + { + "epoch": 12.63777182007745, + "grad_norm": 0.047119140625, + "learning_rate": 0.010770848602637748, + "loss": 0.8057, + "num_input_tokens_seen": 49263232, + "step": 84850 + }, + { + "epoch": 12.638516532618409, + "grad_norm": 0.10888671875, + "learning_rate": 0.010768978080899464, + "loss": 0.7825, + "num_input_tokens_seen": 49266464, + "step": 84855 + }, + { + "epoch": 12.639261245159368, + "grad_norm": 0.046142578125, + "learning_rate": 0.01076710763064, + "loss": 0.7975, + "num_input_tokens_seen": 49269344, + "step": 84860 + }, + { + "epoch": 12.640005957700328, + "grad_norm": 0.040771484375, + "learning_rate": 0.01076523725189095, + "loss": 0.7867, + "num_input_tokens_seen": 49272096, + "step": 84865 + }, + { + "epoch": 12.640750670241287, + "grad_norm": 0.0673828125, + "learning_rate": 0.010763366944683913, + "loss": 0.8184, + "num_input_tokens_seen": 49274944, + "step": 84870 + }, + { + "epoch": 12.641495382782246, + "grad_norm": 0.044189453125, + "learning_rate": 0.010761496709050484, + "loss": 0.8212, + "num_input_tokens_seen": 49277440, + "step": 84875 + }, + { + "epoch": 12.642240095323205, + "grad_norm": 0.0380859375, + "learning_rate": 0.01075962654502226, + "loss": 0.8061, + "num_input_tokens_seen": 49280128, + "step": 84880 + }, + { + "epoch": 12.642984807864165, + "grad_norm": 0.048828125, + "learning_rate": 0.01075775645263084, + "loss": 0.8088, + "num_input_tokens_seen": 49283040, + "step": 84885 + }, + { + "epoch": 12.643729520405124, + "grad_norm": 0.04150390625, + "learning_rate": 0.010755886431907804, + "loss": 0.7744, + "num_input_tokens_seen": 49285632, + "step": 84890 + }, + { + "epoch": 12.644474232946083, + "grad_norm": 0.035888671875, + "learning_rate": 0.010754016482884762, + "loss": 0.7759, + "num_input_tokens_seen": 49288448, + "step": 84895 + }, + { + "epoch": 12.645218945487041, + "grad_norm": 0.041748046875, + "learning_rate": 0.01075214660559329, + "loss": 0.8027, + "num_input_tokens_seen": 49291552, + "step": 84900 + }, + { + "epoch": 12.645963658028002, + "grad_norm": 0.022216796875, + "learning_rate": 0.01075027680006499, + "loss": 0.8063, + "num_input_tokens_seen": 49294528, + "step": 84905 + }, + { + "epoch": 12.64670837056896, + "grad_norm": 0.04296875, + "learning_rate": 0.010748407066331444, + "loss": 0.8076, + "num_input_tokens_seen": 49297600, + "step": 84910 + }, + { + "epoch": 12.64745308310992, + "grad_norm": 0.0216064453125, + "learning_rate": 0.01074653740442423, + "loss": 0.788, + "num_input_tokens_seen": 49300512, + "step": 84915 + }, + { + "epoch": 12.648197795650878, + "grad_norm": 0.04052734375, + "learning_rate": 0.010744667814374952, + "loss": 0.801, + "num_input_tokens_seen": 49303168, + "step": 84920 + }, + { + "epoch": 12.648942508191839, + "grad_norm": 0.03466796875, + "learning_rate": 0.01074279829621518, + "loss": 0.7872, + "num_input_tokens_seen": 49305888, + "step": 84925 + }, + { + "epoch": 12.649687220732797, + "grad_norm": 0.037353515625, + "learning_rate": 0.010740928849976513, + "loss": 0.7884, + "num_input_tokens_seen": 49308640, + "step": 84930 + }, + { + "epoch": 12.650431933273756, + "grad_norm": 0.0361328125, + "learning_rate": 0.010739059475690519, + "loss": 0.8162, + "num_input_tokens_seen": 49311264, + "step": 84935 + }, + { + "epoch": 12.651176645814715, + "grad_norm": 0.04638671875, + "learning_rate": 0.010737190173388787, + "loss": 0.8006, + "num_input_tokens_seen": 49313952, + "step": 84940 + }, + { + "epoch": 12.651921358355676, + "grad_norm": 0.0230712890625, + "learning_rate": 0.010735320943102887, + "loss": 0.8129, + "num_input_tokens_seen": 49316928, + "step": 84945 + }, + { + "epoch": 12.652666070896634, + "grad_norm": 0.04736328125, + "learning_rate": 0.010733451784864415, + "loss": 0.8576, + "num_input_tokens_seen": 49319840, + "step": 84950 + }, + { + "epoch": 12.653410783437593, + "grad_norm": 0.03564453125, + "learning_rate": 0.010731582698704936, + "loss": 0.8088, + "num_input_tokens_seen": 49322976, + "step": 84955 + }, + { + "epoch": 12.654155495978552, + "grad_norm": 0.034912109375, + "learning_rate": 0.010729713684656023, + "loss": 0.7929, + "num_input_tokens_seen": 49326336, + "step": 84960 + }, + { + "epoch": 12.65490020851951, + "grad_norm": 0.0235595703125, + "learning_rate": 0.010727844742749265, + "loss": 0.8022, + "num_input_tokens_seen": 49329344, + "step": 84965 + }, + { + "epoch": 12.655644921060471, + "grad_norm": 0.038818359375, + "learning_rate": 0.010725975873016224, + "loss": 0.7845, + "num_input_tokens_seen": 49332352, + "step": 84970 + }, + { + "epoch": 12.65638963360143, + "grad_norm": 0.050048828125, + "learning_rate": 0.01072410707548848, + "loss": 0.7947, + "num_input_tokens_seen": 49335328, + "step": 84975 + }, + { + "epoch": 12.657134346142389, + "grad_norm": 0.041748046875, + "learning_rate": 0.010722238350197593, + "loss": 0.8014, + "num_input_tokens_seen": 49338016, + "step": 84980 + }, + { + "epoch": 12.657879058683347, + "grad_norm": 0.040283203125, + "learning_rate": 0.010720369697175151, + "loss": 0.7765, + "num_input_tokens_seen": 49340864, + "step": 84985 + }, + { + "epoch": 12.658623771224308, + "grad_norm": 0.043212890625, + "learning_rate": 0.010718501116452712, + "loss": 0.8065, + "num_input_tokens_seen": 49343616, + "step": 84990 + }, + { + "epoch": 12.659368483765267, + "grad_norm": 0.031982421875, + "learning_rate": 0.010716632608061837, + "loss": 0.7964, + "num_input_tokens_seen": 49346816, + "step": 84995 + }, + { + "epoch": 12.660113196306225, + "grad_norm": 0.0294189453125, + "learning_rate": 0.010714764172034109, + "loss": 0.7874, + "num_input_tokens_seen": 49349856, + "step": 85000 + }, + { + "epoch": 12.660857908847184, + "grad_norm": 0.0234375, + "learning_rate": 0.010712895808401082, + "loss": 0.8053, + "num_input_tokens_seen": 49352768, + "step": 85005 + }, + { + "epoch": 12.661602621388145, + "grad_norm": 0.0240478515625, + "learning_rate": 0.010711027517194325, + "loss": 0.8052, + "num_input_tokens_seen": 49355648, + "step": 85010 + }, + { + "epoch": 12.662347333929103, + "grad_norm": 0.040771484375, + "learning_rate": 0.010709159298445394, + "loss": 0.8027, + "num_input_tokens_seen": 49358624, + "step": 85015 + }, + { + "epoch": 12.663092046470062, + "grad_norm": 0.049072265625, + "learning_rate": 0.01070729115218586, + "loss": 0.8087, + "num_input_tokens_seen": 49361600, + "step": 85020 + }, + { + "epoch": 12.663836759011021, + "grad_norm": 0.0260009765625, + "learning_rate": 0.010705423078447275, + "loss": 0.8045, + "num_input_tokens_seen": 49364576, + "step": 85025 + }, + { + "epoch": 12.664581471551982, + "grad_norm": 0.03857421875, + "learning_rate": 0.010703555077261208, + "loss": 0.7863, + "num_input_tokens_seen": 49367584, + "step": 85030 + }, + { + "epoch": 12.66532618409294, + "grad_norm": 0.02001953125, + "learning_rate": 0.01070168714865921, + "loss": 0.791, + "num_input_tokens_seen": 49370592, + "step": 85035 + }, + { + "epoch": 12.666070896633899, + "grad_norm": 0.0303955078125, + "learning_rate": 0.010699819292672838, + "loss": 0.801, + "num_input_tokens_seen": 49373344, + "step": 85040 + }, + { + "epoch": 12.666815609174858, + "grad_norm": 0.04345703125, + "learning_rate": 0.01069795150933365, + "loss": 0.7799, + "num_input_tokens_seen": 49376416, + "step": 85045 + }, + { + "epoch": 12.667560321715818, + "grad_norm": 0.0302734375, + "learning_rate": 0.01069608379867319, + "loss": 0.7873, + "num_input_tokens_seen": 49379456, + "step": 85050 + }, + { + "epoch": 12.668305034256777, + "grad_norm": 0.05859375, + "learning_rate": 0.010694216160723032, + "loss": 0.7846, + "num_input_tokens_seen": 49382496, + "step": 85055 + }, + { + "epoch": 12.669049746797736, + "grad_norm": 0.11572265625, + "learning_rate": 0.010692348595514706, + "loss": 0.8322, + "num_input_tokens_seen": 49385248, + "step": 85060 + }, + { + "epoch": 12.669794459338695, + "grad_norm": 0.0361328125, + "learning_rate": 0.010690481103079779, + "loss": 0.8045, + "num_input_tokens_seen": 49388448, + "step": 85065 + }, + { + "epoch": 12.670539171879655, + "grad_norm": 0.040771484375, + "learning_rate": 0.010688613683449795, + "loss": 0.8075, + "num_input_tokens_seen": 49391168, + "step": 85070 + }, + { + "epoch": 12.671283884420614, + "grad_norm": 0.0341796875, + "learning_rate": 0.010686746336656295, + "loss": 0.8115, + "num_input_tokens_seen": 49394016, + "step": 85075 + }, + { + "epoch": 12.672028596961573, + "grad_norm": 0.0245361328125, + "learning_rate": 0.010684879062730837, + "loss": 0.8094, + "num_input_tokens_seen": 49396704, + "step": 85080 + }, + { + "epoch": 12.672773309502531, + "grad_norm": 0.04296875, + "learning_rate": 0.010683011861704957, + "loss": 0.7961, + "num_input_tokens_seen": 49399616, + "step": 85085 + }, + { + "epoch": 12.673518022043492, + "grad_norm": 0.034423828125, + "learning_rate": 0.010681144733610207, + "loss": 0.8101, + "num_input_tokens_seen": 49402208, + "step": 85090 + }, + { + "epoch": 12.67426273458445, + "grad_norm": 0.04345703125, + "learning_rate": 0.010679277678478124, + "loss": 0.7983, + "num_input_tokens_seen": 49405216, + "step": 85095 + }, + { + "epoch": 12.67500744712541, + "grad_norm": 0.034912109375, + "learning_rate": 0.010677410696340256, + "loss": 0.7828, + "num_input_tokens_seen": 49408128, + "step": 85100 + }, + { + "epoch": 12.675752159666368, + "grad_norm": 0.04443359375, + "learning_rate": 0.01067554378722814, + "loss": 0.7999, + "num_input_tokens_seen": 49410912, + "step": 85105 + }, + { + "epoch": 12.676496872207329, + "grad_norm": 0.06005859375, + "learning_rate": 0.01067367695117332, + "loss": 0.7996, + "num_input_tokens_seen": 49413568, + "step": 85110 + }, + { + "epoch": 12.677241584748288, + "grad_norm": 0.0242919921875, + "learning_rate": 0.01067181018820733, + "loss": 0.7992, + "num_input_tokens_seen": 49416352, + "step": 85115 + }, + { + "epoch": 12.677986297289246, + "grad_norm": 0.033935546875, + "learning_rate": 0.010669943498361702, + "loss": 0.7896, + "num_input_tokens_seen": 49419200, + "step": 85120 + }, + { + "epoch": 12.678731009830205, + "grad_norm": 0.029052734375, + "learning_rate": 0.010668076881667982, + "loss": 0.7961, + "num_input_tokens_seen": 49422336, + "step": 85125 + }, + { + "epoch": 12.679475722371166, + "grad_norm": 0.034912109375, + "learning_rate": 0.010666210338157697, + "loss": 0.7963, + "num_input_tokens_seen": 49425056, + "step": 85130 + }, + { + "epoch": 12.680220434912124, + "grad_norm": 0.0517578125, + "learning_rate": 0.01066434386786239, + "loss": 0.785, + "num_input_tokens_seen": 49428096, + "step": 85135 + }, + { + "epoch": 12.680965147453083, + "grad_norm": 0.03759765625, + "learning_rate": 0.010662477470813581, + "loss": 0.7942, + "num_input_tokens_seen": 49431040, + "step": 85140 + }, + { + "epoch": 12.681709859994042, + "grad_norm": 0.04931640625, + "learning_rate": 0.01066061114704281, + "loss": 0.8158, + "num_input_tokens_seen": 49433888, + "step": 85145 + }, + { + "epoch": 12.682454572535, + "grad_norm": 0.030029296875, + "learning_rate": 0.010658744896581605, + "loss": 0.8046, + "num_input_tokens_seen": 49436864, + "step": 85150 + }, + { + "epoch": 12.683199285075961, + "grad_norm": 0.0400390625, + "learning_rate": 0.010656878719461487, + "loss": 0.7757, + "num_input_tokens_seen": 49439968, + "step": 85155 + }, + { + "epoch": 12.68394399761692, + "grad_norm": 0.06591796875, + "learning_rate": 0.010655012615713996, + "loss": 0.7897, + "num_input_tokens_seen": 49442944, + "step": 85160 + }, + { + "epoch": 12.684688710157879, + "grad_norm": 0.048583984375, + "learning_rate": 0.01065314658537064, + "loss": 0.8051, + "num_input_tokens_seen": 49445888, + "step": 85165 + }, + { + "epoch": 12.685433422698837, + "grad_norm": 0.048583984375, + "learning_rate": 0.010651280628462965, + "loss": 0.81, + "num_input_tokens_seen": 49449216, + "step": 85170 + }, + { + "epoch": 12.686178135239798, + "grad_norm": 0.042724609375, + "learning_rate": 0.010649414745022479, + "loss": 0.7999, + "num_input_tokens_seen": 49452160, + "step": 85175 + }, + { + "epoch": 12.686922847780757, + "grad_norm": 0.03173828125, + "learning_rate": 0.010647548935080714, + "loss": 0.7996, + "num_input_tokens_seen": 49455008, + "step": 85180 + }, + { + "epoch": 12.687667560321715, + "grad_norm": 0.0233154296875, + "learning_rate": 0.010645683198669178, + "loss": 0.7917, + "num_input_tokens_seen": 49457824, + "step": 85185 + }, + { + "epoch": 12.688412272862674, + "grad_norm": 0.05517578125, + "learning_rate": 0.010643817535819404, + "loss": 0.7847, + "num_input_tokens_seen": 49460736, + "step": 85190 + }, + { + "epoch": 12.689156985403635, + "grad_norm": 0.048828125, + "learning_rate": 0.010641951946562906, + "loss": 0.7908, + "num_input_tokens_seen": 49463744, + "step": 85195 + }, + { + "epoch": 12.689901697944594, + "grad_norm": 0.038818359375, + "learning_rate": 0.010640086430931199, + "loss": 0.7836, + "num_input_tokens_seen": 49466528, + "step": 85200 + }, + { + "epoch": 12.690646410485552, + "grad_norm": 0.05859375, + "learning_rate": 0.0106382209889558, + "loss": 0.7742, + "num_input_tokens_seen": 49469632, + "step": 85205 + }, + { + "epoch": 12.691391123026511, + "grad_norm": 0.032958984375, + "learning_rate": 0.010636355620668224, + "loss": 0.7914, + "num_input_tokens_seen": 49472608, + "step": 85210 + }, + { + "epoch": 12.692135835567472, + "grad_norm": 0.05029296875, + "learning_rate": 0.010634490326099987, + "loss": 0.8143, + "num_input_tokens_seen": 49475712, + "step": 85215 + }, + { + "epoch": 12.69288054810843, + "grad_norm": 0.0272216796875, + "learning_rate": 0.010632625105282595, + "loss": 0.8546, + "num_input_tokens_seen": 49478688, + "step": 85220 + }, + { + "epoch": 12.69362526064939, + "grad_norm": 0.04931640625, + "learning_rate": 0.010630759958247568, + "loss": 0.8072, + "num_input_tokens_seen": 49481632, + "step": 85225 + }, + { + "epoch": 12.694369973190348, + "grad_norm": 0.0361328125, + "learning_rate": 0.010628894885026406, + "loss": 0.8075, + "num_input_tokens_seen": 49484672, + "step": 85230 + }, + { + "epoch": 12.695114685731308, + "grad_norm": 0.040283203125, + "learning_rate": 0.010627029885650629, + "loss": 0.7934, + "num_input_tokens_seen": 49487680, + "step": 85235 + }, + { + "epoch": 12.695859398272267, + "grad_norm": 0.050048828125, + "learning_rate": 0.010625164960151735, + "loss": 0.793, + "num_input_tokens_seen": 49490752, + "step": 85240 + }, + { + "epoch": 12.696604110813226, + "grad_norm": 0.04248046875, + "learning_rate": 0.010623300108561228, + "loss": 0.7825, + "num_input_tokens_seen": 49493536, + "step": 85245 + }, + { + "epoch": 12.697348823354185, + "grad_norm": 0.037841796875, + "learning_rate": 0.010621435330910622, + "loss": 0.8019, + "num_input_tokens_seen": 49496640, + "step": 85250 + }, + { + "epoch": 12.698093535895145, + "grad_norm": 0.041259765625, + "learning_rate": 0.01061957062723141, + "loss": 0.7871, + "num_input_tokens_seen": 49499616, + "step": 85255 + }, + { + "epoch": 12.698838248436104, + "grad_norm": 0.03662109375, + "learning_rate": 0.010617705997555106, + "loss": 0.8073, + "num_input_tokens_seen": 49502400, + "step": 85260 + }, + { + "epoch": 12.699582960977063, + "grad_norm": 0.042236328125, + "learning_rate": 0.010615841441913205, + "loss": 0.7675, + "num_input_tokens_seen": 49505120, + "step": 85265 + }, + { + "epoch": 12.700327673518021, + "grad_norm": 0.05908203125, + "learning_rate": 0.010613976960337206, + "loss": 0.7859, + "num_input_tokens_seen": 49508032, + "step": 85270 + }, + { + "epoch": 12.701072386058982, + "grad_norm": 0.049072265625, + "learning_rate": 0.010612112552858611, + "loss": 0.7825, + "num_input_tokens_seen": 49510944, + "step": 85275 + }, + { + "epoch": 12.70181709859994, + "grad_norm": 0.056640625, + "learning_rate": 0.010610248219508909, + "loss": 0.8194, + "num_input_tokens_seen": 49513632, + "step": 85280 + }, + { + "epoch": 12.7025618111409, + "grad_norm": 0.07763671875, + "learning_rate": 0.010608383960319606, + "loss": 0.8, + "num_input_tokens_seen": 49516320, + "step": 85285 + }, + { + "epoch": 12.703306523681858, + "grad_norm": 0.054443359375, + "learning_rate": 0.010606519775322187, + "loss": 0.7761, + "num_input_tokens_seen": 49519232, + "step": 85290 + }, + { + "epoch": 12.704051236222817, + "grad_norm": 0.0264892578125, + "learning_rate": 0.010604655664548159, + "loss": 0.8307, + "num_input_tokens_seen": 49521952, + "step": 85295 + }, + { + "epoch": 12.704795948763778, + "grad_norm": 0.05126953125, + "learning_rate": 0.010602791628029, + "loss": 0.7945, + "num_input_tokens_seen": 49525088, + "step": 85300 + }, + { + "epoch": 12.705540661304736, + "grad_norm": 0.051513671875, + "learning_rate": 0.010600927665796215, + "loss": 0.8077, + "num_input_tokens_seen": 49528032, + "step": 85305 + }, + { + "epoch": 12.706285373845695, + "grad_norm": 0.049560546875, + "learning_rate": 0.010599063777881276, + "loss": 0.8145, + "num_input_tokens_seen": 49530880, + "step": 85310 + }, + { + "epoch": 12.707030086386656, + "grad_norm": 0.035400390625, + "learning_rate": 0.01059719996431569, + "loss": 0.7787, + "num_input_tokens_seen": 49533728, + "step": 85315 + }, + { + "epoch": 12.707774798927614, + "grad_norm": 0.033935546875, + "learning_rate": 0.010595336225130933, + "loss": 0.7928, + "num_input_tokens_seen": 49536416, + "step": 85320 + }, + { + "epoch": 12.708519511468573, + "grad_norm": 0.03515625, + "learning_rate": 0.01059347256035849, + "loss": 0.795, + "num_input_tokens_seen": 49539488, + "step": 85325 + }, + { + "epoch": 12.709264224009532, + "grad_norm": 0.031982421875, + "learning_rate": 0.010591608970029856, + "loss": 0.8101, + "num_input_tokens_seen": 49542272, + "step": 85330 + }, + { + "epoch": 12.71000893655049, + "grad_norm": 0.03466796875, + "learning_rate": 0.010589745454176503, + "loss": 0.794, + "num_input_tokens_seen": 49545024, + "step": 85335 + }, + { + "epoch": 12.710753649091451, + "grad_norm": 0.02978515625, + "learning_rate": 0.010587882012829921, + "loss": 0.8009, + "num_input_tokens_seen": 49548160, + "step": 85340 + }, + { + "epoch": 12.71149836163241, + "grad_norm": 0.047607421875, + "learning_rate": 0.010586018646021583, + "loss": 0.7943, + "num_input_tokens_seen": 49551008, + "step": 85345 + }, + { + "epoch": 12.712243074173369, + "grad_norm": 0.052001953125, + "learning_rate": 0.010584155353782982, + "loss": 0.803, + "num_input_tokens_seen": 49553600, + "step": 85350 + }, + { + "epoch": 12.712987786714327, + "grad_norm": 0.038818359375, + "learning_rate": 0.010582292136145584, + "loss": 0.7984, + "num_input_tokens_seen": 49556832, + "step": 85355 + }, + { + "epoch": 12.713732499255288, + "grad_norm": 0.051513671875, + "learning_rate": 0.010580428993140868, + "loss": 0.7779, + "num_input_tokens_seen": 49560320, + "step": 85360 + }, + { + "epoch": 12.714477211796247, + "grad_norm": 0.051513671875, + "learning_rate": 0.010578565924800316, + "loss": 0.788, + "num_input_tokens_seen": 49563264, + "step": 85365 + }, + { + "epoch": 12.715221924337206, + "grad_norm": 0.04541015625, + "learning_rate": 0.010576702931155396, + "loss": 0.7755, + "num_input_tokens_seen": 49566656, + "step": 85370 + }, + { + "epoch": 12.715966636878164, + "grad_norm": 0.1044921875, + "learning_rate": 0.01057484001223759, + "loss": 0.8025, + "num_input_tokens_seen": 49569312, + "step": 85375 + }, + { + "epoch": 12.716711349419125, + "grad_norm": 0.045166015625, + "learning_rate": 0.010572977168078356, + "loss": 0.7796, + "num_input_tokens_seen": 49572096, + "step": 85380 + }, + { + "epoch": 12.717456061960084, + "grad_norm": 0.057373046875, + "learning_rate": 0.01057111439870918, + "loss": 0.7754, + "num_input_tokens_seen": 49574976, + "step": 85385 + }, + { + "epoch": 12.718200774501042, + "grad_norm": 0.07421875, + "learning_rate": 0.010569251704161519, + "loss": 0.7725, + "num_input_tokens_seen": 49577632, + "step": 85390 + }, + { + "epoch": 12.718945487042001, + "grad_norm": 0.05126953125, + "learning_rate": 0.010567389084466854, + "loss": 0.8485, + "num_input_tokens_seen": 49580576, + "step": 85395 + }, + { + "epoch": 12.719690199582962, + "grad_norm": 0.0712890625, + "learning_rate": 0.010565526539656644, + "loss": 0.7883, + "num_input_tokens_seen": 49583584, + "step": 85400 + }, + { + "epoch": 12.72043491212392, + "grad_norm": 0.059814453125, + "learning_rate": 0.010563664069762353, + "loss": 0.7568, + "num_input_tokens_seen": 49586560, + "step": 85405 + }, + { + "epoch": 12.72117962466488, + "grad_norm": 0.0703125, + "learning_rate": 0.010561801674815454, + "loss": 0.795, + "num_input_tokens_seen": 49589056, + "step": 85410 + }, + { + "epoch": 12.721924337205838, + "grad_norm": 0.072265625, + "learning_rate": 0.010559939354847395, + "loss": 0.7942, + "num_input_tokens_seen": 49592096, + "step": 85415 + }, + { + "epoch": 12.722669049746798, + "grad_norm": 0.047119140625, + "learning_rate": 0.010558077109889655, + "loss": 0.8615, + "num_input_tokens_seen": 49595072, + "step": 85420 + }, + { + "epoch": 12.723413762287757, + "grad_norm": 0.08740234375, + "learning_rate": 0.010556214939973685, + "loss": 0.8015, + "num_input_tokens_seen": 49597792, + "step": 85425 + }, + { + "epoch": 12.724158474828716, + "grad_norm": 0.07470703125, + "learning_rate": 0.010554352845130949, + "loss": 0.7806, + "num_input_tokens_seen": 49600832, + "step": 85430 + }, + { + "epoch": 12.724903187369675, + "grad_norm": 0.0859375, + "learning_rate": 0.010552490825392906, + "loss": 0.8155, + "num_input_tokens_seen": 49603488, + "step": 85435 + }, + { + "epoch": 12.725647899910635, + "grad_norm": 0.06689453125, + "learning_rate": 0.010550628880791005, + "loss": 0.7792, + "num_input_tokens_seen": 49606720, + "step": 85440 + }, + { + "epoch": 12.726392612451594, + "grad_norm": 0.052490234375, + "learning_rate": 0.010548767011356713, + "loss": 0.8078, + "num_input_tokens_seen": 49609664, + "step": 85445 + }, + { + "epoch": 12.727137324992553, + "grad_norm": 0.044189453125, + "learning_rate": 0.01054690521712147, + "loss": 0.7716, + "num_input_tokens_seen": 49612480, + "step": 85450 + }, + { + "epoch": 12.727882037533512, + "grad_norm": 0.046630859375, + "learning_rate": 0.010545043498116746, + "loss": 0.7967, + "num_input_tokens_seen": 49615488, + "step": 85455 + }, + { + "epoch": 12.728626750074472, + "grad_norm": 0.11083984375, + "learning_rate": 0.010543181854373975, + "loss": 0.7942, + "num_input_tokens_seen": 49618176, + "step": 85460 + }, + { + "epoch": 12.72937146261543, + "grad_norm": 0.052734375, + "learning_rate": 0.010541320285924624, + "loss": 0.8144, + "num_input_tokens_seen": 49620864, + "step": 85465 + }, + { + "epoch": 12.73011617515639, + "grad_norm": 0.0625, + "learning_rate": 0.010539458792800133, + "loss": 0.8029, + "num_input_tokens_seen": 49623648, + "step": 85470 + }, + { + "epoch": 12.730860887697348, + "grad_norm": 0.052490234375, + "learning_rate": 0.010537597375031956, + "loss": 0.7846, + "num_input_tokens_seen": 49626912, + "step": 85475 + }, + { + "epoch": 12.731605600238307, + "grad_norm": 0.0703125, + "learning_rate": 0.010535736032651535, + "loss": 0.8095, + "num_input_tokens_seen": 49629856, + "step": 85480 + }, + { + "epoch": 12.732350312779268, + "grad_norm": 0.047607421875, + "learning_rate": 0.01053387476569031, + "loss": 0.8164, + "num_input_tokens_seen": 49632768, + "step": 85485 + }, + { + "epoch": 12.733095025320226, + "grad_norm": 0.032958984375, + "learning_rate": 0.010532013574179735, + "loss": 0.8023, + "num_input_tokens_seen": 49635680, + "step": 85490 + }, + { + "epoch": 12.733839737861185, + "grad_norm": 0.040771484375, + "learning_rate": 0.010530152458151247, + "loss": 0.7934, + "num_input_tokens_seen": 49638336, + "step": 85495 + }, + { + "epoch": 12.734584450402146, + "grad_norm": 0.052490234375, + "learning_rate": 0.010528291417636296, + "loss": 0.8036, + "num_input_tokens_seen": 49641216, + "step": 85500 + }, + { + "epoch": 12.735329162943104, + "grad_norm": 0.050048828125, + "learning_rate": 0.010526430452666314, + "loss": 0.8109, + "num_input_tokens_seen": 49644544, + "step": 85505 + }, + { + "epoch": 12.736073875484063, + "grad_norm": 0.04443359375, + "learning_rate": 0.010524569563272745, + "loss": 0.793, + "num_input_tokens_seen": 49647488, + "step": 85510 + }, + { + "epoch": 12.736818588025022, + "grad_norm": 0.0225830078125, + "learning_rate": 0.010522708749487025, + "loss": 0.7982, + "num_input_tokens_seen": 49650112, + "step": 85515 + }, + { + "epoch": 12.73756330056598, + "grad_norm": 0.0301513671875, + "learning_rate": 0.010520848011340584, + "loss": 0.8061, + "num_input_tokens_seen": 49652800, + "step": 85520 + }, + { + "epoch": 12.738308013106941, + "grad_norm": 0.038330078125, + "learning_rate": 0.010518987348864868, + "loss": 0.813, + "num_input_tokens_seen": 49655680, + "step": 85525 + }, + { + "epoch": 12.7390527256479, + "grad_norm": 0.046142578125, + "learning_rate": 0.010517126762091305, + "loss": 0.7815, + "num_input_tokens_seen": 49658496, + "step": 85530 + }, + { + "epoch": 12.739797438188859, + "grad_norm": 0.041015625, + "learning_rate": 0.01051526625105133, + "loss": 0.841, + "num_input_tokens_seen": 49661472, + "step": 85535 + }, + { + "epoch": 12.740542150729818, + "grad_norm": 0.04150390625, + "learning_rate": 0.01051340581577637, + "loss": 0.8007, + "num_input_tokens_seen": 49664448, + "step": 85540 + }, + { + "epoch": 12.741286863270778, + "grad_norm": 0.060546875, + "learning_rate": 0.010511545456297866, + "loss": 0.8085, + "num_input_tokens_seen": 49667424, + "step": 85545 + }, + { + "epoch": 12.742031575811737, + "grad_norm": 0.022705078125, + "learning_rate": 0.010509685172647231, + "loss": 0.8014, + "num_input_tokens_seen": 49670400, + "step": 85550 + }, + { + "epoch": 12.742776288352696, + "grad_norm": 0.0303955078125, + "learning_rate": 0.01050782496485591, + "loss": 0.7981, + "num_input_tokens_seen": 49673024, + "step": 85555 + }, + { + "epoch": 12.743521000893654, + "grad_norm": 0.02978515625, + "learning_rate": 0.010505964832955319, + "loss": 0.8024, + "num_input_tokens_seen": 49676320, + "step": 85560 + }, + { + "epoch": 12.744265713434615, + "grad_norm": 0.035888671875, + "learning_rate": 0.01050410477697688, + "loss": 0.8067, + "num_input_tokens_seen": 49679360, + "step": 85565 + }, + { + "epoch": 12.745010425975574, + "grad_norm": 0.032958984375, + "learning_rate": 0.010502244796952024, + "loss": 0.8017, + "num_input_tokens_seen": 49682176, + "step": 85570 + }, + { + "epoch": 12.745755138516532, + "grad_norm": 0.03857421875, + "learning_rate": 0.010500384892912168, + "loss": 0.7916, + "num_input_tokens_seen": 49684928, + "step": 85575 + }, + { + "epoch": 12.746499851057491, + "grad_norm": 0.04052734375, + "learning_rate": 0.01049852506488874, + "loss": 0.8167, + "num_input_tokens_seen": 49687808, + "step": 85580 + }, + { + "epoch": 12.747244563598452, + "grad_norm": 0.032958984375, + "learning_rate": 0.01049666531291315, + "loss": 0.7873, + "num_input_tokens_seen": 49690752, + "step": 85585 + }, + { + "epoch": 12.74798927613941, + "grad_norm": 0.043212890625, + "learning_rate": 0.01049480563701683, + "loss": 0.7987, + "num_input_tokens_seen": 49693696, + "step": 85590 + }, + { + "epoch": 12.74873398868037, + "grad_norm": 0.043701171875, + "learning_rate": 0.01049294603723119, + "loss": 0.7911, + "num_input_tokens_seen": 49697024, + "step": 85595 + }, + { + "epoch": 12.749478701221328, + "grad_norm": 0.051025390625, + "learning_rate": 0.01049108651358764, + "loss": 0.773, + "num_input_tokens_seen": 49699712, + "step": 85600 + }, + { + "epoch": 12.750223413762289, + "grad_norm": 0.030517578125, + "learning_rate": 0.010489227066117606, + "loss": 0.8121, + "num_input_tokens_seen": 49702464, + "step": 85605 + }, + { + "epoch": 12.750968126303247, + "grad_norm": 0.0322265625, + "learning_rate": 0.010487367694852489, + "loss": 0.7849, + "num_input_tokens_seen": 49705184, + "step": 85610 + }, + { + "epoch": 12.751712838844206, + "grad_norm": 0.0322265625, + "learning_rate": 0.010485508399823717, + "loss": 0.8287, + "num_input_tokens_seen": 49708192, + "step": 85615 + }, + { + "epoch": 12.752457551385165, + "grad_norm": 0.042236328125, + "learning_rate": 0.010483649181062683, + "loss": 0.7967, + "num_input_tokens_seen": 49710912, + "step": 85620 + }, + { + "epoch": 12.753202263926125, + "grad_norm": 0.048583984375, + "learning_rate": 0.010481790038600815, + "loss": 0.7786, + "num_input_tokens_seen": 49713536, + "step": 85625 + }, + { + "epoch": 12.753946976467084, + "grad_norm": 0.035400390625, + "learning_rate": 0.010479930972469506, + "loss": 0.8001, + "num_input_tokens_seen": 49716320, + "step": 85630 + }, + { + "epoch": 12.754691689008043, + "grad_norm": 0.07275390625, + "learning_rate": 0.010478071982700175, + "loss": 0.7741, + "num_input_tokens_seen": 49719424, + "step": 85635 + }, + { + "epoch": 12.755436401549002, + "grad_norm": 0.068359375, + "learning_rate": 0.01047621306932422, + "loss": 0.7986, + "num_input_tokens_seen": 49722240, + "step": 85640 + }, + { + "epoch": 12.756181114089962, + "grad_norm": 0.022216796875, + "learning_rate": 0.010474354232373043, + "loss": 0.8105, + "num_input_tokens_seen": 49724864, + "step": 85645 + }, + { + "epoch": 12.756925826630921, + "grad_norm": 0.03662109375, + "learning_rate": 0.010472495471878054, + "loss": 0.798, + "num_input_tokens_seen": 49727776, + "step": 85650 + }, + { + "epoch": 12.75767053917188, + "grad_norm": 0.044677734375, + "learning_rate": 0.010470636787870649, + "loss": 0.7916, + "num_input_tokens_seen": 49730752, + "step": 85655 + }, + { + "epoch": 12.758415251712838, + "grad_norm": 0.0303955078125, + "learning_rate": 0.010468778180382238, + "loss": 0.8048, + "num_input_tokens_seen": 49733504, + "step": 85660 + }, + { + "epoch": 12.759159964253797, + "grad_norm": 0.0224609375, + "learning_rate": 0.010466919649444214, + "loss": 0.8019, + "num_input_tokens_seen": 49736480, + "step": 85665 + }, + { + "epoch": 12.759904676794758, + "grad_norm": 0.038330078125, + "learning_rate": 0.010465061195087975, + "loss": 0.8126, + "num_input_tokens_seen": 49739520, + "step": 85670 + }, + { + "epoch": 12.760649389335716, + "grad_norm": 0.03466796875, + "learning_rate": 0.010463202817344919, + "loss": 0.7794, + "num_input_tokens_seen": 49742272, + "step": 85675 + }, + { + "epoch": 12.761394101876675, + "grad_norm": 0.03173828125, + "learning_rate": 0.010461344516246434, + "loss": 0.8048, + "num_input_tokens_seen": 49745120, + "step": 85680 + }, + { + "epoch": 12.762138814417634, + "grad_norm": 0.033447265625, + "learning_rate": 0.010459486291823927, + "loss": 0.8011, + "num_input_tokens_seen": 49748096, + "step": 85685 + }, + { + "epoch": 12.762883526958595, + "grad_norm": 0.036865234375, + "learning_rate": 0.010457628144108777, + "loss": 0.8027, + "num_input_tokens_seen": 49750880, + "step": 85690 + }, + { + "epoch": 12.763628239499553, + "grad_norm": 0.0478515625, + "learning_rate": 0.01045577007313239, + "loss": 0.799, + "num_input_tokens_seen": 49753824, + "step": 85695 + }, + { + "epoch": 12.764372952040512, + "grad_norm": 0.0250244140625, + "learning_rate": 0.010453912078926146, + "loss": 0.7787, + "num_input_tokens_seen": 49756640, + "step": 85700 + }, + { + "epoch": 12.76511766458147, + "grad_norm": 0.058349609375, + "learning_rate": 0.010452054161521441, + "loss": 0.7807, + "num_input_tokens_seen": 49759648, + "step": 85705 + }, + { + "epoch": 12.765862377122431, + "grad_norm": 0.0390625, + "learning_rate": 0.01045019632094965, + "loss": 0.7866, + "num_input_tokens_seen": 49762592, + "step": 85710 + }, + { + "epoch": 12.76660708966339, + "grad_norm": 0.0380859375, + "learning_rate": 0.010448338557242176, + "loss": 0.7928, + "num_input_tokens_seen": 49765600, + "step": 85715 + }, + { + "epoch": 12.767351802204349, + "grad_norm": 0.03857421875, + "learning_rate": 0.010446480870430397, + "loss": 0.7769, + "num_input_tokens_seen": 49768352, + "step": 85720 + }, + { + "epoch": 12.768096514745308, + "grad_norm": 0.0240478515625, + "learning_rate": 0.010444623260545686, + "loss": 0.8113, + "num_input_tokens_seen": 49771520, + "step": 85725 + }, + { + "epoch": 12.768841227286268, + "grad_norm": 0.054443359375, + "learning_rate": 0.010442765727619444, + "loss": 0.8108, + "num_input_tokens_seen": 49774336, + "step": 85730 + }, + { + "epoch": 12.769585939827227, + "grad_norm": 0.29296875, + "learning_rate": 0.01044090827168304, + "loss": 0.8468, + "num_input_tokens_seen": 49777152, + "step": 85735 + }, + { + "epoch": 12.770330652368186, + "grad_norm": 0.03466796875, + "learning_rate": 0.01043905089276786, + "loss": 0.8309, + "num_input_tokens_seen": 49779840, + "step": 85740 + }, + { + "epoch": 12.771075364909144, + "grad_norm": 0.0234375, + "learning_rate": 0.010437193590905272, + "loss": 0.7882, + "num_input_tokens_seen": 49782592, + "step": 85745 + }, + { + "epoch": 12.771820077450105, + "grad_norm": 0.037841796875, + "learning_rate": 0.010435336366126669, + "loss": 0.8127, + "num_input_tokens_seen": 49785760, + "step": 85750 + }, + { + "epoch": 12.772564789991064, + "grad_norm": 0.0267333984375, + "learning_rate": 0.010433479218463418, + "loss": 0.7896, + "num_input_tokens_seen": 49788640, + "step": 85755 + }, + { + "epoch": 12.773309502532022, + "grad_norm": 0.0235595703125, + "learning_rate": 0.010431622147946887, + "loss": 0.7874, + "num_input_tokens_seen": 49791392, + "step": 85760 + }, + { + "epoch": 12.774054215072981, + "grad_norm": 0.0289306640625, + "learning_rate": 0.010429765154608465, + "loss": 0.7811, + "num_input_tokens_seen": 49794208, + "step": 85765 + }, + { + "epoch": 12.774798927613942, + "grad_norm": 0.033447265625, + "learning_rate": 0.010427908238479512, + "loss": 0.8149, + "num_input_tokens_seen": 49797056, + "step": 85770 + }, + { + "epoch": 12.7755436401549, + "grad_norm": 0.027099609375, + "learning_rate": 0.010426051399591404, + "loss": 0.8039, + "num_input_tokens_seen": 49799776, + "step": 85775 + }, + { + "epoch": 12.77628835269586, + "grad_norm": 0.03662109375, + "learning_rate": 0.010424194637975503, + "loss": 0.7876, + "num_input_tokens_seen": 49802688, + "step": 85780 + }, + { + "epoch": 12.777033065236818, + "grad_norm": 0.042724609375, + "learning_rate": 0.010422337953663192, + "loss": 0.7719, + "num_input_tokens_seen": 49805696, + "step": 85785 + }, + { + "epoch": 12.777777777777779, + "grad_norm": 0.03759765625, + "learning_rate": 0.010420481346685821, + "loss": 0.8054, + "num_input_tokens_seen": 49808704, + "step": 85790 + }, + { + "epoch": 12.778522490318737, + "grad_norm": 0.028564453125, + "learning_rate": 0.010418624817074767, + "loss": 0.7835, + "num_input_tokens_seen": 49811712, + "step": 85795 + }, + { + "epoch": 12.779267202859696, + "grad_norm": 0.0294189453125, + "learning_rate": 0.010416768364861394, + "loss": 0.7846, + "num_input_tokens_seen": 49814688, + "step": 85800 + }, + { + "epoch": 12.780011915400655, + "grad_norm": 0.0228271484375, + "learning_rate": 0.010414911990077056, + "loss": 0.8078, + "num_input_tokens_seen": 49817472, + "step": 85805 + }, + { + "epoch": 12.780756627941614, + "grad_norm": 0.02880859375, + "learning_rate": 0.010413055692753125, + "loss": 0.7775, + "num_input_tokens_seen": 49820384, + "step": 85810 + }, + { + "epoch": 12.781501340482574, + "grad_norm": 0.0634765625, + "learning_rate": 0.010411199472920947, + "loss": 0.8085, + "num_input_tokens_seen": 49823456, + "step": 85815 + }, + { + "epoch": 12.782246053023533, + "grad_norm": 0.0206298828125, + "learning_rate": 0.010409343330611897, + "loss": 0.8119, + "num_input_tokens_seen": 49826272, + "step": 85820 + }, + { + "epoch": 12.782990765564492, + "grad_norm": 0.029052734375, + "learning_rate": 0.010407487265857322, + "loss": 0.8148, + "num_input_tokens_seen": 49829216, + "step": 85825 + }, + { + "epoch": 12.783735478105452, + "grad_norm": 0.0286865234375, + "learning_rate": 0.010405631278688586, + "loss": 0.7931, + "num_input_tokens_seen": 49832128, + "step": 85830 + }, + { + "epoch": 12.784480190646411, + "grad_norm": 0.041259765625, + "learning_rate": 0.010403775369137037, + "loss": 0.7918, + "num_input_tokens_seen": 49835136, + "step": 85835 + }, + { + "epoch": 12.78522490318737, + "grad_norm": 0.04345703125, + "learning_rate": 0.010401919537234036, + "loss": 0.7666, + "num_input_tokens_seen": 49837984, + "step": 85840 + }, + { + "epoch": 12.785969615728328, + "grad_norm": 0.0203857421875, + "learning_rate": 0.01040006378301093, + "loss": 0.7752, + "num_input_tokens_seen": 49840896, + "step": 85845 + }, + { + "epoch": 12.786714328269287, + "grad_norm": 0.0281982421875, + "learning_rate": 0.010398208106499066, + "loss": 0.8056, + "num_input_tokens_seen": 49843872, + "step": 85850 + }, + { + "epoch": 12.787459040810248, + "grad_norm": 0.06982421875, + "learning_rate": 0.010396352507729804, + "loss": 0.7826, + "num_input_tokens_seen": 49846944, + "step": 85855 + }, + { + "epoch": 12.788203753351207, + "grad_norm": 0.036865234375, + "learning_rate": 0.010394496986734484, + "loss": 0.7803, + "num_input_tokens_seen": 49849760, + "step": 85860 + }, + { + "epoch": 12.788948465892165, + "grad_norm": 0.041259765625, + "learning_rate": 0.010392641543544459, + "loss": 0.8135, + "num_input_tokens_seen": 49852800, + "step": 85865 + }, + { + "epoch": 12.789693178433124, + "grad_norm": 0.02783203125, + "learning_rate": 0.010390786178191069, + "loss": 0.7876, + "num_input_tokens_seen": 49855712, + "step": 85870 + }, + { + "epoch": 12.790437890974085, + "grad_norm": 0.0194091796875, + "learning_rate": 0.010388930890705667, + "loss": 0.8053, + "num_input_tokens_seen": 49858560, + "step": 85875 + }, + { + "epoch": 12.791182603515043, + "grad_norm": 0.05126953125, + "learning_rate": 0.010387075681119592, + "loss": 0.8064, + "num_input_tokens_seen": 49861344, + "step": 85880 + }, + { + "epoch": 12.791927316056002, + "grad_norm": 0.048583984375, + "learning_rate": 0.010385220549464178, + "loss": 0.7979, + "num_input_tokens_seen": 49864032, + "step": 85885 + }, + { + "epoch": 12.79267202859696, + "grad_norm": 0.021728515625, + "learning_rate": 0.010383365495770779, + "loss": 0.8042, + "num_input_tokens_seen": 49866912, + "step": 85890 + }, + { + "epoch": 12.793416741137921, + "grad_norm": 0.0478515625, + "learning_rate": 0.010381510520070723, + "loss": 0.7736, + "num_input_tokens_seen": 49869504, + "step": 85895 + }, + { + "epoch": 12.79416145367888, + "grad_norm": 0.028564453125, + "learning_rate": 0.010379655622395359, + "loss": 0.8059, + "num_input_tokens_seen": 49872160, + "step": 85900 + }, + { + "epoch": 12.794906166219839, + "grad_norm": 0.04833984375, + "learning_rate": 0.010377800802776008, + "loss": 0.806, + "num_input_tokens_seen": 49875264, + "step": 85905 + }, + { + "epoch": 12.795650878760798, + "grad_norm": 0.04248046875, + "learning_rate": 0.010375946061244022, + "loss": 0.7824, + "num_input_tokens_seen": 49877920, + "step": 85910 + }, + { + "epoch": 12.796395591301758, + "grad_norm": 0.0400390625, + "learning_rate": 0.010374091397830723, + "loss": 0.8151, + "num_input_tokens_seen": 49880672, + "step": 85915 + }, + { + "epoch": 12.797140303842717, + "grad_norm": 0.057861328125, + "learning_rate": 0.010372236812567453, + "loss": 0.8094, + "num_input_tokens_seen": 49883424, + "step": 85920 + }, + { + "epoch": 12.797885016383676, + "grad_norm": 0.0311279296875, + "learning_rate": 0.010370382305485539, + "loss": 0.7962, + "num_input_tokens_seen": 49886368, + "step": 85925 + }, + { + "epoch": 12.798629728924634, + "grad_norm": 0.034912109375, + "learning_rate": 0.010368527876616306, + "loss": 0.7996, + "num_input_tokens_seen": 49889280, + "step": 85930 + }, + { + "epoch": 12.799374441465595, + "grad_norm": 0.043212890625, + "learning_rate": 0.01036667352599109, + "loss": 0.7918, + "num_input_tokens_seen": 49892352, + "step": 85935 + }, + { + "epoch": 12.800119154006554, + "grad_norm": 0.033203125, + "learning_rate": 0.010364819253641213, + "loss": 0.8002, + "num_input_tokens_seen": 49895040, + "step": 85940 + }, + { + "epoch": 12.800863866547513, + "grad_norm": 0.0301513671875, + "learning_rate": 0.010362965059598009, + "loss": 0.7745, + "num_input_tokens_seen": 49897600, + "step": 85945 + }, + { + "epoch": 12.801608579088471, + "grad_norm": 0.048583984375, + "learning_rate": 0.01036111094389279, + "loss": 0.8078, + "num_input_tokens_seen": 49900832, + "step": 85950 + }, + { + "epoch": 12.802353291629432, + "grad_norm": 0.030517578125, + "learning_rate": 0.010359256906556893, + "loss": 0.7848, + "num_input_tokens_seen": 49903840, + "step": 85955 + }, + { + "epoch": 12.80309800417039, + "grad_norm": 0.052490234375, + "learning_rate": 0.010357402947621634, + "loss": 0.8055, + "num_input_tokens_seen": 49906848, + "step": 85960 + }, + { + "epoch": 12.80384271671135, + "grad_norm": 0.02880859375, + "learning_rate": 0.01035554906711833, + "loss": 0.7895, + "num_input_tokens_seen": 49909536, + "step": 85965 + }, + { + "epoch": 12.804587429252308, + "grad_norm": 0.0400390625, + "learning_rate": 0.010353695265078308, + "loss": 0.7836, + "num_input_tokens_seen": 49912384, + "step": 85970 + }, + { + "epoch": 12.805332141793269, + "grad_norm": 0.05712890625, + "learning_rate": 0.010351841541532878, + "loss": 0.7885, + "num_input_tokens_seen": 49915616, + "step": 85975 + }, + { + "epoch": 12.806076854334227, + "grad_norm": 0.046630859375, + "learning_rate": 0.010349987896513363, + "loss": 0.7888, + "num_input_tokens_seen": 49918624, + "step": 85980 + }, + { + "epoch": 12.806821566875186, + "grad_norm": 0.06591796875, + "learning_rate": 0.010348134330051073, + "loss": 0.7817, + "num_input_tokens_seen": 49921984, + "step": 85985 + }, + { + "epoch": 12.807566279416145, + "grad_norm": 0.04345703125, + "learning_rate": 0.01034628084217733, + "loss": 0.8411, + "num_input_tokens_seen": 49924576, + "step": 85990 + }, + { + "epoch": 12.808310991957104, + "grad_norm": 0.0223388671875, + "learning_rate": 0.010344427432923442, + "loss": 0.8057, + "num_input_tokens_seen": 49927296, + "step": 85995 + }, + { + "epoch": 12.809055704498064, + "grad_norm": 0.060546875, + "learning_rate": 0.010342574102320723, + "loss": 0.8356, + "num_input_tokens_seen": 49930304, + "step": 86000 + }, + { + "epoch": 12.809800417039023, + "grad_norm": 0.0294189453125, + "learning_rate": 0.010340720850400479, + "loss": 0.7937, + "num_input_tokens_seen": 49933376, + "step": 86005 + }, + { + "epoch": 12.810545129579982, + "grad_norm": 0.05029296875, + "learning_rate": 0.010338867677194016, + "loss": 0.7876, + "num_input_tokens_seen": 49936000, + "step": 86010 + }, + { + "epoch": 12.811289842120942, + "grad_norm": 0.048583984375, + "learning_rate": 0.010337014582732652, + "loss": 0.7889, + "num_input_tokens_seen": 49938976, + "step": 86015 + }, + { + "epoch": 12.812034554661901, + "grad_norm": 0.0517578125, + "learning_rate": 0.01033516156704768, + "loss": 0.787, + "num_input_tokens_seen": 49941888, + "step": 86020 + }, + { + "epoch": 12.81277926720286, + "grad_norm": 0.03369140625, + "learning_rate": 0.01033330863017042, + "loss": 0.7909, + "num_input_tokens_seen": 49944544, + "step": 86025 + }, + { + "epoch": 12.813523979743819, + "grad_norm": 0.053955078125, + "learning_rate": 0.010331455772132165, + "loss": 0.7961, + "num_input_tokens_seen": 49947200, + "step": 86030 + }, + { + "epoch": 12.814268692284777, + "grad_norm": 0.049072265625, + "learning_rate": 0.010329602992964223, + "loss": 0.8113, + "num_input_tokens_seen": 49950112, + "step": 86035 + }, + { + "epoch": 12.815013404825738, + "grad_norm": 0.0458984375, + "learning_rate": 0.010327750292697892, + "loss": 0.7942, + "num_input_tokens_seen": 49952832, + "step": 86040 + }, + { + "epoch": 12.815758117366697, + "grad_norm": 0.044921875, + "learning_rate": 0.010325897671364463, + "loss": 0.8003, + "num_input_tokens_seen": 49955712, + "step": 86045 + }, + { + "epoch": 12.816502829907655, + "grad_norm": 0.034912109375, + "learning_rate": 0.010324045128995249, + "loss": 0.8259, + "num_input_tokens_seen": 49958720, + "step": 86050 + }, + { + "epoch": 12.817247542448614, + "grad_norm": 0.033203125, + "learning_rate": 0.010322192665621534, + "loss": 0.8147, + "num_input_tokens_seen": 49961760, + "step": 86055 + }, + { + "epoch": 12.817992254989575, + "grad_norm": 0.0439453125, + "learning_rate": 0.010320340281274626, + "loss": 0.7908, + "num_input_tokens_seen": 49964576, + "step": 86060 + }, + { + "epoch": 12.818736967530533, + "grad_norm": 0.033447265625, + "learning_rate": 0.01031848797598581, + "loss": 0.8338, + "num_input_tokens_seen": 49967584, + "step": 86065 + }, + { + "epoch": 12.819481680071492, + "grad_norm": 0.04296875, + "learning_rate": 0.010316635749786385, + "loss": 0.8054, + "num_input_tokens_seen": 49970624, + "step": 86070 + }, + { + "epoch": 12.820226392612451, + "grad_norm": 0.06201171875, + "learning_rate": 0.010314783602707633, + "loss": 0.794, + "num_input_tokens_seen": 49973312, + "step": 86075 + }, + { + "epoch": 12.820971105153411, + "grad_norm": 0.041015625, + "learning_rate": 0.010312931534780856, + "loss": 0.8129, + "num_input_tokens_seen": 49976448, + "step": 86080 + }, + { + "epoch": 12.82171581769437, + "grad_norm": 0.044677734375, + "learning_rate": 0.010311079546037338, + "loss": 0.7738, + "num_input_tokens_seen": 49979488, + "step": 86085 + }, + { + "epoch": 12.822460530235329, + "grad_norm": 0.02197265625, + "learning_rate": 0.010309227636508357, + "loss": 0.8135, + "num_input_tokens_seen": 49982240, + "step": 86090 + }, + { + "epoch": 12.823205242776288, + "grad_norm": 0.03125, + "learning_rate": 0.010307375806225212, + "loss": 0.8038, + "num_input_tokens_seen": 49985088, + "step": 86095 + }, + { + "epoch": 12.823949955317248, + "grad_norm": 0.0380859375, + "learning_rate": 0.010305524055219183, + "loss": 0.8103, + "num_input_tokens_seen": 49988192, + "step": 86100 + }, + { + "epoch": 12.824694667858207, + "grad_norm": 0.032958984375, + "learning_rate": 0.010303672383521558, + "loss": 0.7982, + "num_input_tokens_seen": 49991392, + "step": 86105 + }, + { + "epoch": 12.825439380399166, + "grad_norm": 0.056884765625, + "learning_rate": 0.010301820791163607, + "loss": 0.8197, + "num_input_tokens_seen": 49994496, + "step": 86110 + }, + { + "epoch": 12.826184092940125, + "grad_norm": 0.055908203125, + "learning_rate": 0.010299969278176625, + "loss": 0.7893, + "num_input_tokens_seen": 49997408, + "step": 86115 + }, + { + "epoch": 12.826928805481085, + "grad_norm": 0.0322265625, + "learning_rate": 0.010298117844591888, + "loss": 0.7873, + "num_input_tokens_seen": 50000288, + "step": 86120 + }, + { + "epoch": 12.827673518022044, + "grad_norm": 0.031005859375, + "learning_rate": 0.010296266490440662, + "loss": 0.8211, + "num_input_tokens_seen": 50003264, + "step": 86125 + }, + { + "epoch": 12.828418230563003, + "grad_norm": 0.0294189453125, + "learning_rate": 0.01029441521575424, + "loss": 0.7869, + "num_input_tokens_seen": 50005920, + "step": 86130 + }, + { + "epoch": 12.829162943103961, + "grad_norm": 0.041748046875, + "learning_rate": 0.010292564020563888, + "loss": 0.7794, + "num_input_tokens_seen": 50008800, + "step": 86135 + }, + { + "epoch": 12.829907655644922, + "grad_norm": 0.03564453125, + "learning_rate": 0.010290712904900884, + "loss": 0.7987, + "num_input_tokens_seen": 50012352, + "step": 86140 + }, + { + "epoch": 12.83065236818588, + "grad_norm": 0.038818359375, + "learning_rate": 0.010288861868796493, + "loss": 0.803, + "num_input_tokens_seen": 50015424, + "step": 86145 + }, + { + "epoch": 12.83139708072684, + "grad_norm": 0.0322265625, + "learning_rate": 0.010287010912282001, + "loss": 0.7941, + "num_input_tokens_seen": 50018176, + "step": 86150 + }, + { + "epoch": 12.832141793267798, + "grad_norm": 0.06787109375, + "learning_rate": 0.01028516003538866, + "loss": 0.7931, + "num_input_tokens_seen": 50021024, + "step": 86155 + }, + { + "epoch": 12.832886505808759, + "grad_norm": 0.03369140625, + "learning_rate": 0.010283309238147758, + "loss": 0.7784, + "num_input_tokens_seen": 50023968, + "step": 86160 + }, + { + "epoch": 12.833631218349717, + "grad_norm": 0.056884765625, + "learning_rate": 0.01028145852059055, + "loss": 0.8209, + "num_input_tokens_seen": 50027104, + "step": 86165 + }, + { + "epoch": 12.834375930890676, + "grad_norm": 0.03955078125, + "learning_rate": 0.010279607882748303, + "loss": 0.7865, + "num_input_tokens_seen": 50030016, + "step": 86170 + }, + { + "epoch": 12.835120643431635, + "grad_norm": 0.048095703125, + "learning_rate": 0.010277757324652286, + "loss": 0.7966, + "num_input_tokens_seen": 50032800, + "step": 86175 + }, + { + "epoch": 12.835865355972594, + "grad_norm": 0.08935546875, + "learning_rate": 0.010275906846333753, + "loss": 0.7894, + "num_input_tokens_seen": 50035872, + "step": 86180 + }, + { + "epoch": 12.836610068513554, + "grad_norm": 0.0322265625, + "learning_rate": 0.01027405644782398, + "loss": 0.8183, + "num_input_tokens_seen": 50038592, + "step": 86185 + }, + { + "epoch": 12.837354781054513, + "grad_norm": 0.03369140625, + "learning_rate": 0.010272206129154214, + "loss": 0.8156, + "num_input_tokens_seen": 50041472, + "step": 86190 + }, + { + "epoch": 12.838099493595472, + "grad_norm": 0.0400390625, + "learning_rate": 0.010270355890355723, + "loss": 0.8261, + "num_input_tokens_seen": 50044512, + "step": 86195 + }, + { + "epoch": 12.83884420613643, + "grad_norm": 0.048828125, + "learning_rate": 0.010268505731459765, + "loss": 0.7974, + "num_input_tokens_seen": 50047136, + "step": 86200 + }, + { + "epoch": 12.839588918677391, + "grad_norm": 0.0380859375, + "learning_rate": 0.010266655652497588, + "loss": 0.7772, + "num_input_tokens_seen": 50050272, + "step": 86205 + }, + { + "epoch": 12.84033363121835, + "grad_norm": 0.10986328125, + "learning_rate": 0.01026480565350046, + "loss": 0.8072, + "num_input_tokens_seen": 50053088, + "step": 86210 + }, + { + "epoch": 12.841078343759309, + "grad_norm": 0.0322265625, + "learning_rate": 0.01026295573449962, + "loss": 0.7881, + "num_input_tokens_seen": 50056000, + "step": 86215 + }, + { + "epoch": 12.841823056300267, + "grad_norm": 0.02978515625, + "learning_rate": 0.010261105895526333, + "loss": 0.8231, + "num_input_tokens_seen": 50058880, + "step": 86220 + }, + { + "epoch": 12.842567768841228, + "grad_norm": 0.0213623046875, + "learning_rate": 0.010259256136611846, + "loss": 0.7778, + "num_input_tokens_seen": 50061920, + "step": 86225 + }, + { + "epoch": 12.843312481382187, + "grad_norm": 0.04345703125, + "learning_rate": 0.010257406457787409, + "loss": 0.7983, + "num_input_tokens_seen": 50065120, + "step": 86230 + }, + { + "epoch": 12.844057193923145, + "grad_norm": 0.04345703125, + "learning_rate": 0.010255556859084263, + "loss": 0.7772, + "num_input_tokens_seen": 50067936, + "step": 86235 + }, + { + "epoch": 12.844801906464104, + "grad_norm": 0.031494140625, + "learning_rate": 0.010253707340533668, + "loss": 0.798, + "num_input_tokens_seen": 50070784, + "step": 86240 + }, + { + "epoch": 12.845546619005065, + "grad_norm": 0.0322265625, + "learning_rate": 0.010251857902166865, + "loss": 0.766, + "num_input_tokens_seen": 50073504, + "step": 86245 + }, + { + "epoch": 12.846291331546023, + "grad_norm": 0.042236328125, + "learning_rate": 0.010250008544015088, + "loss": 0.8052, + "num_input_tokens_seen": 50076608, + "step": 86250 + }, + { + "epoch": 12.847036044086982, + "grad_norm": 0.042724609375, + "learning_rate": 0.0102481592661096, + "loss": 0.7888, + "num_input_tokens_seen": 50079648, + "step": 86255 + }, + { + "epoch": 12.847780756627941, + "grad_norm": 0.03955078125, + "learning_rate": 0.010246310068481625, + "loss": 0.8013, + "num_input_tokens_seen": 50082336, + "step": 86260 + }, + { + "epoch": 12.848525469168901, + "grad_norm": 0.0196533203125, + "learning_rate": 0.010244460951162413, + "loss": 0.7847, + "num_input_tokens_seen": 50085248, + "step": 86265 + }, + { + "epoch": 12.84927018170986, + "grad_norm": 0.029541015625, + "learning_rate": 0.010242611914183195, + "loss": 0.7802, + "num_input_tokens_seen": 50087968, + "step": 86270 + }, + { + "epoch": 12.850014894250819, + "grad_norm": 0.042724609375, + "learning_rate": 0.01024076295757522, + "loss": 0.7902, + "num_input_tokens_seen": 50090656, + "step": 86275 + }, + { + "epoch": 12.850759606791778, + "grad_norm": 0.033203125, + "learning_rate": 0.010238914081369717, + "loss": 0.8041, + "num_input_tokens_seen": 50093728, + "step": 86280 + }, + { + "epoch": 12.851504319332738, + "grad_norm": 0.03564453125, + "learning_rate": 0.010237065285597915, + "loss": 0.7992, + "num_input_tokens_seen": 50096384, + "step": 86285 + }, + { + "epoch": 12.852249031873697, + "grad_norm": 0.036865234375, + "learning_rate": 0.010235216570291061, + "loss": 0.805, + "num_input_tokens_seen": 50099040, + "step": 86290 + }, + { + "epoch": 12.852993744414656, + "grad_norm": 0.05126953125, + "learning_rate": 0.010233367935480377, + "loss": 0.7604, + "num_input_tokens_seen": 50102208, + "step": 86295 + }, + { + "epoch": 12.853738456955615, + "grad_norm": 0.02685546875, + "learning_rate": 0.0102315193811971, + "loss": 0.8026, + "num_input_tokens_seen": 50104960, + "step": 86300 + }, + { + "epoch": 12.854483169496575, + "grad_norm": 0.054443359375, + "learning_rate": 0.01022967090747245, + "loss": 0.8049, + "num_input_tokens_seen": 50108160, + "step": 86305 + }, + { + "epoch": 12.855227882037534, + "grad_norm": 0.0225830078125, + "learning_rate": 0.010227822514337669, + "loss": 0.784, + "num_input_tokens_seen": 50111072, + "step": 86310 + }, + { + "epoch": 12.855972594578493, + "grad_norm": 0.0380859375, + "learning_rate": 0.010225974201823971, + "loss": 0.7824, + "num_input_tokens_seen": 50113728, + "step": 86315 + }, + { + "epoch": 12.856717307119451, + "grad_norm": 0.035888671875, + "learning_rate": 0.01022412596996259, + "loss": 0.8089, + "num_input_tokens_seen": 50116544, + "step": 86320 + }, + { + "epoch": 12.857462019660412, + "grad_norm": 0.05517578125, + "learning_rate": 0.01022227781878475, + "loss": 0.7749, + "num_input_tokens_seen": 50119328, + "step": 86325 + }, + { + "epoch": 12.85820673220137, + "grad_norm": 0.044921875, + "learning_rate": 0.010220429748321665, + "loss": 0.7975, + "num_input_tokens_seen": 50122016, + "step": 86330 + }, + { + "epoch": 12.85895144474233, + "grad_norm": 0.044921875, + "learning_rate": 0.010218581758604567, + "loss": 0.7983, + "num_input_tokens_seen": 50124800, + "step": 86335 + }, + { + "epoch": 12.859696157283288, + "grad_norm": 0.0308837890625, + "learning_rate": 0.010216733849664662, + "loss": 0.7831, + "num_input_tokens_seen": 50127680, + "step": 86340 + }, + { + "epoch": 12.860440869824249, + "grad_norm": 0.03564453125, + "learning_rate": 0.010214886021533184, + "loss": 0.779, + "num_input_tokens_seen": 50131040, + "step": 86345 + }, + { + "epoch": 12.861185582365207, + "grad_norm": 0.04296875, + "learning_rate": 0.010213038274241337, + "loss": 0.8371, + "num_input_tokens_seen": 50134112, + "step": 86350 + }, + { + "epoch": 12.861930294906166, + "grad_norm": 0.032958984375, + "learning_rate": 0.01021119060782035, + "loss": 0.7915, + "num_input_tokens_seen": 50137216, + "step": 86355 + }, + { + "epoch": 12.862675007447125, + "grad_norm": 0.030029296875, + "learning_rate": 0.01020934302230143, + "loss": 0.7732, + "num_input_tokens_seen": 50139872, + "step": 86360 + }, + { + "epoch": 12.863419719988084, + "grad_norm": 0.0311279296875, + "learning_rate": 0.010207495517715787, + "loss": 0.8003, + "num_input_tokens_seen": 50142592, + "step": 86365 + }, + { + "epoch": 12.864164432529044, + "grad_norm": 0.05029296875, + "learning_rate": 0.010205648094094643, + "loss": 0.7842, + "num_input_tokens_seen": 50145504, + "step": 86370 + }, + { + "epoch": 12.864909145070003, + "grad_norm": 0.044677734375, + "learning_rate": 0.01020380075146919, + "loss": 0.8078, + "num_input_tokens_seen": 50148384, + "step": 86375 + }, + { + "epoch": 12.865653857610962, + "grad_norm": 0.041748046875, + "learning_rate": 0.010201953489870658, + "loss": 0.7977, + "num_input_tokens_seen": 50151072, + "step": 86380 + }, + { + "epoch": 12.86639857015192, + "grad_norm": 0.03173828125, + "learning_rate": 0.010200106309330237, + "loss": 0.7758, + "num_input_tokens_seen": 50154304, + "step": 86385 + }, + { + "epoch": 12.867143282692881, + "grad_norm": 0.04296875, + "learning_rate": 0.01019825920987915, + "loss": 0.8082, + "num_input_tokens_seen": 50157376, + "step": 86390 + }, + { + "epoch": 12.86788799523384, + "grad_norm": 0.05078125, + "learning_rate": 0.010196412191548591, + "loss": 0.7716, + "num_input_tokens_seen": 50160128, + "step": 86395 + }, + { + "epoch": 12.868632707774799, + "grad_norm": 0.032470703125, + "learning_rate": 0.010194565254369767, + "loss": 0.8365, + "num_input_tokens_seen": 50162944, + "step": 86400 + }, + { + "epoch": 12.869377420315757, + "grad_norm": 0.04931640625, + "learning_rate": 0.01019271839837388, + "loss": 0.8183, + "num_input_tokens_seen": 50165824, + "step": 86405 + }, + { + "epoch": 12.870122132856718, + "grad_norm": 0.03955078125, + "learning_rate": 0.010190871623592123, + "loss": 0.8114, + "num_input_tokens_seen": 50168640, + "step": 86410 + }, + { + "epoch": 12.870866845397677, + "grad_norm": 0.0322265625, + "learning_rate": 0.010189024930055707, + "loss": 0.8259, + "num_input_tokens_seen": 50171392, + "step": 86415 + }, + { + "epoch": 12.871611557938635, + "grad_norm": 0.040283203125, + "learning_rate": 0.010187178317795823, + "loss": 0.8108, + "num_input_tokens_seen": 50174400, + "step": 86420 + }, + { + "epoch": 12.872356270479594, + "grad_norm": 0.038330078125, + "learning_rate": 0.010185331786843672, + "loss": 0.7816, + "num_input_tokens_seen": 50177056, + "step": 86425 + }, + { + "epoch": 12.873100983020555, + "grad_norm": 0.0289306640625, + "learning_rate": 0.010183485337230446, + "loss": 0.7961, + "num_input_tokens_seen": 50179872, + "step": 86430 + }, + { + "epoch": 12.873845695561513, + "grad_norm": 0.0576171875, + "learning_rate": 0.010181638968987342, + "loss": 0.8054, + "num_input_tokens_seen": 50182464, + "step": 86435 + }, + { + "epoch": 12.874590408102472, + "grad_norm": 0.029052734375, + "learning_rate": 0.010179792682145543, + "loss": 0.7869, + "num_input_tokens_seen": 50185280, + "step": 86440 + }, + { + "epoch": 12.875335120643431, + "grad_norm": 0.055419921875, + "learning_rate": 0.010177946476736257, + "loss": 0.7993, + "num_input_tokens_seen": 50188224, + "step": 86445 + }, + { + "epoch": 12.876079833184392, + "grad_norm": 0.031982421875, + "learning_rate": 0.010176100352790662, + "loss": 0.7755, + "num_input_tokens_seen": 50191104, + "step": 86450 + }, + { + "epoch": 12.87682454572535, + "grad_norm": 0.031494140625, + "learning_rate": 0.010174254310339945, + "loss": 0.7894, + "num_input_tokens_seen": 50193856, + "step": 86455 + }, + { + "epoch": 12.877569258266309, + "grad_norm": 0.032470703125, + "learning_rate": 0.0101724083494153, + "loss": 0.804, + "num_input_tokens_seen": 50196704, + "step": 86460 + }, + { + "epoch": 12.878313970807268, + "grad_norm": 0.043212890625, + "learning_rate": 0.010170562470047908, + "loss": 0.7981, + "num_input_tokens_seen": 50199584, + "step": 86465 + }, + { + "epoch": 12.879058683348228, + "grad_norm": 0.045166015625, + "learning_rate": 0.010168716672268957, + "loss": 0.7835, + "num_input_tokens_seen": 50202464, + "step": 86470 + }, + { + "epoch": 12.879803395889187, + "grad_norm": 0.041015625, + "learning_rate": 0.010166870956109622, + "loss": 0.785, + "num_input_tokens_seen": 50205184, + "step": 86475 + }, + { + "epoch": 12.880548108430146, + "grad_norm": 0.04296875, + "learning_rate": 0.010165025321601096, + "loss": 0.7975, + "num_input_tokens_seen": 50208000, + "step": 86480 + }, + { + "epoch": 12.881292820971105, + "grad_norm": 0.056396484375, + "learning_rate": 0.010163179768774552, + "loss": 0.7976, + "num_input_tokens_seen": 50210752, + "step": 86485 + }, + { + "epoch": 12.882037533512065, + "grad_norm": 0.052490234375, + "learning_rate": 0.010161334297661166, + "loss": 0.7859, + "num_input_tokens_seen": 50213792, + "step": 86490 + }, + { + "epoch": 12.882782246053024, + "grad_norm": 0.037841796875, + "learning_rate": 0.010159488908292125, + "loss": 0.8001, + "num_input_tokens_seen": 50216640, + "step": 86495 + }, + { + "epoch": 12.883526958593983, + "grad_norm": 0.049072265625, + "learning_rate": 0.010157643600698594, + "loss": 0.8537, + "num_input_tokens_seen": 50219520, + "step": 86500 + }, + { + "epoch": 12.884271671134941, + "grad_norm": 0.08447265625, + "learning_rate": 0.010155798374911758, + "loss": 0.8173, + "num_input_tokens_seen": 50222272, + "step": 86505 + }, + { + "epoch": 12.8850163836759, + "grad_norm": 0.0294189453125, + "learning_rate": 0.010153953230962777, + "loss": 0.7853, + "num_input_tokens_seen": 50225152, + "step": 86510 + }, + { + "epoch": 12.88576109621686, + "grad_norm": 0.041259765625, + "learning_rate": 0.010152108168882838, + "loss": 0.7976, + "num_input_tokens_seen": 50228160, + "step": 86515 + }, + { + "epoch": 12.88650580875782, + "grad_norm": 0.030029296875, + "learning_rate": 0.010150263188703101, + "loss": 0.7875, + "num_input_tokens_seen": 50231168, + "step": 86520 + }, + { + "epoch": 12.887250521298778, + "grad_norm": 0.0284423828125, + "learning_rate": 0.01014841829045474, + "loss": 0.8005, + "num_input_tokens_seen": 50233760, + "step": 86525 + }, + { + "epoch": 12.887995233839739, + "grad_norm": 0.0306396484375, + "learning_rate": 0.010146573474168922, + "loss": 0.7868, + "num_input_tokens_seen": 50236544, + "step": 86530 + }, + { + "epoch": 12.888739946380698, + "grad_norm": 0.0218505859375, + "learning_rate": 0.010144728739876811, + "loss": 0.8006, + "num_input_tokens_seen": 50239328, + "step": 86535 + }, + { + "epoch": 12.889484658921656, + "grad_norm": 0.03076171875, + "learning_rate": 0.010142884087609575, + "loss": 0.7905, + "num_input_tokens_seen": 50242208, + "step": 86540 + }, + { + "epoch": 12.890229371462615, + "grad_norm": 0.033935546875, + "learning_rate": 0.010141039517398368, + "loss": 0.8009, + "num_input_tokens_seen": 50245248, + "step": 86545 + }, + { + "epoch": 12.890974084003574, + "grad_norm": 0.061767578125, + "learning_rate": 0.01013919502927437, + "loss": 0.8002, + "num_input_tokens_seen": 50248480, + "step": 86550 + }, + { + "epoch": 12.891718796544534, + "grad_norm": 0.026611328125, + "learning_rate": 0.010137350623268726, + "loss": 0.7795, + "num_input_tokens_seen": 50251072, + "step": 86555 + }, + { + "epoch": 12.892463509085493, + "grad_norm": 0.046630859375, + "learning_rate": 0.010135506299412607, + "loss": 0.7962, + "num_input_tokens_seen": 50254208, + "step": 86560 + }, + { + "epoch": 12.893208221626452, + "grad_norm": 0.046630859375, + "learning_rate": 0.010133662057737159, + "loss": 0.7999, + "num_input_tokens_seen": 50257280, + "step": 86565 + }, + { + "epoch": 12.89395293416741, + "grad_norm": 0.05517578125, + "learning_rate": 0.010131817898273545, + "loss": 0.8052, + "num_input_tokens_seen": 50260064, + "step": 86570 + }, + { + "epoch": 12.894697646708371, + "grad_norm": 0.03955078125, + "learning_rate": 0.010129973821052922, + "loss": 0.8034, + "num_input_tokens_seen": 50262816, + "step": 86575 + }, + { + "epoch": 12.89544235924933, + "grad_norm": 0.058837890625, + "learning_rate": 0.010128129826106438, + "loss": 0.7831, + "num_input_tokens_seen": 50265984, + "step": 86580 + }, + { + "epoch": 12.896187071790289, + "grad_norm": 0.0211181640625, + "learning_rate": 0.010126285913465254, + "loss": 0.8022, + "num_input_tokens_seen": 50268832, + "step": 86585 + }, + { + "epoch": 12.896931784331247, + "grad_norm": 0.05322265625, + "learning_rate": 0.010124442083160512, + "loss": 0.7785, + "num_input_tokens_seen": 50271648, + "step": 86590 + }, + { + "epoch": 12.897676496872208, + "grad_norm": 0.042724609375, + "learning_rate": 0.010122598335223372, + "loss": 0.7873, + "num_input_tokens_seen": 50274560, + "step": 86595 + }, + { + "epoch": 12.898421209413167, + "grad_norm": 0.031005859375, + "learning_rate": 0.010120754669684965, + "loss": 0.8028, + "num_input_tokens_seen": 50277408, + "step": 86600 + }, + { + "epoch": 12.899165921954125, + "grad_norm": 0.046142578125, + "learning_rate": 0.010118911086576457, + "loss": 0.8004, + "num_input_tokens_seen": 50280320, + "step": 86605 + }, + { + "epoch": 12.899910634495084, + "grad_norm": 0.050537109375, + "learning_rate": 0.010117067585928988, + "loss": 0.7924, + "num_input_tokens_seen": 50282816, + "step": 86610 + }, + { + "epoch": 12.900655347036045, + "grad_norm": 0.03125, + "learning_rate": 0.01011522416777369, + "loss": 0.8039, + "num_input_tokens_seen": 50285504, + "step": 86615 + }, + { + "epoch": 12.901400059577004, + "grad_norm": 0.03173828125, + "learning_rate": 0.010113380832141722, + "loss": 0.7877, + "num_input_tokens_seen": 50288096, + "step": 86620 + }, + { + "epoch": 12.902144772117962, + "grad_norm": 0.0390625, + "learning_rate": 0.010111537579064215, + "loss": 0.816, + "num_input_tokens_seen": 50290912, + "step": 86625 + }, + { + "epoch": 12.902889484658921, + "grad_norm": 0.03076171875, + "learning_rate": 0.010109694408572316, + "loss": 0.7988, + "num_input_tokens_seen": 50293696, + "step": 86630 + }, + { + "epoch": 12.903634197199882, + "grad_norm": 0.048095703125, + "learning_rate": 0.010107851320697152, + "loss": 0.8234, + "num_input_tokens_seen": 50296576, + "step": 86635 + }, + { + "epoch": 12.90437890974084, + "grad_norm": 0.02294921875, + "learning_rate": 0.010106008315469876, + "loss": 0.7911, + "num_input_tokens_seen": 50299424, + "step": 86640 + }, + { + "epoch": 12.905123622281799, + "grad_norm": 0.046142578125, + "learning_rate": 0.010104165392921614, + "loss": 0.8107, + "num_input_tokens_seen": 50302208, + "step": 86645 + }, + { + "epoch": 12.905868334822758, + "grad_norm": 0.0419921875, + "learning_rate": 0.010102322553083498, + "loss": 0.7866, + "num_input_tokens_seen": 50305248, + "step": 86650 + }, + { + "epoch": 12.906613047363718, + "grad_norm": 0.046875, + "learning_rate": 0.010100479795986668, + "loss": 0.7917, + "num_input_tokens_seen": 50308096, + "step": 86655 + }, + { + "epoch": 12.907357759904677, + "grad_norm": 0.030029296875, + "learning_rate": 0.010098637121662252, + "loss": 0.793, + "num_input_tokens_seen": 50310816, + "step": 86660 + }, + { + "epoch": 12.908102472445636, + "grad_norm": 0.02783203125, + "learning_rate": 0.01009679453014138, + "loss": 0.7928, + "num_input_tokens_seen": 50313568, + "step": 86665 + }, + { + "epoch": 12.908847184986595, + "grad_norm": 0.04833984375, + "learning_rate": 0.010094952021455177, + "loss": 0.8105, + "num_input_tokens_seen": 50316736, + "step": 86670 + }, + { + "epoch": 12.909591897527555, + "grad_norm": 0.024658203125, + "learning_rate": 0.010093109595634779, + "loss": 0.7879, + "num_input_tokens_seen": 50319744, + "step": 86675 + }, + { + "epoch": 12.910336610068514, + "grad_norm": 0.12890625, + "learning_rate": 0.010091267252711303, + "loss": 0.8059, + "num_input_tokens_seen": 50322368, + "step": 86680 + }, + { + "epoch": 12.911081322609473, + "grad_norm": 0.039794921875, + "learning_rate": 0.010089424992715881, + "loss": 0.7735, + "num_input_tokens_seen": 50325280, + "step": 86685 + }, + { + "epoch": 12.911826035150431, + "grad_norm": 0.0361328125, + "learning_rate": 0.010087582815679634, + "loss": 0.8078, + "num_input_tokens_seen": 50328224, + "step": 86690 + }, + { + "epoch": 12.91257074769139, + "grad_norm": 0.05126953125, + "learning_rate": 0.010085740721633681, + "loss": 0.7833, + "num_input_tokens_seen": 50331232, + "step": 86695 + }, + { + "epoch": 12.91331546023235, + "grad_norm": 0.054931640625, + "learning_rate": 0.010083898710609147, + "loss": 0.7872, + "num_input_tokens_seen": 50334144, + "step": 86700 + }, + { + "epoch": 12.91406017277331, + "grad_norm": 0.052734375, + "learning_rate": 0.01008205678263714, + "loss": 0.7939, + "num_input_tokens_seen": 50336896, + "step": 86705 + }, + { + "epoch": 12.914804885314268, + "grad_norm": 0.0654296875, + "learning_rate": 0.010080214937748794, + "loss": 0.8216, + "num_input_tokens_seen": 50339968, + "step": 86710 + }, + { + "epoch": 12.915549597855229, + "grad_norm": 0.0218505859375, + "learning_rate": 0.010078373175975206, + "loss": 0.7853, + "num_input_tokens_seen": 50342912, + "step": 86715 + }, + { + "epoch": 12.916294310396188, + "grad_norm": 0.0281982421875, + "learning_rate": 0.010076531497347512, + "loss": 0.7927, + "num_input_tokens_seen": 50345728, + "step": 86720 + }, + { + "epoch": 12.917039022937146, + "grad_norm": 0.06884765625, + "learning_rate": 0.010074689901896812, + "loss": 0.7765, + "num_input_tokens_seen": 50348928, + "step": 86725 + }, + { + "epoch": 12.917783735478105, + "grad_norm": 0.031494140625, + "learning_rate": 0.010072848389654217, + "loss": 0.8018, + "num_input_tokens_seen": 50351744, + "step": 86730 + }, + { + "epoch": 12.918528448019064, + "grad_norm": 0.039306640625, + "learning_rate": 0.010071006960650844, + "loss": 0.7738, + "num_input_tokens_seen": 50354528, + "step": 86735 + }, + { + "epoch": 12.919273160560024, + "grad_norm": 0.031494140625, + "learning_rate": 0.010069165614917793, + "loss": 0.7976, + "num_input_tokens_seen": 50357472, + "step": 86740 + }, + { + "epoch": 12.920017873100983, + "grad_norm": 0.0311279296875, + "learning_rate": 0.010067324352486183, + "loss": 0.7931, + "num_input_tokens_seen": 50360512, + "step": 86745 + }, + { + "epoch": 12.920762585641942, + "grad_norm": 0.056640625, + "learning_rate": 0.010065483173387109, + "loss": 0.7786, + "num_input_tokens_seen": 50363008, + "step": 86750 + }, + { + "epoch": 12.9215072981829, + "grad_norm": 0.0654296875, + "learning_rate": 0.010063642077651687, + "loss": 0.7793, + "num_input_tokens_seen": 50365728, + "step": 86755 + }, + { + "epoch": 12.922252010723861, + "grad_norm": 0.032470703125, + "learning_rate": 0.010061801065311012, + "loss": 0.8055, + "num_input_tokens_seen": 50368640, + "step": 86760 + }, + { + "epoch": 12.92299672326482, + "grad_norm": 0.37890625, + "learning_rate": 0.010059960136396193, + "loss": 0.7955, + "num_input_tokens_seen": 50371904, + "step": 86765 + }, + { + "epoch": 12.923741435805779, + "grad_norm": 0.05322265625, + "learning_rate": 0.010058119290938326, + "loss": 0.7667, + "num_input_tokens_seen": 50374784, + "step": 86770 + }, + { + "epoch": 12.924486148346737, + "grad_norm": 0.037109375, + "learning_rate": 0.010056278528968506, + "loss": 0.8254, + "num_input_tokens_seen": 50377600, + "step": 86775 + }, + { + "epoch": 12.925230860887698, + "grad_norm": 0.034423828125, + "learning_rate": 0.010054437850517841, + "loss": 0.8143, + "num_input_tokens_seen": 50380736, + "step": 86780 + }, + { + "epoch": 12.925975573428657, + "grad_norm": 0.046630859375, + "learning_rate": 0.010052597255617414, + "loss": 0.775, + "num_input_tokens_seen": 50383968, + "step": 86785 + }, + { + "epoch": 12.926720285969616, + "grad_norm": 0.02978515625, + "learning_rate": 0.010050756744298337, + "loss": 0.7936, + "num_input_tokens_seen": 50386720, + "step": 86790 + }, + { + "epoch": 12.927464998510574, + "grad_norm": 0.032958984375, + "learning_rate": 0.01004891631659169, + "loss": 0.8066, + "num_input_tokens_seen": 50389760, + "step": 86795 + }, + { + "epoch": 12.928209711051535, + "grad_norm": 0.03076171875, + "learning_rate": 0.010047075972528574, + "loss": 0.7926, + "num_input_tokens_seen": 50392416, + "step": 86800 + }, + { + "epoch": 12.928954423592494, + "grad_norm": 0.03125, + "learning_rate": 0.010045235712140076, + "loss": 0.7969, + "num_input_tokens_seen": 50395104, + "step": 86805 + }, + { + "epoch": 12.929699136133452, + "grad_norm": 0.0189208984375, + "learning_rate": 0.010043395535457276, + "loss": 0.8056, + "num_input_tokens_seen": 50397888, + "step": 86810 + }, + { + "epoch": 12.930443848674411, + "grad_norm": 0.064453125, + "learning_rate": 0.010041555442511278, + "loss": 0.8156, + "num_input_tokens_seen": 50400608, + "step": 86815 + }, + { + "epoch": 12.931188561215372, + "grad_norm": 0.0283203125, + "learning_rate": 0.010039715433333155, + "loss": 0.8077, + "num_input_tokens_seen": 50403680, + "step": 86820 + }, + { + "epoch": 12.93193327375633, + "grad_norm": 0.04443359375, + "learning_rate": 0.010037875507954009, + "loss": 0.7919, + "num_input_tokens_seen": 50406400, + "step": 86825 + }, + { + "epoch": 12.93267798629729, + "grad_norm": 0.0380859375, + "learning_rate": 0.010036035666404902, + "loss": 0.7872, + "num_input_tokens_seen": 50409472, + "step": 86830 + }, + { + "epoch": 12.933422698838248, + "grad_norm": 0.0322265625, + "learning_rate": 0.010034195908716936, + "loss": 0.7805, + "num_input_tokens_seen": 50412320, + "step": 86835 + }, + { + "epoch": 12.934167411379208, + "grad_norm": 0.031982421875, + "learning_rate": 0.010032356234921175, + "loss": 0.7968, + "num_input_tokens_seen": 50415072, + "step": 86840 + }, + { + "epoch": 12.934912123920167, + "grad_norm": 0.033935546875, + "learning_rate": 0.010030516645048714, + "loss": 0.8081, + "num_input_tokens_seen": 50417888, + "step": 86845 + }, + { + "epoch": 12.935656836461126, + "grad_norm": 0.027099609375, + "learning_rate": 0.010028677139130622, + "loss": 0.7863, + "num_input_tokens_seen": 50420832, + "step": 86850 + }, + { + "epoch": 12.936401549002085, + "grad_norm": 0.03857421875, + "learning_rate": 0.010026837717197971, + "loss": 0.808, + "num_input_tokens_seen": 50424064, + "step": 86855 + }, + { + "epoch": 12.937146261543045, + "grad_norm": 0.037109375, + "learning_rate": 0.01002499837928185, + "loss": 0.7742, + "num_input_tokens_seen": 50426944, + "step": 86860 + }, + { + "epoch": 12.937890974084004, + "grad_norm": 0.029052734375, + "learning_rate": 0.01002315912541332, + "loss": 0.7781, + "num_input_tokens_seen": 50429696, + "step": 86865 + }, + { + "epoch": 12.938635686624963, + "grad_norm": 0.02783203125, + "learning_rate": 0.010021319955623465, + "loss": 0.7958, + "num_input_tokens_seen": 50432320, + "step": 86870 + }, + { + "epoch": 12.939380399165922, + "grad_norm": 0.039794921875, + "learning_rate": 0.010019480869943339, + "loss": 0.7797, + "num_input_tokens_seen": 50435072, + "step": 86875 + }, + { + "epoch": 12.94012511170688, + "grad_norm": 0.030029296875, + "learning_rate": 0.010017641868404028, + "loss": 0.8066, + "num_input_tokens_seen": 50437984, + "step": 86880 + }, + { + "epoch": 12.94086982424784, + "grad_norm": 0.06591796875, + "learning_rate": 0.010015802951036596, + "loss": 0.8042, + "num_input_tokens_seen": 50440800, + "step": 86885 + }, + { + "epoch": 12.9416145367888, + "grad_norm": 0.053466796875, + "learning_rate": 0.010013964117872103, + "loss": 0.8032, + "num_input_tokens_seen": 50443808, + "step": 86890 + }, + { + "epoch": 12.942359249329758, + "grad_norm": 0.06689453125, + "learning_rate": 0.010012125368941619, + "loss": 0.8192, + "num_input_tokens_seen": 50447264, + "step": 86895 + }, + { + "epoch": 12.943103961870717, + "grad_norm": 0.203125, + "learning_rate": 0.010010286704276207, + "loss": 0.8307, + "num_input_tokens_seen": 50450144, + "step": 86900 + }, + { + "epoch": 12.943848674411678, + "grad_norm": 0.0693359375, + "learning_rate": 0.010008448123906933, + "loss": 0.7737, + "num_input_tokens_seen": 50452800, + "step": 86905 + }, + { + "epoch": 12.944593386952636, + "grad_norm": 0.0289306640625, + "learning_rate": 0.01000660962786485, + "loss": 0.8012, + "num_input_tokens_seen": 50455744, + "step": 86910 + }, + { + "epoch": 12.945338099493595, + "grad_norm": 0.0299072265625, + "learning_rate": 0.010004771216181027, + "loss": 0.8052, + "num_input_tokens_seen": 50458624, + "step": 86915 + }, + { + "epoch": 12.946082812034554, + "grad_norm": 0.0284423828125, + "learning_rate": 0.010002932888886514, + "loss": 0.7871, + "num_input_tokens_seen": 50461536, + "step": 86920 + }, + { + "epoch": 12.946827524575514, + "grad_norm": 0.07666015625, + "learning_rate": 0.010001094646012374, + "loss": 0.8134, + "num_input_tokens_seen": 50464640, + "step": 86925 + }, + { + "epoch": 12.947572237116473, + "grad_norm": 0.0380859375, + "learning_rate": 0.009999256487589658, + "loss": 0.8166, + "num_input_tokens_seen": 50467552, + "step": 86930 + }, + { + "epoch": 12.948316949657432, + "grad_norm": 0.034912109375, + "learning_rate": 0.009997418413649421, + "loss": 0.8004, + "num_input_tokens_seen": 50470656, + "step": 86935 + }, + { + "epoch": 12.94906166219839, + "grad_norm": 0.05126953125, + "learning_rate": 0.009995580424222718, + "loss": 0.8008, + "num_input_tokens_seen": 50473536, + "step": 86940 + }, + { + "epoch": 12.949806374739351, + "grad_norm": 0.033447265625, + "learning_rate": 0.009993742519340591, + "loss": 0.7891, + "num_input_tokens_seen": 50476480, + "step": 86945 + }, + { + "epoch": 12.95055108728031, + "grad_norm": 0.032958984375, + "learning_rate": 0.009991904699034103, + "loss": 0.8074, + "num_input_tokens_seen": 50479392, + "step": 86950 + }, + { + "epoch": 12.951295799821269, + "grad_norm": 0.0791015625, + "learning_rate": 0.00999006696333429, + "loss": 0.7989, + "num_input_tokens_seen": 50482432, + "step": 86955 + }, + { + "epoch": 12.952040512362228, + "grad_norm": 0.054931640625, + "learning_rate": 0.00998822931227221, + "loss": 0.8145, + "num_input_tokens_seen": 50485184, + "step": 86960 + }, + { + "epoch": 12.952785224903188, + "grad_norm": 0.0308837890625, + "learning_rate": 0.009986391745878898, + "loss": 0.8004, + "num_input_tokens_seen": 50488256, + "step": 86965 + }, + { + "epoch": 12.953529937444147, + "grad_norm": 0.06396484375, + "learning_rate": 0.009984554264185397, + "loss": 0.8066, + "num_input_tokens_seen": 50491136, + "step": 86970 + }, + { + "epoch": 12.954274649985106, + "grad_norm": 0.046142578125, + "learning_rate": 0.009982716867222763, + "loss": 0.7999, + "num_input_tokens_seen": 50494048, + "step": 86975 + }, + { + "epoch": 12.955019362526064, + "grad_norm": 0.0537109375, + "learning_rate": 0.00998087955502202, + "loss": 0.7969, + "num_input_tokens_seen": 50496864, + "step": 86980 + }, + { + "epoch": 12.955764075067025, + "grad_norm": 0.0361328125, + "learning_rate": 0.009979042327614223, + "loss": 0.7899, + "num_input_tokens_seen": 50500096, + "step": 86985 + }, + { + "epoch": 12.956508787607984, + "grad_norm": 0.06298828125, + "learning_rate": 0.009977205185030398, + "loss": 0.7847, + "num_input_tokens_seen": 50503136, + "step": 86990 + }, + { + "epoch": 12.957253500148942, + "grad_norm": 0.042236328125, + "learning_rate": 0.009975368127301591, + "loss": 0.8068, + "num_input_tokens_seen": 50506080, + "step": 86995 + }, + { + "epoch": 12.957998212689901, + "grad_norm": 0.046875, + "learning_rate": 0.009973531154458827, + "loss": 0.7859, + "num_input_tokens_seen": 50508800, + "step": 87000 + }, + { + "epoch": 12.958742925230862, + "grad_norm": 0.0419921875, + "learning_rate": 0.00997169426653315, + "loss": 0.8181, + "num_input_tokens_seen": 50511616, + "step": 87005 + }, + { + "epoch": 12.95948763777182, + "grad_norm": 0.048828125, + "learning_rate": 0.009969857463555587, + "loss": 0.7936, + "num_input_tokens_seen": 50514656, + "step": 87010 + }, + { + "epoch": 12.96023235031278, + "grad_norm": 0.01904296875, + "learning_rate": 0.009968020745557166, + "loss": 0.8083, + "num_input_tokens_seen": 50517568, + "step": 87015 + }, + { + "epoch": 12.960977062853738, + "grad_norm": 0.06103515625, + "learning_rate": 0.009966184112568926, + "loss": 0.7869, + "num_input_tokens_seen": 50520352, + "step": 87020 + }, + { + "epoch": 12.961721775394697, + "grad_norm": 0.0302734375, + "learning_rate": 0.009964347564621884, + "loss": 0.7896, + "num_input_tokens_seen": 50523296, + "step": 87025 + }, + { + "epoch": 12.962466487935657, + "grad_norm": 0.03369140625, + "learning_rate": 0.009962511101747076, + "loss": 0.7873, + "num_input_tokens_seen": 50526144, + "step": 87030 + }, + { + "epoch": 12.963211200476616, + "grad_norm": 0.02685546875, + "learning_rate": 0.009960674723975518, + "loss": 0.8012, + "num_input_tokens_seen": 50529120, + "step": 87035 + }, + { + "epoch": 12.963955913017575, + "grad_norm": 0.041259765625, + "learning_rate": 0.009958838431338244, + "loss": 0.794, + "num_input_tokens_seen": 50532064, + "step": 87040 + }, + { + "epoch": 12.964700625558535, + "grad_norm": 0.0291748046875, + "learning_rate": 0.009957002223866267, + "loss": 0.7943, + "num_input_tokens_seen": 50534816, + "step": 87045 + }, + { + "epoch": 12.965445338099494, + "grad_norm": 0.04443359375, + "learning_rate": 0.009955166101590614, + "loss": 0.802, + "num_input_tokens_seen": 50537568, + "step": 87050 + }, + { + "epoch": 12.966190050640453, + "grad_norm": 0.0311279296875, + "learning_rate": 0.009953330064542307, + "loss": 0.7944, + "num_input_tokens_seen": 50540512, + "step": 87055 + }, + { + "epoch": 12.966934763181412, + "grad_norm": 0.03466796875, + "learning_rate": 0.009951494112752353, + "loss": 0.8028, + "num_input_tokens_seen": 50543360, + "step": 87060 + }, + { + "epoch": 12.96767947572237, + "grad_norm": 0.02978515625, + "learning_rate": 0.00994965824625178, + "loss": 0.8184, + "num_input_tokens_seen": 50546368, + "step": 87065 + }, + { + "epoch": 12.96842418826333, + "grad_norm": 0.044921875, + "learning_rate": 0.00994782246507159, + "loss": 0.793, + "num_input_tokens_seen": 50549344, + "step": 87070 + }, + { + "epoch": 12.96916890080429, + "grad_norm": 0.0439453125, + "learning_rate": 0.009945986769242812, + "loss": 0.7839, + "num_input_tokens_seen": 50552192, + "step": 87075 + }, + { + "epoch": 12.969913613345248, + "grad_norm": 0.0458984375, + "learning_rate": 0.009944151158796443, + "loss": 0.7865, + "num_input_tokens_seen": 50554848, + "step": 87080 + }, + { + "epoch": 12.970658325886207, + "grad_norm": 0.028564453125, + "learning_rate": 0.009942315633763509, + "loss": 0.7908, + "num_input_tokens_seen": 50558080, + "step": 87085 + }, + { + "epoch": 12.971403038427168, + "grad_norm": 0.048828125, + "learning_rate": 0.009940480194175014, + "loss": 0.7649, + "num_input_tokens_seen": 50561280, + "step": 87090 + }, + { + "epoch": 12.972147750968126, + "grad_norm": 0.059814453125, + "learning_rate": 0.00993864484006196, + "loss": 0.7882, + "num_input_tokens_seen": 50564064, + "step": 87095 + }, + { + "epoch": 12.972892463509085, + "grad_norm": 0.054931640625, + "learning_rate": 0.009936809571455358, + "loss": 0.7913, + "num_input_tokens_seen": 50566912, + "step": 87100 + }, + { + "epoch": 12.973637176050044, + "grad_norm": 0.03173828125, + "learning_rate": 0.009934974388386208, + "loss": 0.7787, + "num_input_tokens_seen": 50570048, + "step": 87105 + }, + { + "epoch": 12.974381888591004, + "grad_norm": 0.0322265625, + "learning_rate": 0.009933139290885523, + "loss": 0.7912, + "num_input_tokens_seen": 50572832, + "step": 87110 + }, + { + "epoch": 12.975126601131963, + "grad_norm": 0.05126953125, + "learning_rate": 0.009931304278984294, + "loss": 0.8091, + "num_input_tokens_seen": 50575616, + "step": 87115 + }, + { + "epoch": 12.975871313672922, + "grad_norm": 0.0235595703125, + "learning_rate": 0.009929469352713534, + "loss": 0.7944, + "num_input_tokens_seen": 50578560, + "step": 87120 + }, + { + "epoch": 12.97661602621388, + "grad_norm": 0.0306396484375, + "learning_rate": 0.009927634512104232, + "loss": 0.8312, + "num_input_tokens_seen": 50581216, + "step": 87125 + }, + { + "epoch": 12.977360738754841, + "grad_norm": 0.0181884765625, + "learning_rate": 0.009925799757187393, + "loss": 0.8089, + "num_input_tokens_seen": 50583968, + "step": 87130 + }, + { + "epoch": 12.9781054512958, + "grad_norm": 0.050537109375, + "learning_rate": 0.009923965087994011, + "loss": 0.8004, + "num_input_tokens_seen": 50586752, + "step": 87135 + }, + { + "epoch": 12.978850163836759, + "grad_norm": 0.06298828125, + "learning_rate": 0.009922130504555074, + "loss": 0.7939, + "num_input_tokens_seen": 50589696, + "step": 87140 + }, + { + "epoch": 12.979594876377718, + "grad_norm": 0.04150390625, + "learning_rate": 0.009920296006901586, + "loss": 0.8163, + "num_input_tokens_seen": 50592544, + "step": 87145 + }, + { + "epoch": 12.980339588918678, + "grad_norm": 0.0211181640625, + "learning_rate": 0.009918461595064527, + "loss": 0.8656, + "num_input_tokens_seen": 50595680, + "step": 87150 + }, + { + "epoch": 12.981084301459637, + "grad_norm": 0.064453125, + "learning_rate": 0.009916627269074902, + "loss": 0.8094, + "num_input_tokens_seen": 50598688, + "step": 87155 + }, + { + "epoch": 12.981829014000596, + "grad_norm": 0.0224609375, + "learning_rate": 0.00991479302896369, + "loss": 0.7939, + "num_input_tokens_seen": 50601504, + "step": 87160 + }, + { + "epoch": 12.982573726541554, + "grad_norm": 0.059326171875, + "learning_rate": 0.009912958874761886, + "loss": 0.7979, + "num_input_tokens_seen": 50604640, + "step": 87165 + }, + { + "epoch": 12.983318439082515, + "grad_norm": 0.04736328125, + "learning_rate": 0.00991112480650047, + "loss": 0.7732, + "num_input_tokens_seen": 50607680, + "step": 87170 + }, + { + "epoch": 12.984063151623474, + "grad_norm": 0.030029296875, + "learning_rate": 0.009909290824210422, + "loss": 0.8293, + "num_input_tokens_seen": 50611488, + "step": 87175 + }, + { + "epoch": 12.984807864164432, + "grad_norm": 0.0208740234375, + "learning_rate": 0.009907456927922737, + "loss": 0.7882, + "num_input_tokens_seen": 50614496, + "step": 87180 + }, + { + "epoch": 12.985552576705391, + "grad_norm": 0.03125, + "learning_rate": 0.009905623117668386, + "loss": 0.7937, + "num_input_tokens_seen": 50617312, + "step": 87185 + }, + { + "epoch": 12.986297289246352, + "grad_norm": 0.032470703125, + "learning_rate": 0.009903789393478362, + "loss": 0.8011, + "num_input_tokens_seen": 50620256, + "step": 87190 + }, + { + "epoch": 12.98704200178731, + "grad_norm": 0.150390625, + "learning_rate": 0.009901955755383631, + "loss": 0.8346, + "num_input_tokens_seen": 50623072, + "step": 87195 + }, + { + "epoch": 12.98778671432827, + "grad_norm": 0.0517578125, + "learning_rate": 0.009900122203415183, + "loss": 0.82, + "num_input_tokens_seen": 50625568, + "step": 87200 + }, + { + "epoch": 12.988531426869228, + "grad_norm": 0.01806640625, + "learning_rate": 0.009898288737603978, + "loss": 0.7856, + "num_input_tokens_seen": 50628800, + "step": 87205 + }, + { + "epoch": 12.989276139410187, + "grad_norm": 0.0225830078125, + "learning_rate": 0.009896455357981006, + "loss": 0.7967, + "num_input_tokens_seen": 50631488, + "step": 87210 + }, + { + "epoch": 12.990020851951147, + "grad_norm": 0.0299072265625, + "learning_rate": 0.009894622064577235, + "loss": 0.78, + "num_input_tokens_seen": 50634176, + "step": 87215 + }, + { + "epoch": 12.990765564492106, + "grad_norm": 0.0289306640625, + "learning_rate": 0.00989278885742363, + "loss": 0.8183, + "num_input_tokens_seen": 50637088, + "step": 87220 + }, + { + "epoch": 12.991510277033065, + "grad_norm": 0.04248046875, + "learning_rate": 0.009890955736551169, + "loss": 0.7921, + "num_input_tokens_seen": 50639968, + "step": 87225 + }, + { + "epoch": 12.992254989574025, + "grad_norm": 0.060791015625, + "learning_rate": 0.009889122701990816, + "loss": 0.8235, + "num_input_tokens_seen": 50642976, + "step": 87230 + }, + { + "epoch": 12.992999702114984, + "grad_norm": 0.046142578125, + "learning_rate": 0.009887289753773544, + "loss": 0.7978, + "num_input_tokens_seen": 50646144, + "step": 87235 + }, + { + "epoch": 12.993744414655943, + "grad_norm": 0.02001953125, + "learning_rate": 0.00988545689193031, + "loss": 0.802, + "num_input_tokens_seen": 50649280, + "step": 87240 + }, + { + "epoch": 12.994489127196902, + "grad_norm": 0.034423828125, + "learning_rate": 0.00988362411649209, + "loss": 0.7916, + "num_input_tokens_seen": 50651808, + "step": 87245 + }, + { + "epoch": 12.99523383973786, + "grad_norm": 0.018798828125, + "learning_rate": 0.009881791427489841, + "loss": 0.7918, + "num_input_tokens_seen": 50654912, + "step": 87250 + }, + { + "epoch": 12.995978552278821, + "grad_norm": 0.037353515625, + "learning_rate": 0.009879958824954519, + "loss": 0.7858, + "num_input_tokens_seen": 50657600, + "step": 87255 + }, + { + "epoch": 12.99672326481978, + "grad_norm": 0.034912109375, + "learning_rate": 0.009878126308917089, + "loss": 0.7949, + "num_input_tokens_seen": 50660512, + "step": 87260 + }, + { + "epoch": 12.997467977360738, + "grad_norm": 0.03515625, + "learning_rate": 0.00987629387940851, + "loss": 0.7994, + "num_input_tokens_seen": 50663392, + "step": 87265 + }, + { + "epoch": 12.998212689901697, + "grad_norm": 0.022216796875, + "learning_rate": 0.009874461536459744, + "loss": 0.8055, + "num_input_tokens_seen": 50666592, + "step": 87270 + }, + { + "epoch": 12.998957402442658, + "grad_norm": 0.02880859375, + "learning_rate": 0.009872629280101733, + "loss": 0.7956, + "num_input_tokens_seen": 50669664, + "step": 87275 + }, + { + "epoch": 12.999702114983616, + "grad_norm": 0.0556640625, + "learning_rate": 0.009870797110365441, + "loss": 0.8272, + "num_input_tokens_seen": 50672352, + "step": 87280 + }, + { + "epoch": 13.0, + "eval_loss": 0.799648106098175, + "eval_runtime": 70.5802, + "eval_samples_per_second": 42.278, + "eval_steps_per_second": 10.57, + "num_input_tokens_seen": 50673016, + "step": 87282 + }, + { + "epoch": 13.000446827524575, + "grad_norm": 0.02197265625, + "learning_rate": 0.009868965027281821, + "loss": 0.8028, + "num_input_tokens_seen": 50675000, + "step": 87285 + }, + { + "epoch": 13.001191540065534, + "grad_norm": 0.03173828125, + "learning_rate": 0.009867133030881823, + "loss": 0.8006, + "num_input_tokens_seen": 50678072, + "step": 87290 + }, + { + "epoch": 13.001936252606495, + "grad_norm": 0.034423828125, + "learning_rate": 0.009865301121196397, + "loss": 0.8331, + "num_input_tokens_seen": 50680952, + "step": 87295 + }, + { + "epoch": 13.002680965147453, + "grad_norm": 0.037353515625, + "learning_rate": 0.009863469298256482, + "loss": 0.8074, + "num_input_tokens_seen": 50683928, + "step": 87300 + }, + { + "epoch": 13.003425677688412, + "grad_norm": 0.0361328125, + "learning_rate": 0.00986163756209304, + "loss": 0.791, + "num_input_tokens_seen": 50687096, + "step": 87305 + }, + { + "epoch": 13.00417039022937, + "grad_norm": 0.03369140625, + "learning_rate": 0.009859805912737002, + "loss": 0.7843, + "num_input_tokens_seen": 50689848, + "step": 87310 + }, + { + "epoch": 13.004915102770331, + "grad_norm": 0.041259765625, + "learning_rate": 0.009857974350219328, + "loss": 0.7911, + "num_input_tokens_seen": 50692632, + "step": 87315 + }, + { + "epoch": 13.00565981531129, + "grad_norm": 0.03125, + "learning_rate": 0.009856142874570943, + "loss": 0.7854, + "num_input_tokens_seen": 50695576, + "step": 87320 + }, + { + "epoch": 13.006404527852249, + "grad_norm": 0.0380859375, + "learning_rate": 0.009854311485822801, + "loss": 0.7974, + "num_input_tokens_seen": 50698872, + "step": 87325 + }, + { + "epoch": 13.007149240393208, + "grad_norm": 0.02783203125, + "learning_rate": 0.009852480184005839, + "loss": 0.8212, + "num_input_tokens_seen": 50701752, + "step": 87330 + }, + { + "epoch": 13.007893952934168, + "grad_norm": 0.05810546875, + "learning_rate": 0.009850648969150985, + "loss": 0.7837, + "num_input_tokens_seen": 50704632, + "step": 87335 + }, + { + "epoch": 13.008638665475127, + "grad_norm": 0.031494140625, + "learning_rate": 0.009848817841289189, + "loss": 0.7953, + "num_input_tokens_seen": 50707448, + "step": 87340 + }, + { + "epoch": 13.009383378016086, + "grad_norm": 0.03076171875, + "learning_rate": 0.009846986800451374, + "loss": 0.8014, + "num_input_tokens_seen": 50710424, + "step": 87345 + }, + { + "epoch": 13.010128090557044, + "grad_norm": 0.03662109375, + "learning_rate": 0.009845155846668485, + "loss": 0.799, + "num_input_tokens_seen": 50713400, + "step": 87350 + }, + { + "epoch": 13.010872803098005, + "grad_norm": 0.035400390625, + "learning_rate": 0.009843324979971448, + "loss": 0.7929, + "num_input_tokens_seen": 50716568, + "step": 87355 + }, + { + "epoch": 13.011617515638964, + "grad_norm": 0.017578125, + "learning_rate": 0.009841494200391194, + "loss": 0.7909, + "num_input_tokens_seen": 50719352, + "step": 87360 + }, + { + "epoch": 13.012362228179922, + "grad_norm": 0.0211181640625, + "learning_rate": 0.009839663507958651, + "loss": 0.8108, + "num_input_tokens_seen": 50722136, + "step": 87365 + }, + { + "epoch": 13.013106940720881, + "grad_norm": 0.0306396484375, + "learning_rate": 0.00983783290270475, + "loss": 0.8084, + "num_input_tokens_seen": 50725176, + "step": 87370 + }, + { + "epoch": 13.013851653261842, + "grad_norm": 0.019775390625, + "learning_rate": 0.009836002384660418, + "loss": 0.823, + "num_input_tokens_seen": 50727992, + "step": 87375 + }, + { + "epoch": 13.0145963658028, + "grad_norm": 0.037841796875, + "learning_rate": 0.009834171953856573, + "loss": 0.8186, + "num_input_tokens_seen": 50730648, + "step": 87380 + }, + { + "epoch": 13.01534107834376, + "grad_norm": 0.0390625, + "learning_rate": 0.009832341610324145, + "loss": 0.8014, + "num_input_tokens_seen": 50733592, + "step": 87385 + }, + { + "epoch": 13.016085790884718, + "grad_norm": 0.032470703125, + "learning_rate": 0.009830511354094051, + "loss": 0.8024, + "num_input_tokens_seen": 50736536, + "step": 87390 + }, + { + "epoch": 13.016830503425677, + "grad_norm": 0.0400390625, + "learning_rate": 0.00982868118519722, + "loss": 0.7949, + "num_input_tokens_seen": 50739512, + "step": 87395 + }, + { + "epoch": 13.017575215966637, + "grad_norm": 0.02685546875, + "learning_rate": 0.009826851103664554, + "loss": 0.8237, + "num_input_tokens_seen": 50742168, + "step": 87400 + }, + { + "epoch": 13.018319928507596, + "grad_norm": 0.041748046875, + "learning_rate": 0.009825021109526988, + "loss": 0.7924, + "num_input_tokens_seen": 50745048, + "step": 87405 + }, + { + "epoch": 13.019064641048555, + "grad_norm": 0.1494140625, + "learning_rate": 0.009823191202815431, + "loss": 0.8076, + "num_input_tokens_seen": 50747992, + "step": 87410 + }, + { + "epoch": 13.019809353589514, + "grad_norm": 0.03125, + "learning_rate": 0.009821361383560793, + "loss": 0.8093, + "num_input_tokens_seen": 50750744, + "step": 87415 + }, + { + "epoch": 13.020554066130474, + "grad_norm": 0.034423828125, + "learning_rate": 0.009819531651793995, + "loss": 0.8087, + "num_input_tokens_seen": 50753624, + "step": 87420 + }, + { + "epoch": 13.021298778671433, + "grad_norm": 0.030517578125, + "learning_rate": 0.009817702007545942, + "loss": 0.8037, + "num_input_tokens_seen": 50756184, + "step": 87425 + }, + { + "epoch": 13.022043491212392, + "grad_norm": 0.042724609375, + "learning_rate": 0.009815872450847552, + "loss": 0.7922, + "num_input_tokens_seen": 50759288, + "step": 87430 + }, + { + "epoch": 13.02278820375335, + "grad_norm": 0.044189453125, + "learning_rate": 0.009814042981729717, + "loss": 0.8037, + "num_input_tokens_seen": 50762456, + "step": 87435 + }, + { + "epoch": 13.023532916294311, + "grad_norm": 0.04833984375, + "learning_rate": 0.009812213600223364, + "loss": 0.8061, + "num_input_tokens_seen": 50765368, + "step": 87440 + }, + { + "epoch": 13.02427762883527, + "grad_norm": 0.03125, + "learning_rate": 0.009810384306359384, + "loss": 0.79, + "num_input_tokens_seen": 50768280, + "step": 87445 + }, + { + "epoch": 13.025022341376228, + "grad_norm": 0.04248046875, + "learning_rate": 0.009808555100168693, + "loss": 0.7905, + "num_input_tokens_seen": 50771000, + "step": 87450 + }, + { + "epoch": 13.025767053917187, + "grad_norm": 0.031982421875, + "learning_rate": 0.009806725981682186, + "loss": 0.7916, + "num_input_tokens_seen": 50773688, + "step": 87455 + }, + { + "epoch": 13.026511766458148, + "grad_norm": 0.0537109375, + "learning_rate": 0.009804896950930762, + "loss": 0.7958, + "num_input_tokens_seen": 50776632, + "step": 87460 + }, + { + "epoch": 13.027256478999107, + "grad_norm": 0.03662109375, + "learning_rate": 0.009803068007945328, + "loss": 0.809, + "num_input_tokens_seen": 50779384, + "step": 87465 + }, + { + "epoch": 13.028001191540065, + "grad_norm": 0.026611328125, + "learning_rate": 0.00980123915275677, + "loss": 0.8002, + "num_input_tokens_seen": 50782328, + "step": 87470 + }, + { + "epoch": 13.028745904081024, + "grad_norm": 0.032470703125, + "learning_rate": 0.009799410385396001, + "loss": 0.7674, + "num_input_tokens_seen": 50784984, + "step": 87475 + }, + { + "epoch": 13.029490616621985, + "grad_norm": 0.03271484375, + "learning_rate": 0.0097975817058939, + "loss": 0.8022, + "num_input_tokens_seen": 50787928, + "step": 87480 + }, + { + "epoch": 13.030235329162943, + "grad_norm": 0.037109375, + "learning_rate": 0.009795753114281379, + "loss": 0.7846, + "num_input_tokens_seen": 50790680, + "step": 87485 + }, + { + "epoch": 13.030980041703902, + "grad_norm": 0.030517578125, + "learning_rate": 0.009793924610589316, + "loss": 0.7977, + "num_input_tokens_seen": 50793752, + "step": 87490 + }, + { + "epoch": 13.03172475424486, + "grad_norm": 0.034912109375, + "learning_rate": 0.009792096194848603, + "loss": 0.7908, + "num_input_tokens_seen": 50796856, + "step": 87495 + }, + { + "epoch": 13.032469466785821, + "grad_norm": 0.0400390625, + "learning_rate": 0.009790267867090132, + "loss": 0.8011, + "num_input_tokens_seen": 50799512, + "step": 87500 + }, + { + "epoch": 13.03321417932678, + "grad_norm": 0.03271484375, + "learning_rate": 0.009788439627344788, + "loss": 0.784, + "num_input_tokens_seen": 50802616, + "step": 87505 + }, + { + "epoch": 13.033958891867739, + "grad_norm": 0.03662109375, + "learning_rate": 0.009786611475643468, + "loss": 0.8131, + "num_input_tokens_seen": 50805816, + "step": 87510 + }, + { + "epoch": 13.034703604408698, + "grad_norm": 0.039794921875, + "learning_rate": 0.009784783412017037, + "loss": 0.7856, + "num_input_tokens_seen": 50808472, + "step": 87515 + }, + { + "epoch": 13.035448316949658, + "grad_norm": 0.019775390625, + "learning_rate": 0.0097829554364964, + "loss": 0.8022, + "num_input_tokens_seen": 50811448, + "step": 87520 + }, + { + "epoch": 13.036193029490617, + "grad_norm": 0.044189453125, + "learning_rate": 0.009781127549112423, + "loss": 0.7958, + "num_input_tokens_seen": 50814520, + "step": 87525 + }, + { + "epoch": 13.036937742031576, + "grad_norm": 0.031982421875, + "learning_rate": 0.009779299749895996, + "loss": 0.8019, + "num_input_tokens_seen": 50817272, + "step": 87530 + }, + { + "epoch": 13.037682454572534, + "grad_norm": 0.03125, + "learning_rate": 0.009777472038877992, + "loss": 0.8044, + "num_input_tokens_seen": 50820248, + "step": 87535 + }, + { + "epoch": 13.038427167113495, + "grad_norm": 0.0306396484375, + "learning_rate": 0.009775644416089286, + "loss": 0.7887, + "num_input_tokens_seen": 50823288, + "step": 87540 + }, + { + "epoch": 13.039171879654454, + "grad_norm": 0.0311279296875, + "learning_rate": 0.009773816881560761, + "loss": 0.8122, + "num_input_tokens_seen": 50826200, + "step": 87545 + }, + { + "epoch": 13.039916592195413, + "grad_norm": 0.037353515625, + "learning_rate": 0.00977198943532329, + "loss": 0.8493, + "num_input_tokens_seen": 50829272, + "step": 87550 + }, + { + "epoch": 13.040661304736371, + "grad_norm": 0.030029296875, + "learning_rate": 0.009770162077407739, + "loss": 0.7952, + "num_input_tokens_seen": 50831864, + "step": 87555 + }, + { + "epoch": 13.041406017277332, + "grad_norm": 0.03173828125, + "learning_rate": 0.009768334807844987, + "loss": 0.8015, + "num_input_tokens_seen": 50834904, + "step": 87560 + }, + { + "epoch": 13.04215072981829, + "grad_norm": 0.048095703125, + "learning_rate": 0.009766507626665904, + "loss": 0.7981, + "num_input_tokens_seen": 50837720, + "step": 87565 + }, + { + "epoch": 13.04289544235925, + "grad_norm": 0.0286865234375, + "learning_rate": 0.009764680533901355, + "loss": 0.8004, + "num_input_tokens_seen": 50840568, + "step": 87570 + }, + { + "epoch": 13.043640154900208, + "grad_norm": 0.043701171875, + "learning_rate": 0.009762853529582201, + "loss": 0.8112, + "num_input_tokens_seen": 50843640, + "step": 87575 + }, + { + "epoch": 13.044384867441167, + "grad_norm": 0.032958984375, + "learning_rate": 0.009761026613739319, + "loss": 0.8016, + "num_input_tokens_seen": 50846712, + "step": 87580 + }, + { + "epoch": 13.045129579982127, + "grad_norm": 0.031982421875, + "learning_rate": 0.009759199786403569, + "loss": 0.7854, + "num_input_tokens_seen": 50849752, + "step": 87585 + }, + { + "epoch": 13.045874292523086, + "grad_norm": 0.04296875, + "learning_rate": 0.009757373047605806, + "loss": 0.8028, + "num_input_tokens_seen": 50852664, + "step": 87590 + }, + { + "epoch": 13.046619005064045, + "grad_norm": 0.04052734375, + "learning_rate": 0.0097555463973769, + "loss": 0.8058, + "num_input_tokens_seen": 50855352, + "step": 87595 + }, + { + "epoch": 13.047363717605004, + "grad_norm": 0.02978515625, + "learning_rate": 0.009753719835747713, + "loss": 0.8016, + "num_input_tokens_seen": 50858168, + "step": 87600 + }, + { + "epoch": 13.048108430145964, + "grad_norm": 0.029541015625, + "learning_rate": 0.009751893362749089, + "loss": 0.7897, + "num_input_tokens_seen": 50861304, + "step": 87605 + }, + { + "epoch": 13.048853142686923, + "grad_norm": 0.035400390625, + "learning_rate": 0.009750066978411898, + "loss": 0.8121, + "num_input_tokens_seen": 50864152, + "step": 87610 + }, + { + "epoch": 13.049597855227882, + "grad_norm": 0.0294189453125, + "learning_rate": 0.00974824068276699, + "loss": 0.8084, + "num_input_tokens_seen": 50867224, + "step": 87615 + }, + { + "epoch": 13.05034256776884, + "grad_norm": 0.0296630859375, + "learning_rate": 0.009746414475845216, + "loss": 0.8076, + "num_input_tokens_seen": 50870200, + "step": 87620 + }, + { + "epoch": 13.051087280309801, + "grad_norm": 0.041015625, + "learning_rate": 0.009744588357677431, + "loss": 0.8088, + "num_input_tokens_seen": 50873112, + "step": 87625 + }, + { + "epoch": 13.05183199285076, + "grad_norm": 0.0419921875, + "learning_rate": 0.009742762328294479, + "loss": 0.7904, + "num_input_tokens_seen": 50876088, + "step": 87630 + }, + { + "epoch": 13.052576705391719, + "grad_norm": 0.0184326171875, + "learning_rate": 0.009740936387727222, + "loss": 0.7877, + "num_input_tokens_seen": 50879160, + "step": 87635 + }, + { + "epoch": 13.053321417932677, + "grad_norm": 0.0225830078125, + "learning_rate": 0.009739110536006491, + "loss": 0.8142, + "num_input_tokens_seen": 50882232, + "step": 87640 + }, + { + "epoch": 13.054066130473638, + "grad_norm": 0.038818359375, + "learning_rate": 0.009737284773163149, + "loss": 0.7785, + "num_input_tokens_seen": 50885848, + "step": 87645 + }, + { + "epoch": 13.054810843014597, + "grad_norm": 0.034423828125, + "learning_rate": 0.009735459099228027, + "loss": 0.8272, + "num_input_tokens_seen": 50888856, + "step": 87650 + }, + { + "epoch": 13.055555555555555, + "grad_norm": 0.044677734375, + "learning_rate": 0.009733633514231977, + "loss": 0.8134, + "num_input_tokens_seen": 50891640, + "step": 87655 + }, + { + "epoch": 13.056300268096514, + "grad_norm": 0.033447265625, + "learning_rate": 0.009731808018205836, + "loss": 0.7769, + "num_input_tokens_seen": 50894296, + "step": 87660 + }, + { + "epoch": 13.057044980637475, + "grad_norm": 0.04052734375, + "learning_rate": 0.00972998261118044, + "loss": 0.7923, + "num_input_tokens_seen": 50897208, + "step": 87665 + }, + { + "epoch": 13.057789693178433, + "grad_norm": 0.03369140625, + "learning_rate": 0.009728157293186634, + "loss": 0.8063, + "num_input_tokens_seen": 50900088, + "step": 87670 + }, + { + "epoch": 13.058534405719392, + "grad_norm": 0.03857421875, + "learning_rate": 0.00972633206425525, + "loss": 0.7901, + "num_input_tokens_seen": 50903096, + "step": 87675 + }, + { + "epoch": 13.059279118260351, + "grad_norm": 0.031494140625, + "learning_rate": 0.009724506924417129, + "loss": 0.7996, + "num_input_tokens_seen": 50906072, + "step": 87680 + }, + { + "epoch": 13.060023830801311, + "grad_norm": 0.03173828125, + "learning_rate": 0.0097226818737031, + "loss": 0.825, + "num_input_tokens_seen": 50908888, + "step": 87685 + }, + { + "epoch": 13.06076854334227, + "grad_norm": 0.050537109375, + "learning_rate": 0.009720856912144001, + "loss": 0.7888, + "num_input_tokens_seen": 50911640, + "step": 87690 + }, + { + "epoch": 13.061513255883229, + "grad_norm": 0.0302734375, + "learning_rate": 0.009719032039770658, + "loss": 0.7757, + "num_input_tokens_seen": 50914552, + "step": 87695 + }, + { + "epoch": 13.062257968424188, + "grad_norm": 0.0289306640625, + "learning_rate": 0.009717207256613895, + "loss": 0.8117, + "num_input_tokens_seen": 50917400, + "step": 87700 + }, + { + "epoch": 13.063002680965148, + "grad_norm": 0.03173828125, + "learning_rate": 0.009715382562704553, + "loss": 0.8056, + "num_input_tokens_seen": 50920216, + "step": 87705 + }, + { + "epoch": 13.063747393506107, + "grad_norm": 0.0302734375, + "learning_rate": 0.009713557958073443, + "loss": 0.8194, + "num_input_tokens_seen": 50923256, + "step": 87710 + }, + { + "epoch": 13.064492106047066, + "grad_norm": 0.0203857421875, + "learning_rate": 0.009711733442751409, + "loss": 0.7801, + "num_input_tokens_seen": 50926040, + "step": 87715 + }, + { + "epoch": 13.065236818588025, + "grad_norm": 0.03759765625, + "learning_rate": 0.009709909016769257, + "loss": 0.7855, + "num_input_tokens_seen": 50928952, + "step": 87720 + }, + { + "epoch": 13.065981531128985, + "grad_norm": 0.0186767578125, + "learning_rate": 0.00970808468015782, + "loss": 0.7999, + "num_input_tokens_seen": 50931960, + "step": 87725 + }, + { + "epoch": 13.066726243669944, + "grad_norm": 0.048828125, + "learning_rate": 0.009706260432947906, + "loss": 0.7859, + "num_input_tokens_seen": 50934776, + "step": 87730 + }, + { + "epoch": 13.067470956210903, + "grad_norm": 0.03271484375, + "learning_rate": 0.00970443627517035, + "loss": 0.794, + "num_input_tokens_seen": 50937592, + "step": 87735 + }, + { + "epoch": 13.068215668751861, + "grad_norm": 0.03125, + "learning_rate": 0.00970261220685596, + "loss": 0.8008, + "num_input_tokens_seen": 50940504, + "step": 87740 + }, + { + "epoch": 13.06896038129282, + "grad_norm": 0.0400390625, + "learning_rate": 0.009700788228035547, + "loss": 0.8007, + "num_input_tokens_seen": 50943192, + "step": 87745 + }, + { + "epoch": 13.06970509383378, + "grad_norm": 0.046142578125, + "learning_rate": 0.00969896433873994, + "loss": 0.799, + "num_input_tokens_seen": 50946136, + "step": 87750 + }, + { + "epoch": 13.07044980637474, + "grad_norm": 0.030029296875, + "learning_rate": 0.009697140538999935, + "loss": 0.7766, + "num_input_tokens_seen": 50949304, + "step": 87755 + }, + { + "epoch": 13.071194518915698, + "grad_norm": 0.0400390625, + "learning_rate": 0.009695316828846357, + "loss": 0.7995, + "num_input_tokens_seen": 50952152, + "step": 87760 + }, + { + "epoch": 13.071939231456657, + "grad_norm": 0.04736328125, + "learning_rate": 0.00969349320831, + "loss": 0.8038, + "num_input_tokens_seen": 50955288, + "step": 87765 + }, + { + "epoch": 13.072683943997617, + "grad_norm": 0.04052734375, + "learning_rate": 0.009691669677421692, + "loss": 0.8122, + "num_input_tokens_seen": 50957816, + "step": 87770 + }, + { + "epoch": 13.073428656538576, + "grad_norm": 0.080078125, + "learning_rate": 0.009689846236212227, + "loss": 0.806, + "num_input_tokens_seen": 50960504, + "step": 87775 + }, + { + "epoch": 13.074173369079535, + "grad_norm": 0.049560546875, + "learning_rate": 0.009688022884712408, + "loss": 0.7663, + "num_input_tokens_seen": 50963256, + "step": 87780 + }, + { + "epoch": 13.074918081620494, + "grad_norm": 0.033447265625, + "learning_rate": 0.00968619962295305, + "loss": 0.8149, + "num_input_tokens_seen": 50966296, + "step": 87785 + }, + { + "epoch": 13.075662794161454, + "grad_norm": 0.03564453125, + "learning_rate": 0.009684376450964946, + "loss": 0.7904, + "num_input_tokens_seen": 50969144, + "step": 87790 + }, + { + "epoch": 13.076407506702413, + "grad_norm": 0.04345703125, + "learning_rate": 0.009682553368778902, + "loss": 0.7746, + "num_input_tokens_seen": 50971864, + "step": 87795 + }, + { + "epoch": 13.077152219243372, + "grad_norm": 0.04345703125, + "learning_rate": 0.009680730376425707, + "loss": 0.7768, + "num_input_tokens_seen": 50974776, + "step": 87800 + }, + { + "epoch": 13.07789693178433, + "grad_norm": 0.044921875, + "learning_rate": 0.009678907473936172, + "loss": 0.7649, + "num_input_tokens_seen": 50977432, + "step": 87805 + }, + { + "epoch": 13.078641644325291, + "grad_norm": 0.045654296875, + "learning_rate": 0.009677084661341082, + "loss": 0.8152, + "num_input_tokens_seen": 50980568, + "step": 87810 + }, + { + "epoch": 13.07938635686625, + "grad_norm": 0.04833984375, + "learning_rate": 0.009675261938671242, + "loss": 0.791, + "num_input_tokens_seen": 50983672, + "step": 87815 + }, + { + "epoch": 13.080131069407209, + "grad_norm": 0.044189453125, + "learning_rate": 0.009673439305957442, + "loss": 0.7857, + "num_input_tokens_seen": 50986648, + "step": 87820 + }, + { + "epoch": 13.080875781948167, + "grad_norm": 0.0634765625, + "learning_rate": 0.009671616763230466, + "loss": 0.8041, + "num_input_tokens_seen": 50989816, + "step": 87825 + }, + { + "epoch": 13.081620494489128, + "grad_norm": 0.03515625, + "learning_rate": 0.009669794310521115, + "loss": 0.796, + "num_input_tokens_seen": 50992696, + "step": 87830 + }, + { + "epoch": 13.082365207030087, + "grad_norm": 0.027099609375, + "learning_rate": 0.009667971947860163, + "loss": 0.7966, + "num_input_tokens_seen": 50995608, + "step": 87835 + }, + { + "epoch": 13.083109919571045, + "grad_norm": 0.060546875, + "learning_rate": 0.009666149675278412, + "loss": 0.792, + "num_input_tokens_seen": 50998392, + "step": 87840 + }, + { + "epoch": 13.083854632112004, + "grad_norm": 0.043212890625, + "learning_rate": 0.009664327492806634, + "loss": 0.8017, + "num_input_tokens_seen": 51001336, + "step": 87845 + }, + { + "epoch": 13.084599344652965, + "grad_norm": 0.0301513671875, + "learning_rate": 0.009662505400475628, + "loss": 0.7869, + "num_input_tokens_seen": 51004024, + "step": 87850 + }, + { + "epoch": 13.085344057193923, + "grad_norm": 0.039306640625, + "learning_rate": 0.009660683398316168, + "loss": 0.8037, + "num_input_tokens_seen": 51007320, + "step": 87855 + }, + { + "epoch": 13.086088769734882, + "grad_norm": 0.052001953125, + "learning_rate": 0.009658861486359032, + "loss": 0.7986, + "num_input_tokens_seen": 51010456, + "step": 87860 + }, + { + "epoch": 13.086833482275841, + "grad_norm": 0.0201416015625, + "learning_rate": 0.009657039664635008, + "loss": 0.8038, + "num_input_tokens_seen": 51013432, + "step": 87865 + }, + { + "epoch": 13.087578194816802, + "grad_norm": 0.0208740234375, + "learning_rate": 0.009655217933174861, + "loss": 0.8159, + "num_input_tokens_seen": 51016280, + "step": 87870 + }, + { + "epoch": 13.08832290735776, + "grad_norm": 0.03173828125, + "learning_rate": 0.00965339629200938, + "loss": 0.7815, + "num_input_tokens_seen": 51019448, + "step": 87875 + }, + { + "epoch": 13.089067619898719, + "grad_norm": 0.04248046875, + "learning_rate": 0.009651574741169332, + "loss": 0.8157, + "num_input_tokens_seen": 51022360, + "step": 87880 + }, + { + "epoch": 13.089812332439678, + "grad_norm": 0.0291748046875, + "learning_rate": 0.009649753280685493, + "loss": 0.7884, + "num_input_tokens_seen": 51025080, + "step": 87885 + }, + { + "epoch": 13.090557044980638, + "grad_norm": 0.0191650390625, + "learning_rate": 0.009647931910588634, + "loss": 0.7823, + "num_input_tokens_seen": 51027832, + "step": 87890 + }, + { + "epoch": 13.091301757521597, + "grad_norm": 0.044189453125, + "learning_rate": 0.00964611063090953, + "loss": 0.7678, + "num_input_tokens_seen": 51030808, + "step": 87895 + }, + { + "epoch": 13.092046470062556, + "grad_norm": 0.0303955078125, + "learning_rate": 0.009644289441678944, + "loss": 0.802, + "num_input_tokens_seen": 51034104, + "step": 87900 + }, + { + "epoch": 13.092791182603515, + "grad_norm": 0.034423828125, + "learning_rate": 0.00964246834292764, + "loss": 0.7973, + "num_input_tokens_seen": 51037048, + "step": 87905 + }, + { + "epoch": 13.093535895144473, + "grad_norm": 0.042236328125, + "learning_rate": 0.009640647334686394, + "loss": 0.7861, + "num_input_tokens_seen": 51040184, + "step": 87910 + }, + { + "epoch": 13.094280607685434, + "grad_norm": 0.0294189453125, + "learning_rate": 0.00963882641698596, + "loss": 0.7872, + "num_input_tokens_seen": 51042808, + "step": 87915 + }, + { + "epoch": 13.095025320226393, + "grad_norm": 0.04541015625, + "learning_rate": 0.009637005589857105, + "loss": 0.807, + "num_input_tokens_seen": 51045432, + "step": 87920 + }, + { + "epoch": 13.095770032767351, + "grad_norm": 0.036376953125, + "learning_rate": 0.00963518485333059, + "loss": 0.7858, + "num_input_tokens_seen": 51048536, + "step": 87925 + }, + { + "epoch": 13.09651474530831, + "grad_norm": 0.0322265625, + "learning_rate": 0.009633364207437179, + "loss": 0.7865, + "num_input_tokens_seen": 51051448, + "step": 87930 + }, + { + "epoch": 13.09725945784927, + "grad_norm": 0.03173828125, + "learning_rate": 0.009631543652207624, + "loss": 0.8005, + "num_input_tokens_seen": 51054328, + "step": 87935 + }, + { + "epoch": 13.09800417039023, + "grad_norm": 0.050537109375, + "learning_rate": 0.009629723187672676, + "loss": 0.8063, + "num_input_tokens_seen": 51057080, + "step": 87940 + }, + { + "epoch": 13.098748882931188, + "grad_norm": 0.045166015625, + "learning_rate": 0.009627902813863104, + "loss": 0.8013, + "num_input_tokens_seen": 51060120, + "step": 87945 + }, + { + "epoch": 13.099493595472147, + "grad_norm": 0.049072265625, + "learning_rate": 0.009626082530809653, + "loss": 0.7989, + "num_input_tokens_seen": 51063160, + "step": 87950 + }, + { + "epoch": 13.100238308013108, + "grad_norm": 0.022216796875, + "learning_rate": 0.009624262338543075, + "loss": 0.7932, + "num_input_tokens_seen": 51066136, + "step": 87955 + }, + { + "epoch": 13.100983020554066, + "grad_norm": 0.041259765625, + "learning_rate": 0.009622442237094115, + "loss": 0.8071, + "num_input_tokens_seen": 51068888, + "step": 87960 + }, + { + "epoch": 13.101727733095025, + "grad_norm": 0.059814453125, + "learning_rate": 0.009620622226493536, + "loss": 0.7774, + "num_input_tokens_seen": 51072056, + "step": 87965 + }, + { + "epoch": 13.102472445635984, + "grad_norm": 0.1611328125, + "learning_rate": 0.009618802306772072, + "loss": 0.8502, + "num_input_tokens_seen": 51075032, + "step": 87970 + }, + { + "epoch": 13.103217158176944, + "grad_norm": 0.03662109375, + "learning_rate": 0.009616982477960476, + "loss": 0.8222, + "num_input_tokens_seen": 51077944, + "step": 87975 + }, + { + "epoch": 13.103961870717903, + "grad_norm": 0.03759765625, + "learning_rate": 0.009615162740089495, + "loss": 0.8084, + "num_input_tokens_seen": 51080632, + "step": 87980 + }, + { + "epoch": 13.104706583258862, + "grad_norm": 0.050048828125, + "learning_rate": 0.009613343093189858, + "loss": 0.7786, + "num_input_tokens_seen": 51083352, + "step": 87985 + }, + { + "epoch": 13.10545129579982, + "grad_norm": 0.037353515625, + "learning_rate": 0.009611523537292322, + "loss": 0.7916, + "num_input_tokens_seen": 51086360, + "step": 87990 + }, + { + "epoch": 13.106196008340781, + "grad_norm": 0.0185546875, + "learning_rate": 0.009609704072427611, + "loss": 0.7948, + "num_input_tokens_seen": 51089144, + "step": 87995 + }, + { + "epoch": 13.10694072088174, + "grad_norm": 0.044189453125, + "learning_rate": 0.009607884698626478, + "loss": 0.7989, + "num_input_tokens_seen": 51092088, + "step": 88000 + }, + { + "epoch": 13.107685433422699, + "grad_norm": 0.045654296875, + "learning_rate": 0.009606065415919646, + "loss": 0.7818, + "num_input_tokens_seen": 51094968, + "step": 88005 + }, + { + "epoch": 13.108430145963657, + "grad_norm": 0.03369140625, + "learning_rate": 0.009604246224337865, + "loss": 0.8104, + "num_input_tokens_seen": 51098136, + "step": 88010 + }, + { + "epoch": 13.109174858504618, + "grad_norm": 0.0380859375, + "learning_rate": 0.009602427123911857, + "loss": 0.7886, + "num_input_tokens_seen": 51101048, + "step": 88015 + }, + { + "epoch": 13.109919571045577, + "grad_norm": 0.0361328125, + "learning_rate": 0.009600608114672357, + "loss": 0.8012, + "num_input_tokens_seen": 51103736, + "step": 88020 + }, + { + "epoch": 13.110664283586535, + "grad_norm": 0.037841796875, + "learning_rate": 0.009598789196650095, + "loss": 0.8089, + "num_input_tokens_seen": 51106840, + "step": 88025 + }, + { + "epoch": 13.111408996127494, + "grad_norm": 0.03857421875, + "learning_rate": 0.009596970369875798, + "loss": 0.817, + "num_input_tokens_seen": 51109592, + "step": 88030 + }, + { + "epoch": 13.112153708668455, + "grad_norm": 0.045166015625, + "learning_rate": 0.009595151634380197, + "loss": 0.7847, + "num_input_tokens_seen": 51112440, + "step": 88035 + }, + { + "epoch": 13.112898421209414, + "grad_norm": 0.02294921875, + "learning_rate": 0.009593332990194011, + "loss": 0.8083, + "num_input_tokens_seen": 51115128, + "step": 88040 + }, + { + "epoch": 13.113643133750372, + "grad_norm": 0.041259765625, + "learning_rate": 0.009591514437347976, + "loss": 0.7943, + "num_input_tokens_seen": 51117720, + "step": 88045 + }, + { + "epoch": 13.114387846291331, + "grad_norm": 0.0400390625, + "learning_rate": 0.009589695975872807, + "loss": 0.8012, + "num_input_tokens_seen": 51120792, + "step": 88050 + }, + { + "epoch": 13.115132558832292, + "grad_norm": 0.053466796875, + "learning_rate": 0.009587877605799225, + "loss": 0.7888, + "num_input_tokens_seen": 51123576, + "step": 88055 + }, + { + "epoch": 13.11587727137325, + "grad_norm": 0.03125, + "learning_rate": 0.009586059327157951, + "loss": 0.801, + "num_input_tokens_seen": 51126648, + "step": 88060 + }, + { + "epoch": 13.116621983914209, + "grad_norm": 0.049072265625, + "learning_rate": 0.009584241139979697, + "loss": 0.7941, + "num_input_tokens_seen": 51129432, + "step": 88065 + }, + { + "epoch": 13.117366696455168, + "grad_norm": 0.045166015625, + "learning_rate": 0.009582423044295189, + "loss": 0.7813, + "num_input_tokens_seen": 51132344, + "step": 88070 + }, + { + "epoch": 13.118111408996128, + "grad_norm": 0.0380859375, + "learning_rate": 0.009580605040135134, + "loss": 0.7984, + "num_input_tokens_seen": 51135480, + "step": 88075 + }, + { + "epoch": 13.118856121537087, + "grad_norm": 0.045654296875, + "learning_rate": 0.009578787127530252, + "loss": 0.7996, + "num_input_tokens_seen": 51138424, + "step": 88080 + }, + { + "epoch": 13.119600834078046, + "grad_norm": 0.046875, + "learning_rate": 0.00957696930651125, + "loss": 0.8044, + "num_input_tokens_seen": 51141272, + "step": 88085 + }, + { + "epoch": 13.120345546619005, + "grad_norm": 0.03466796875, + "learning_rate": 0.009575151577108842, + "loss": 0.7803, + "num_input_tokens_seen": 51144248, + "step": 88090 + }, + { + "epoch": 13.121090259159963, + "grad_norm": 0.0208740234375, + "learning_rate": 0.009573333939353737, + "loss": 0.7964, + "num_input_tokens_seen": 51147256, + "step": 88095 + }, + { + "epoch": 13.121834971700924, + "grad_norm": 0.034423828125, + "learning_rate": 0.00957151639327663, + "loss": 0.8171, + "num_input_tokens_seen": 51150392, + "step": 88100 + }, + { + "epoch": 13.122579684241883, + "grad_norm": 0.0751953125, + "learning_rate": 0.009569698938908244, + "loss": 0.7824, + "num_input_tokens_seen": 51153304, + "step": 88105 + }, + { + "epoch": 13.123324396782841, + "grad_norm": 0.045166015625, + "learning_rate": 0.009567881576279273, + "loss": 0.8104, + "num_input_tokens_seen": 51156312, + "step": 88110 + }, + { + "epoch": 13.1240691093238, + "grad_norm": 0.038330078125, + "learning_rate": 0.009566064305420422, + "loss": 0.797, + "num_input_tokens_seen": 51159224, + "step": 88115 + }, + { + "epoch": 13.12481382186476, + "grad_norm": 0.052978515625, + "learning_rate": 0.009564247126362392, + "loss": 0.7573, + "num_input_tokens_seen": 51162232, + "step": 88120 + }, + { + "epoch": 13.12555853440572, + "grad_norm": 0.031494140625, + "learning_rate": 0.009562430039135884, + "loss": 0.8213, + "num_input_tokens_seen": 51164984, + "step": 88125 + }, + { + "epoch": 13.126303246946678, + "grad_norm": 0.03125, + "learning_rate": 0.009560613043771588, + "loss": 0.7896, + "num_input_tokens_seen": 51167672, + "step": 88130 + }, + { + "epoch": 13.127047959487637, + "grad_norm": 0.046875, + "learning_rate": 0.009558796140300212, + "loss": 0.7946, + "num_input_tokens_seen": 51170392, + "step": 88135 + }, + { + "epoch": 13.127792672028598, + "grad_norm": 0.04345703125, + "learning_rate": 0.009556979328752446, + "loss": 0.7853, + "num_input_tokens_seen": 51173048, + "step": 88140 + }, + { + "epoch": 13.128537384569556, + "grad_norm": 0.047119140625, + "learning_rate": 0.009555162609158976, + "loss": 0.7887, + "num_input_tokens_seen": 51175960, + "step": 88145 + }, + { + "epoch": 13.129282097110515, + "grad_norm": 0.047607421875, + "learning_rate": 0.009553345981550505, + "loss": 0.7962, + "num_input_tokens_seen": 51179032, + "step": 88150 + }, + { + "epoch": 13.130026809651474, + "grad_norm": 0.027099609375, + "learning_rate": 0.009551529445957717, + "loss": 0.7927, + "num_input_tokens_seen": 51182008, + "step": 88155 + }, + { + "epoch": 13.130771522192434, + "grad_norm": 0.033203125, + "learning_rate": 0.009549713002411303, + "loss": 0.8165, + "num_input_tokens_seen": 51184952, + "step": 88160 + }, + { + "epoch": 13.131516234733393, + "grad_norm": 0.0267333984375, + "learning_rate": 0.009547896650941943, + "loss": 0.7894, + "num_input_tokens_seen": 51187960, + "step": 88165 + }, + { + "epoch": 13.132260947274352, + "grad_norm": 0.031982421875, + "learning_rate": 0.009546080391580333, + "loss": 0.8006, + "num_input_tokens_seen": 51190808, + "step": 88170 + }, + { + "epoch": 13.13300565981531, + "grad_norm": 0.033203125, + "learning_rate": 0.009544264224357151, + "loss": 0.7877, + "num_input_tokens_seen": 51193656, + "step": 88175 + }, + { + "epoch": 13.133750372356271, + "grad_norm": 0.033203125, + "learning_rate": 0.009542448149303075, + "loss": 0.8047, + "num_input_tokens_seen": 51196568, + "step": 88180 + }, + { + "epoch": 13.13449508489723, + "grad_norm": 0.031982421875, + "learning_rate": 0.009540632166448796, + "loss": 0.7703, + "num_input_tokens_seen": 51199160, + "step": 88185 + }, + { + "epoch": 13.135239797438189, + "grad_norm": 0.041015625, + "learning_rate": 0.009538816275824985, + "loss": 0.8077, + "num_input_tokens_seen": 51201944, + "step": 88190 + }, + { + "epoch": 13.135984509979147, + "grad_norm": 0.040283203125, + "learning_rate": 0.009537000477462327, + "loss": 0.8484, + "num_input_tokens_seen": 51204728, + "step": 88195 + }, + { + "epoch": 13.136729222520108, + "grad_norm": 0.0439453125, + "learning_rate": 0.009535184771391486, + "loss": 0.8079, + "num_input_tokens_seen": 51207640, + "step": 88200 + }, + { + "epoch": 13.137473935061067, + "grad_norm": 0.040283203125, + "learning_rate": 0.00953336915764315, + "loss": 0.7906, + "num_input_tokens_seen": 51210360, + "step": 88205 + }, + { + "epoch": 13.138218647602026, + "grad_norm": 0.0296630859375, + "learning_rate": 0.009531553636247985, + "loss": 0.8094, + "num_input_tokens_seen": 51212920, + "step": 88210 + }, + { + "epoch": 13.138963360142984, + "grad_norm": 0.031494140625, + "learning_rate": 0.009529738207236661, + "loss": 0.8046, + "num_input_tokens_seen": 51215736, + "step": 88215 + }, + { + "epoch": 13.139708072683945, + "grad_norm": 0.031982421875, + "learning_rate": 0.009527922870639855, + "loss": 0.7943, + "num_input_tokens_seen": 51218872, + "step": 88220 + }, + { + "epoch": 13.140452785224904, + "grad_norm": 0.050048828125, + "learning_rate": 0.009526107626488228, + "loss": 0.8229, + "num_input_tokens_seen": 51221848, + "step": 88225 + }, + { + "epoch": 13.141197497765862, + "grad_norm": 0.037841796875, + "learning_rate": 0.00952429247481245, + "loss": 0.8114, + "num_input_tokens_seen": 51224632, + "step": 88230 + }, + { + "epoch": 13.141942210306821, + "grad_norm": 0.038818359375, + "learning_rate": 0.009522477415643181, + "loss": 0.8087, + "num_input_tokens_seen": 51227608, + "step": 88235 + }, + { + "epoch": 13.142686922847782, + "grad_norm": 0.0478515625, + "learning_rate": 0.009520662449011093, + "loss": 0.8101, + "num_input_tokens_seen": 51230232, + "step": 88240 + }, + { + "epoch": 13.14343163538874, + "grad_norm": 0.039794921875, + "learning_rate": 0.009518847574946844, + "loss": 0.8002, + "num_input_tokens_seen": 51233656, + "step": 88245 + }, + { + "epoch": 13.1441763479297, + "grad_norm": 0.06982421875, + "learning_rate": 0.009517032793481092, + "loss": 0.8017, + "num_input_tokens_seen": 51236696, + "step": 88250 + }, + { + "epoch": 13.144921060470658, + "grad_norm": 0.048828125, + "learning_rate": 0.009515218104644499, + "loss": 0.8076, + "num_input_tokens_seen": 51239768, + "step": 88255 + }, + { + "epoch": 13.145665773011617, + "grad_norm": 0.03466796875, + "learning_rate": 0.009513403508467725, + "loss": 0.8137, + "num_input_tokens_seen": 51242584, + "step": 88260 + }, + { + "epoch": 13.146410485552577, + "grad_norm": 0.037109375, + "learning_rate": 0.009511589004981422, + "loss": 0.8225, + "num_input_tokens_seen": 51245688, + "step": 88265 + }, + { + "epoch": 13.147155198093536, + "grad_norm": 0.03271484375, + "learning_rate": 0.00950977459421624, + "loss": 0.7997, + "num_input_tokens_seen": 51248696, + "step": 88270 + }, + { + "epoch": 13.147899910634495, + "grad_norm": 0.048583984375, + "learning_rate": 0.00950796027620284, + "loss": 0.7879, + "num_input_tokens_seen": 51251512, + "step": 88275 + }, + { + "epoch": 13.148644623175453, + "grad_norm": 0.051513671875, + "learning_rate": 0.009506146050971869, + "loss": 0.7898, + "num_input_tokens_seen": 51254200, + "step": 88280 + }, + { + "epoch": 13.149389335716414, + "grad_norm": 0.031494140625, + "learning_rate": 0.009504331918553977, + "loss": 0.7853, + "num_input_tokens_seen": 51257176, + "step": 88285 + }, + { + "epoch": 13.150134048257373, + "grad_norm": 0.025390625, + "learning_rate": 0.009502517878979806, + "loss": 0.7945, + "num_input_tokens_seen": 51260312, + "step": 88290 + }, + { + "epoch": 13.150878760798332, + "grad_norm": 0.04931640625, + "learning_rate": 0.009500703932280015, + "loss": 0.8131, + "num_input_tokens_seen": 51263320, + "step": 88295 + }, + { + "epoch": 13.15162347333929, + "grad_norm": 0.03857421875, + "learning_rate": 0.009498890078485242, + "loss": 0.791, + "num_input_tokens_seen": 51266200, + "step": 88300 + }, + { + "epoch": 13.15236818588025, + "grad_norm": 0.0546875, + "learning_rate": 0.009497076317626125, + "loss": 0.7972, + "num_input_tokens_seen": 51268984, + "step": 88305 + }, + { + "epoch": 13.15311289842121, + "grad_norm": 0.03076171875, + "learning_rate": 0.009495262649733313, + "loss": 0.8221, + "num_input_tokens_seen": 51271800, + "step": 88310 + }, + { + "epoch": 13.153857610962168, + "grad_norm": 0.039306640625, + "learning_rate": 0.009493449074837443, + "loss": 0.8047, + "num_input_tokens_seen": 51274712, + "step": 88315 + }, + { + "epoch": 13.154602323503127, + "grad_norm": 0.03662109375, + "learning_rate": 0.00949163559296916, + "loss": 0.7791, + "num_input_tokens_seen": 51278168, + "step": 88320 + }, + { + "epoch": 13.155347036044088, + "grad_norm": 0.020263671875, + "learning_rate": 0.009489822204159086, + "loss": 0.7882, + "num_input_tokens_seen": 51280728, + "step": 88325 + }, + { + "epoch": 13.156091748585046, + "grad_norm": 0.0303955078125, + "learning_rate": 0.009488008908437871, + "loss": 0.7865, + "num_input_tokens_seen": 51283800, + "step": 88330 + }, + { + "epoch": 13.156836461126005, + "grad_norm": 0.036376953125, + "learning_rate": 0.009486195705836139, + "loss": 0.8006, + "num_input_tokens_seen": 51286808, + "step": 88335 + }, + { + "epoch": 13.157581173666964, + "grad_norm": 0.0289306640625, + "learning_rate": 0.009484382596384532, + "loss": 0.7968, + "num_input_tokens_seen": 51289816, + "step": 88340 + }, + { + "epoch": 13.158325886207924, + "grad_norm": 0.022216796875, + "learning_rate": 0.009482569580113676, + "loss": 0.8124, + "num_input_tokens_seen": 51292696, + "step": 88345 + }, + { + "epoch": 13.159070598748883, + "grad_norm": 0.043212890625, + "learning_rate": 0.009480756657054194, + "loss": 0.8109, + "num_input_tokens_seen": 51295896, + "step": 88350 + }, + { + "epoch": 13.159815311289842, + "grad_norm": 0.03857421875, + "learning_rate": 0.009478943827236724, + "loss": 0.7831, + "num_input_tokens_seen": 51298744, + "step": 88355 + }, + { + "epoch": 13.1605600238308, + "grad_norm": 0.03271484375, + "learning_rate": 0.00947713109069188, + "loss": 0.7949, + "num_input_tokens_seen": 51301816, + "step": 88360 + }, + { + "epoch": 13.161304736371761, + "grad_norm": 0.050048828125, + "learning_rate": 0.009475318447450299, + "loss": 0.8055, + "num_input_tokens_seen": 51304664, + "step": 88365 + }, + { + "epoch": 13.16204944891272, + "grad_norm": 0.022705078125, + "learning_rate": 0.009473505897542592, + "loss": 0.8031, + "num_input_tokens_seen": 51307544, + "step": 88370 + }, + { + "epoch": 13.162794161453679, + "grad_norm": 0.040283203125, + "learning_rate": 0.009471693440999393, + "loss": 0.7867, + "num_input_tokens_seen": 51310360, + "step": 88375 + }, + { + "epoch": 13.163538873994638, + "grad_norm": 0.06201171875, + "learning_rate": 0.009469881077851313, + "loss": 0.8134, + "num_input_tokens_seen": 51313720, + "step": 88380 + }, + { + "epoch": 13.164283586535598, + "grad_norm": 0.035888671875, + "learning_rate": 0.00946806880812897, + "loss": 0.7904, + "num_input_tokens_seen": 51316504, + "step": 88385 + }, + { + "epoch": 13.165028299076557, + "grad_norm": 0.03173828125, + "learning_rate": 0.009466256631862983, + "loss": 0.8008, + "num_input_tokens_seen": 51319672, + "step": 88390 + }, + { + "epoch": 13.165773011617516, + "grad_norm": 0.0439453125, + "learning_rate": 0.009464444549083961, + "loss": 0.7944, + "num_input_tokens_seen": 51322616, + "step": 88395 + }, + { + "epoch": 13.166517724158474, + "grad_norm": 0.03271484375, + "learning_rate": 0.009462632559822529, + "loss": 0.7962, + "num_input_tokens_seen": 51325656, + "step": 88400 + }, + { + "epoch": 13.167262436699435, + "grad_norm": 0.041015625, + "learning_rate": 0.009460820664109287, + "loss": 0.7892, + "num_input_tokens_seen": 51328472, + "step": 88405 + }, + { + "epoch": 13.168007149240394, + "grad_norm": 0.056640625, + "learning_rate": 0.009459008861974852, + "loss": 0.7984, + "num_input_tokens_seen": 51331384, + "step": 88410 + }, + { + "epoch": 13.168751861781352, + "grad_norm": 0.03466796875, + "learning_rate": 0.00945719715344983, + "loss": 0.7684, + "num_input_tokens_seen": 51334264, + "step": 88415 + }, + { + "epoch": 13.169496574322311, + "grad_norm": 0.04443359375, + "learning_rate": 0.009455385538564832, + "loss": 0.795, + "num_input_tokens_seen": 51337176, + "step": 88420 + }, + { + "epoch": 13.17024128686327, + "grad_norm": 0.046142578125, + "learning_rate": 0.009453574017350459, + "loss": 0.7901, + "num_input_tokens_seen": 51339928, + "step": 88425 + }, + { + "epoch": 13.17098599940423, + "grad_norm": 0.0537109375, + "learning_rate": 0.009451762589837312, + "loss": 0.7892, + "num_input_tokens_seen": 51342648, + "step": 88430 + }, + { + "epoch": 13.17173071194519, + "grad_norm": 0.06787109375, + "learning_rate": 0.009449951256056003, + "loss": 0.8203, + "num_input_tokens_seen": 51345208, + "step": 88435 + }, + { + "epoch": 13.172475424486148, + "grad_norm": 0.031494140625, + "learning_rate": 0.009448140016037119, + "loss": 0.781, + "num_input_tokens_seen": 51348184, + "step": 88440 + }, + { + "epoch": 13.173220137027107, + "grad_norm": 0.0311279296875, + "learning_rate": 0.009446328869811272, + "loss": 0.7898, + "num_input_tokens_seen": 51350808, + "step": 88445 + }, + { + "epoch": 13.173964849568067, + "grad_norm": 0.035888671875, + "learning_rate": 0.009444517817409053, + "loss": 0.7982, + "num_input_tokens_seen": 51353464, + "step": 88450 + }, + { + "epoch": 13.174709562109026, + "grad_norm": 0.040771484375, + "learning_rate": 0.00944270685886106, + "loss": 0.7988, + "num_input_tokens_seen": 51356600, + "step": 88455 + }, + { + "epoch": 13.175454274649985, + "grad_norm": 0.0322265625, + "learning_rate": 0.009440895994197888, + "loss": 0.7939, + "num_input_tokens_seen": 51359352, + "step": 88460 + }, + { + "epoch": 13.176198987190944, + "grad_norm": 0.0225830078125, + "learning_rate": 0.009439085223450123, + "loss": 0.8107, + "num_input_tokens_seen": 51362136, + "step": 88465 + }, + { + "epoch": 13.176943699731904, + "grad_norm": 0.025146484375, + "learning_rate": 0.009437274546648366, + "loss": 0.7876, + "num_input_tokens_seen": 51365112, + "step": 88470 + }, + { + "epoch": 13.177688412272863, + "grad_norm": 0.0556640625, + "learning_rate": 0.009435463963823196, + "loss": 0.8166, + "num_input_tokens_seen": 51367960, + "step": 88475 + }, + { + "epoch": 13.178433124813822, + "grad_norm": 0.043701171875, + "learning_rate": 0.009433653475005214, + "loss": 0.7823, + "num_input_tokens_seen": 51370744, + "step": 88480 + }, + { + "epoch": 13.17917783735478, + "grad_norm": 0.03271484375, + "learning_rate": 0.009431843080224996, + "loss": 0.823, + "num_input_tokens_seen": 51373560, + "step": 88485 + }, + { + "epoch": 13.17992254989574, + "grad_norm": 0.033447265625, + "learning_rate": 0.00943003277951313, + "loss": 0.788, + "num_input_tokens_seen": 51376728, + "step": 88490 + }, + { + "epoch": 13.1806672624367, + "grad_norm": 0.04248046875, + "learning_rate": 0.009428222572900198, + "loss": 0.8286, + "num_input_tokens_seen": 51379576, + "step": 88495 + }, + { + "epoch": 13.181411974977658, + "grad_norm": 0.058349609375, + "learning_rate": 0.009426412460416785, + "loss": 0.7861, + "num_input_tokens_seen": 51382488, + "step": 88500 + }, + { + "epoch": 13.182156687518617, + "grad_norm": 0.05224609375, + "learning_rate": 0.009424602442093468, + "loss": 0.7818, + "num_input_tokens_seen": 51385176, + "step": 88505 + }, + { + "epoch": 13.182901400059578, + "grad_norm": 0.03369140625, + "learning_rate": 0.009422792517960825, + "loss": 0.7946, + "num_input_tokens_seen": 51387864, + "step": 88510 + }, + { + "epoch": 13.183646112600536, + "grad_norm": 0.04638671875, + "learning_rate": 0.009420982688049435, + "loss": 0.7987, + "num_input_tokens_seen": 51390584, + "step": 88515 + }, + { + "epoch": 13.184390825141495, + "grad_norm": 0.048583984375, + "learning_rate": 0.009419172952389872, + "loss": 0.7932, + "num_input_tokens_seen": 51393368, + "step": 88520 + }, + { + "epoch": 13.185135537682454, + "grad_norm": 0.042724609375, + "learning_rate": 0.009417363311012712, + "loss": 0.8022, + "num_input_tokens_seen": 51396216, + "step": 88525 + }, + { + "epoch": 13.185880250223414, + "grad_norm": 0.040283203125, + "learning_rate": 0.00941555376394852, + "loss": 0.8173, + "num_input_tokens_seen": 51399288, + "step": 88530 + }, + { + "epoch": 13.186624962764373, + "grad_norm": 0.039794921875, + "learning_rate": 0.009413744311227875, + "loss": 0.7913, + "num_input_tokens_seen": 51402264, + "step": 88535 + }, + { + "epoch": 13.187369675305332, + "grad_norm": 0.0380859375, + "learning_rate": 0.009411934952881344, + "loss": 0.8099, + "num_input_tokens_seen": 51405048, + "step": 88540 + }, + { + "epoch": 13.18811438784629, + "grad_norm": 0.031005859375, + "learning_rate": 0.009410125688939485, + "loss": 0.7691, + "num_input_tokens_seen": 51407800, + "step": 88545 + }, + { + "epoch": 13.188859100387251, + "grad_norm": 0.041015625, + "learning_rate": 0.00940831651943288, + "loss": 0.7804, + "num_input_tokens_seen": 51410424, + "step": 88550 + }, + { + "epoch": 13.18960381292821, + "grad_norm": 0.03369140625, + "learning_rate": 0.009406507444392078, + "loss": 0.801, + "num_input_tokens_seen": 51413464, + "step": 88555 + }, + { + "epoch": 13.190348525469169, + "grad_norm": 0.0322265625, + "learning_rate": 0.009404698463847653, + "loss": 0.8079, + "num_input_tokens_seen": 51416568, + "step": 88560 + }, + { + "epoch": 13.191093238010128, + "grad_norm": 0.031982421875, + "learning_rate": 0.009402889577830155, + "loss": 0.7982, + "num_input_tokens_seen": 51419512, + "step": 88565 + }, + { + "epoch": 13.191837950551088, + "grad_norm": 0.036376953125, + "learning_rate": 0.009401080786370154, + "loss": 0.8028, + "num_input_tokens_seen": 51422424, + "step": 88570 + }, + { + "epoch": 13.192582663092047, + "grad_norm": 0.031494140625, + "learning_rate": 0.009399272089498199, + "loss": 0.8001, + "num_input_tokens_seen": 51425240, + "step": 88575 + }, + { + "epoch": 13.193327375633006, + "grad_norm": 0.045166015625, + "learning_rate": 0.009397463487244849, + "loss": 0.8043, + "num_input_tokens_seen": 51427832, + "step": 88580 + }, + { + "epoch": 13.194072088173964, + "grad_norm": 0.031005859375, + "learning_rate": 0.009395654979640665, + "loss": 0.8076, + "num_input_tokens_seen": 51430552, + "step": 88585 + }, + { + "epoch": 13.194816800714925, + "grad_norm": 0.0257568359375, + "learning_rate": 0.009393846566716189, + "loss": 0.799, + "num_input_tokens_seen": 51433496, + "step": 88590 + }, + { + "epoch": 13.195561513255884, + "grad_norm": 0.047607421875, + "learning_rate": 0.00939203824850198, + "loss": 0.7939, + "num_input_tokens_seen": 51436248, + "step": 88595 + }, + { + "epoch": 13.196306225796842, + "grad_norm": 0.0242919921875, + "learning_rate": 0.00939023002502858, + "loss": 0.7913, + "num_input_tokens_seen": 51439224, + "step": 88600 + }, + { + "epoch": 13.197050938337801, + "grad_norm": 0.046630859375, + "learning_rate": 0.009388421896326549, + "loss": 0.7982, + "num_input_tokens_seen": 51442168, + "step": 88605 + }, + { + "epoch": 13.19779565087876, + "grad_norm": 0.043212890625, + "learning_rate": 0.009386613862426422, + "loss": 0.782, + "num_input_tokens_seen": 51445048, + "step": 88610 + }, + { + "epoch": 13.19854036341972, + "grad_norm": 0.0546875, + "learning_rate": 0.009384805923358754, + "loss": 0.7923, + "num_input_tokens_seen": 51447896, + "step": 88615 + }, + { + "epoch": 13.19928507596068, + "grad_norm": 0.041748046875, + "learning_rate": 0.009382998079154077, + "loss": 0.7887, + "num_input_tokens_seen": 51450904, + "step": 88620 + }, + { + "epoch": 13.200029788501638, + "grad_norm": 0.03759765625, + "learning_rate": 0.009381190329842942, + "loss": 0.7844, + "num_input_tokens_seen": 51453784, + "step": 88625 + }, + { + "epoch": 13.200774501042597, + "grad_norm": 0.06005859375, + "learning_rate": 0.009379382675455886, + "loss": 0.8112, + "num_input_tokens_seen": 51456600, + "step": 88630 + }, + { + "epoch": 13.201519213583557, + "grad_norm": 0.053466796875, + "learning_rate": 0.00937757511602344, + "loss": 0.79, + "num_input_tokens_seen": 51459352, + "step": 88635 + }, + { + "epoch": 13.202263926124516, + "grad_norm": 0.032470703125, + "learning_rate": 0.00937576765157616, + "loss": 0.7902, + "num_input_tokens_seen": 51462424, + "step": 88640 + }, + { + "epoch": 13.203008638665475, + "grad_norm": 0.062255859375, + "learning_rate": 0.00937396028214456, + "loss": 0.7989, + "num_input_tokens_seen": 51464984, + "step": 88645 + }, + { + "epoch": 13.203753351206434, + "grad_norm": 0.061767578125, + "learning_rate": 0.009372153007759189, + "loss": 0.8012, + "num_input_tokens_seen": 51467896, + "step": 88650 + }, + { + "epoch": 13.204498063747394, + "grad_norm": 0.03759765625, + "learning_rate": 0.009370345828450567, + "loss": 0.8009, + "num_input_tokens_seen": 51470648, + "step": 88655 + }, + { + "epoch": 13.205242776288353, + "grad_norm": 0.0390625, + "learning_rate": 0.009368538744249234, + "loss": 0.8669, + "num_input_tokens_seen": 51473976, + "step": 88660 + }, + { + "epoch": 13.205987488829312, + "grad_norm": 0.03759765625, + "learning_rate": 0.009366731755185717, + "loss": 0.8084, + "num_input_tokens_seen": 51476920, + "step": 88665 + }, + { + "epoch": 13.20673220137027, + "grad_norm": 0.037841796875, + "learning_rate": 0.009364924861290536, + "loss": 0.8066, + "num_input_tokens_seen": 51479736, + "step": 88670 + }, + { + "epoch": 13.207476913911231, + "grad_norm": 0.0185546875, + "learning_rate": 0.009363118062594229, + "loss": 0.7878, + "num_input_tokens_seen": 51482392, + "step": 88675 + }, + { + "epoch": 13.20822162645219, + "grad_norm": 0.0439453125, + "learning_rate": 0.009361311359127308, + "loss": 0.7755, + "num_input_tokens_seen": 51485208, + "step": 88680 + }, + { + "epoch": 13.208966338993148, + "grad_norm": 0.0240478515625, + "learning_rate": 0.009359504750920304, + "loss": 0.8095, + "num_input_tokens_seen": 51488120, + "step": 88685 + }, + { + "epoch": 13.209711051534107, + "grad_norm": 0.06396484375, + "learning_rate": 0.00935769823800373, + "loss": 0.8092, + "num_input_tokens_seen": 51490968, + "step": 88690 + }, + { + "epoch": 13.210455764075068, + "grad_norm": 0.040283203125, + "learning_rate": 0.009355891820408116, + "loss": 0.8027, + "num_input_tokens_seen": 51493848, + "step": 88695 + }, + { + "epoch": 13.211200476616026, + "grad_norm": 0.034912109375, + "learning_rate": 0.009354085498163972, + "loss": 0.8074, + "num_input_tokens_seen": 51496888, + "step": 88700 + }, + { + "epoch": 13.211945189156985, + "grad_norm": 0.023681640625, + "learning_rate": 0.009352279271301809, + "loss": 0.7875, + "num_input_tokens_seen": 51499864, + "step": 88705 + }, + { + "epoch": 13.212689901697944, + "grad_norm": 0.031494140625, + "learning_rate": 0.009350473139852156, + "loss": 0.7815, + "num_input_tokens_seen": 51502872, + "step": 88710 + }, + { + "epoch": 13.213434614238905, + "grad_norm": 0.06396484375, + "learning_rate": 0.009348667103845512, + "loss": 0.807, + "num_input_tokens_seen": 51505528, + "step": 88715 + }, + { + "epoch": 13.214179326779863, + "grad_norm": 0.0478515625, + "learning_rate": 0.009346861163312396, + "loss": 0.7974, + "num_input_tokens_seen": 51508440, + "step": 88720 + }, + { + "epoch": 13.214924039320822, + "grad_norm": 0.0341796875, + "learning_rate": 0.00934505531828331, + "loss": 0.7831, + "num_input_tokens_seen": 51511384, + "step": 88725 + }, + { + "epoch": 13.21566875186178, + "grad_norm": 0.052978515625, + "learning_rate": 0.009343249568788774, + "loss": 0.8216, + "num_input_tokens_seen": 51514008, + "step": 88730 + }, + { + "epoch": 13.216413464402741, + "grad_norm": 0.0419921875, + "learning_rate": 0.009341443914859282, + "loss": 0.803, + "num_input_tokens_seen": 51516920, + "step": 88735 + }, + { + "epoch": 13.2171581769437, + "grad_norm": 0.042236328125, + "learning_rate": 0.009339638356525347, + "loss": 0.7847, + "num_input_tokens_seen": 51520504, + "step": 88740 + }, + { + "epoch": 13.217902889484659, + "grad_norm": 0.043701171875, + "learning_rate": 0.009337832893817472, + "loss": 0.8153, + "num_input_tokens_seen": 51523512, + "step": 88745 + }, + { + "epoch": 13.218647602025618, + "grad_norm": 0.040771484375, + "learning_rate": 0.00933602752676615, + "loss": 0.8086, + "num_input_tokens_seen": 51526264, + "step": 88750 + }, + { + "epoch": 13.219392314566578, + "grad_norm": 0.042236328125, + "learning_rate": 0.00933422225540189, + "loss": 0.8225, + "num_input_tokens_seen": 51529528, + "step": 88755 + }, + { + "epoch": 13.220137027107537, + "grad_norm": 0.043701171875, + "learning_rate": 0.00933241707975518, + "loss": 0.8265, + "num_input_tokens_seen": 51532056, + "step": 88760 + }, + { + "epoch": 13.220881739648496, + "grad_norm": 0.049560546875, + "learning_rate": 0.00933061199985653, + "loss": 0.7902, + "num_input_tokens_seen": 51534776, + "step": 88765 + }, + { + "epoch": 13.221626452189454, + "grad_norm": 0.0301513671875, + "learning_rate": 0.009328807015736424, + "loss": 0.8013, + "num_input_tokens_seen": 51537592, + "step": 88770 + }, + { + "epoch": 13.222371164730415, + "grad_norm": 0.032470703125, + "learning_rate": 0.009327002127425365, + "loss": 0.7894, + "num_input_tokens_seen": 51540248, + "step": 88775 + }, + { + "epoch": 13.223115877271374, + "grad_norm": 0.0311279296875, + "learning_rate": 0.009325197334953836, + "loss": 0.7918, + "num_input_tokens_seen": 51542968, + "step": 88780 + }, + { + "epoch": 13.223860589812332, + "grad_norm": 0.0703125, + "learning_rate": 0.009323392638352329, + "loss": 0.7905, + "num_input_tokens_seen": 51545688, + "step": 88785 + }, + { + "epoch": 13.224605302353291, + "grad_norm": 0.13671875, + "learning_rate": 0.009321588037651337, + "loss": 0.8362, + "num_input_tokens_seen": 51548568, + "step": 88790 + }, + { + "epoch": 13.22535001489425, + "grad_norm": 0.03515625, + "learning_rate": 0.00931978353288134, + "loss": 0.7918, + "num_input_tokens_seen": 51551288, + "step": 88795 + }, + { + "epoch": 13.22609472743521, + "grad_norm": 0.032470703125, + "learning_rate": 0.009317979124072829, + "loss": 0.8091, + "num_input_tokens_seen": 51554200, + "step": 88800 + }, + { + "epoch": 13.22683943997617, + "grad_norm": 0.049072265625, + "learning_rate": 0.009316174811256283, + "loss": 0.8083, + "num_input_tokens_seen": 51557048, + "step": 88805 + }, + { + "epoch": 13.227584152517128, + "grad_norm": 0.021240234375, + "learning_rate": 0.00931437059446219, + "loss": 0.8028, + "num_input_tokens_seen": 51559960, + "step": 88810 + }, + { + "epoch": 13.228328865058087, + "grad_norm": 0.034423828125, + "learning_rate": 0.009312566473721027, + "loss": 0.7789, + "num_input_tokens_seen": 51562840, + "step": 88815 + }, + { + "epoch": 13.229073577599047, + "grad_norm": 0.041015625, + "learning_rate": 0.009310762449063275, + "loss": 0.798, + "num_input_tokens_seen": 51565976, + "step": 88820 + }, + { + "epoch": 13.229818290140006, + "grad_norm": 0.031982421875, + "learning_rate": 0.00930895852051941, + "loss": 0.8066, + "num_input_tokens_seen": 51568760, + "step": 88825 + }, + { + "epoch": 13.230563002680965, + "grad_norm": 0.06005859375, + "learning_rate": 0.009307154688119901, + "loss": 0.773, + "num_input_tokens_seen": 51571608, + "step": 88830 + }, + { + "epoch": 13.231307715221924, + "grad_norm": 0.0308837890625, + "learning_rate": 0.009305350951895231, + "loss": 0.7854, + "num_input_tokens_seen": 51574520, + "step": 88835 + }, + { + "epoch": 13.232052427762884, + "grad_norm": 0.04052734375, + "learning_rate": 0.009303547311875864, + "loss": 0.8061, + "num_input_tokens_seen": 51577496, + "step": 88840 + }, + { + "epoch": 13.232797140303843, + "grad_norm": 0.04833984375, + "learning_rate": 0.009301743768092284, + "loss": 0.8069, + "num_input_tokens_seen": 51580344, + "step": 88845 + }, + { + "epoch": 13.233541852844802, + "grad_norm": 0.030029296875, + "learning_rate": 0.009299940320574945, + "loss": 0.7716, + "num_input_tokens_seen": 51583352, + "step": 88850 + }, + { + "epoch": 13.23428656538576, + "grad_norm": 0.0322265625, + "learning_rate": 0.009298136969354323, + "loss": 0.7796, + "num_input_tokens_seen": 51586008, + "step": 88855 + }, + { + "epoch": 13.235031277926721, + "grad_norm": 0.03515625, + "learning_rate": 0.00929633371446088, + "loss": 0.7979, + "num_input_tokens_seen": 51588760, + "step": 88860 + }, + { + "epoch": 13.23577599046768, + "grad_norm": 0.03369140625, + "learning_rate": 0.009294530555925086, + "loss": 0.7932, + "num_input_tokens_seen": 51591416, + "step": 88865 + }, + { + "epoch": 13.236520703008638, + "grad_norm": 0.037353515625, + "learning_rate": 0.009292727493777398, + "loss": 0.7938, + "num_input_tokens_seen": 51594552, + "step": 88870 + }, + { + "epoch": 13.237265415549597, + "grad_norm": 0.037109375, + "learning_rate": 0.009290924528048274, + "loss": 0.7997, + "num_input_tokens_seen": 51597528, + "step": 88875 + }, + { + "epoch": 13.238010128090558, + "grad_norm": 0.042236328125, + "learning_rate": 0.009289121658768185, + "loss": 0.8022, + "num_input_tokens_seen": 51600472, + "step": 88880 + }, + { + "epoch": 13.238754840631517, + "grad_norm": 0.046630859375, + "learning_rate": 0.009287318885967578, + "loss": 0.7868, + "num_input_tokens_seen": 51603224, + "step": 88885 + }, + { + "epoch": 13.239499553172475, + "grad_norm": 0.040283203125, + "learning_rate": 0.009285516209676914, + "loss": 0.8396, + "num_input_tokens_seen": 51606200, + "step": 88890 + }, + { + "epoch": 13.240244265713434, + "grad_norm": 0.0247802734375, + "learning_rate": 0.009283713629926638, + "loss": 0.7788, + "num_input_tokens_seen": 51609112, + "step": 88895 + }, + { + "epoch": 13.240988978254395, + "grad_norm": 0.0390625, + "learning_rate": 0.009281911146747219, + "loss": 0.7941, + "num_input_tokens_seen": 51612248, + "step": 88900 + }, + { + "epoch": 13.241733690795353, + "grad_norm": 0.03173828125, + "learning_rate": 0.0092801087601691, + "loss": 0.7896, + "num_input_tokens_seen": 51614840, + "step": 88905 + }, + { + "epoch": 13.242478403336312, + "grad_norm": 0.047119140625, + "learning_rate": 0.009278306470222724, + "loss": 0.7831, + "num_input_tokens_seen": 51617592, + "step": 88910 + }, + { + "epoch": 13.24322311587727, + "grad_norm": 0.041748046875, + "learning_rate": 0.009276504276938546, + "loss": 0.7731, + "num_input_tokens_seen": 51620600, + "step": 88915 + }, + { + "epoch": 13.243967828418231, + "grad_norm": 0.03759765625, + "learning_rate": 0.009274702180347014, + "loss": 0.8044, + "num_input_tokens_seen": 51623768, + "step": 88920 + }, + { + "epoch": 13.24471254095919, + "grad_norm": 0.033447265625, + "learning_rate": 0.009272900180478572, + "loss": 0.781, + "num_input_tokens_seen": 51626808, + "step": 88925 + }, + { + "epoch": 13.245457253500149, + "grad_norm": 0.02392578125, + "learning_rate": 0.009271098277363651, + "loss": 0.8014, + "num_input_tokens_seen": 51629880, + "step": 88930 + }, + { + "epoch": 13.246201966041108, + "grad_norm": 0.03955078125, + "learning_rate": 0.009269296471032711, + "loss": 0.8151, + "num_input_tokens_seen": 51632696, + "step": 88935 + }, + { + "epoch": 13.246946678582066, + "grad_norm": 0.06201171875, + "learning_rate": 0.00926749476151618, + "loss": 0.8112, + "num_input_tokens_seen": 51635288, + "step": 88940 + }, + { + "epoch": 13.247691391123027, + "grad_norm": 0.049072265625, + "learning_rate": 0.0092656931488445, + "loss": 0.8464, + "num_input_tokens_seen": 51638008, + "step": 88945 + }, + { + "epoch": 13.248436103663986, + "grad_norm": 0.04443359375, + "learning_rate": 0.009263891633048105, + "loss": 0.8117, + "num_input_tokens_seen": 51640824, + "step": 88950 + }, + { + "epoch": 13.249180816204944, + "grad_norm": 0.03271484375, + "learning_rate": 0.00926209021415743, + "loss": 0.8122, + "num_input_tokens_seen": 51643640, + "step": 88955 + }, + { + "epoch": 13.249925528745903, + "grad_norm": 0.040283203125, + "learning_rate": 0.009260288892202912, + "loss": 0.8002, + "num_input_tokens_seen": 51646296, + "step": 88960 + }, + { + "epoch": 13.250670241286864, + "grad_norm": 0.039794921875, + "learning_rate": 0.009258487667214974, + "loss": 0.7882, + "num_input_tokens_seen": 51649016, + "step": 88965 + }, + { + "epoch": 13.251414953827823, + "grad_norm": 0.0211181640625, + "learning_rate": 0.009256686539224056, + "loss": 0.7799, + "num_input_tokens_seen": 51652216, + "step": 88970 + }, + { + "epoch": 13.252159666368781, + "grad_norm": 0.021728515625, + "learning_rate": 0.00925488550826058, + "loss": 0.7896, + "num_input_tokens_seen": 51654904, + "step": 88975 + }, + { + "epoch": 13.25290437890974, + "grad_norm": 0.047607421875, + "learning_rate": 0.009253084574354978, + "loss": 0.8123, + "num_input_tokens_seen": 51657592, + "step": 88980 + }, + { + "epoch": 13.2536490914507, + "grad_norm": 0.11767578125, + "learning_rate": 0.00925128373753767, + "loss": 0.8447, + "num_input_tokens_seen": 51660376, + "step": 88985 + }, + { + "epoch": 13.25439380399166, + "grad_norm": 0.051025390625, + "learning_rate": 0.009249482997839075, + "loss": 0.8005, + "num_input_tokens_seen": 51663480, + "step": 88990 + }, + { + "epoch": 13.255138516532618, + "grad_norm": 0.0498046875, + "learning_rate": 0.009247682355289626, + "loss": 0.7902, + "num_input_tokens_seen": 51666264, + "step": 88995 + }, + { + "epoch": 13.255883229073577, + "grad_norm": 0.031982421875, + "learning_rate": 0.009245881809919731, + "loss": 0.7917, + "num_input_tokens_seen": 51668984, + "step": 89000 + }, + { + "epoch": 13.256627941614537, + "grad_norm": 0.028564453125, + "learning_rate": 0.00924408136175982, + "loss": 0.7916, + "num_input_tokens_seen": 51671832, + "step": 89005 + }, + { + "epoch": 13.257372654155496, + "grad_norm": 0.04248046875, + "learning_rate": 0.009242281010840304, + "loss": 0.7804, + "num_input_tokens_seen": 51674744, + "step": 89010 + }, + { + "epoch": 13.258117366696455, + "grad_norm": 0.036865234375, + "learning_rate": 0.009240480757191598, + "loss": 0.8202, + "num_input_tokens_seen": 51677624, + "step": 89015 + }, + { + "epoch": 13.258862079237414, + "grad_norm": 0.03271484375, + "learning_rate": 0.009238680600844113, + "loss": 0.791, + "num_input_tokens_seen": 51680664, + "step": 89020 + }, + { + "epoch": 13.259606791778374, + "grad_norm": 0.03173828125, + "learning_rate": 0.009236880541828267, + "loss": 0.7969, + "num_input_tokens_seen": 51683512, + "step": 89025 + }, + { + "epoch": 13.260351504319333, + "grad_norm": 0.034423828125, + "learning_rate": 0.009235080580174469, + "loss": 0.7968, + "num_input_tokens_seen": 51686680, + "step": 89030 + }, + { + "epoch": 13.261096216860292, + "grad_norm": 0.03173828125, + "learning_rate": 0.009233280715913119, + "loss": 0.8106, + "num_input_tokens_seen": 51689592, + "step": 89035 + }, + { + "epoch": 13.26184092940125, + "grad_norm": 0.038330078125, + "learning_rate": 0.009231480949074635, + "loss": 0.8049, + "num_input_tokens_seen": 51692696, + "step": 89040 + }, + { + "epoch": 13.262585641942211, + "grad_norm": 0.044677734375, + "learning_rate": 0.009229681279689416, + "loss": 0.7872, + "num_input_tokens_seen": 51695480, + "step": 89045 + }, + { + "epoch": 13.26333035448317, + "grad_norm": 0.03125, + "learning_rate": 0.009227881707787869, + "loss": 0.8134, + "num_input_tokens_seen": 51698328, + "step": 89050 + }, + { + "epoch": 13.264075067024129, + "grad_norm": 0.03759765625, + "learning_rate": 0.00922608223340039, + "loss": 0.7919, + "num_input_tokens_seen": 51701208, + "step": 89055 + }, + { + "epoch": 13.264819779565087, + "grad_norm": 0.0341796875, + "learning_rate": 0.009224282856557386, + "loss": 0.8068, + "num_input_tokens_seen": 51704056, + "step": 89060 + }, + { + "epoch": 13.265564492106048, + "grad_norm": 0.0439453125, + "learning_rate": 0.009222483577289255, + "loss": 0.7796, + "num_input_tokens_seen": 51706840, + "step": 89065 + }, + { + "epoch": 13.266309204647007, + "grad_norm": 0.02978515625, + "learning_rate": 0.009220684395626384, + "loss": 0.7792, + "num_input_tokens_seen": 51709848, + "step": 89070 + }, + { + "epoch": 13.267053917187965, + "grad_norm": 0.023193359375, + "learning_rate": 0.009218885311599184, + "loss": 0.7975, + "num_input_tokens_seen": 51712888, + "step": 89075 + }, + { + "epoch": 13.267798629728924, + "grad_norm": 0.032470703125, + "learning_rate": 0.009217086325238039, + "loss": 0.8187, + "num_input_tokens_seen": 51715832, + "step": 89080 + }, + { + "epoch": 13.268543342269885, + "grad_norm": 0.037841796875, + "learning_rate": 0.009215287436573344, + "loss": 0.8119, + "num_input_tokens_seen": 51718776, + "step": 89085 + }, + { + "epoch": 13.269288054810843, + "grad_norm": 0.057373046875, + "learning_rate": 0.009213488645635483, + "loss": 0.7905, + "num_input_tokens_seen": 51721656, + "step": 89090 + }, + { + "epoch": 13.270032767351802, + "grad_norm": 0.0274658203125, + "learning_rate": 0.009211689952454857, + "loss": 0.7942, + "num_input_tokens_seen": 51724248, + "step": 89095 + }, + { + "epoch": 13.270777479892761, + "grad_norm": 0.0225830078125, + "learning_rate": 0.009209891357061838, + "loss": 0.7922, + "num_input_tokens_seen": 51727128, + "step": 89100 + }, + { + "epoch": 13.271522192433721, + "grad_norm": 0.045166015625, + "learning_rate": 0.00920809285948683, + "loss": 0.782, + "num_input_tokens_seen": 51730104, + "step": 89105 + }, + { + "epoch": 13.27226690497468, + "grad_norm": 0.05078125, + "learning_rate": 0.009206294459760204, + "loss": 0.8047, + "num_input_tokens_seen": 51732856, + "step": 89110 + }, + { + "epoch": 13.273011617515639, + "grad_norm": 0.042236328125, + "learning_rate": 0.00920449615791234, + "loss": 0.7864, + "num_input_tokens_seen": 51735896, + "step": 89115 + }, + { + "epoch": 13.273756330056598, + "grad_norm": 0.034423828125, + "learning_rate": 0.009202697953973629, + "loss": 0.8157, + "num_input_tokens_seen": 51738872, + "step": 89120 + }, + { + "epoch": 13.274501042597556, + "grad_norm": 0.032958984375, + "learning_rate": 0.009200899847974436, + "loss": 0.8247, + "num_input_tokens_seen": 51741528, + "step": 89125 + }, + { + "epoch": 13.275245755138517, + "grad_norm": 0.0238037109375, + "learning_rate": 0.009199101839945153, + "loss": 0.7957, + "num_input_tokens_seen": 51744568, + "step": 89130 + }, + { + "epoch": 13.275990467679476, + "grad_norm": 0.034423828125, + "learning_rate": 0.009197303929916145, + "loss": 0.7931, + "num_input_tokens_seen": 51747608, + "step": 89135 + }, + { + "epoch": 13.276735180220435, + "grad_norm": 0.038818359375, + "learning_rate": 0.009195506117917791, + "loss": 0.7877, + "num_input_tokens_seen": 51750840, + "step": 89140 + }, + { + "epoch": 13.277479892761393, + "grad_norm": 0.045654296875, + "learning_rate": 0.009193708403980464, + "loss": 0.7743, + "num_input_tokens_seen": 51754040, + "step": 89145 + }, + { + "epoch": 13.278224605302354, + "grad_norm": 0.03515625, + "learning_rate": 0.009191910788134527, + "loss": 0.7773, + "num_input_tokens_seen": 51756856, + "step": 89150 + }, + { + "epoch": 13.278969317843313, + "grad_norm": 0.03564453125, + "learning_rate": 0.00919011327041036, + "loss": 0.7983, + "num_input_tokens_seen": 51759992, + "step": 89155 + }, + { + "epoch": 13.279714030384271, + "grad_norm": 0.0303955078125, + "learning_rate": 0.009188315850838319, + "loss": 0.7933, + "num_input_tokens_seen": 51762680, + "step": 89160 + }, + { + "epoch": 13.28045874292523, + "grad_norm": 0.047607421875, + "learning_rate": 0.00918651852944878, + "loss": 0.8084, + "num_input_tokens_seen": 51765624, + "step": 89165 + }, + { + "epoch": 13.28120345546619, + "grad_norm": 0.044677734375, + "learning_rate": 0.009184721306272096, + "loss": 0.7847, + "num_input_tokens_seen": 51768792, + "step": 89170 + }, + { + "epoch": 13.28194816800715, + "grad_norm": 0.0206298828125, + "learning_rate": 0.009182924181338639, + "loss": 0.7849, + "num_input_tokens_seen": 51771480, + "step": 89175 + }, + { + "epoch": 13.282692880548108, + "grad_norm": 0.035888671875, + "learning_rate": 0.009181127154678764, + "loss": 0.7885, + "num_input_tokens_seen": 51774552, + "step": 89180 + }, + { + "epoch": 13.283437593089067, + "grad_norm": 0.037109375, + "learning_rate": 0.009179330226322837, + "loss": 0.7912, + "num_input_tokens_seen": 51777528, + "step": 89185 + }, + { + "epoch": 13.284182305630027, + "grad_norm": 0.0218505859375, + "learning_rate": 0.009177533396301205, + "loss": 0.823, + "num_input_tokens_seen": 51780504, + "step": 89190 + }, + { + "epoch": 13.284927018170986, + "grad_norm": 0.027587890625, + "learning_rate": 0.009175736664644227, + "loss": 0.7876, + "num_input_tokens_seen": 51783416, + "step": 89195 + }, + { + "epoch": 13.285671730711945, + "grad_norm": 0.034423828125, + "learning_rate": 0.009173940031382263, + "loss": 0.7907, + "num_input_tokens_seen": 51786360, + "step": 89200 + }, + { + "epoch": 13.286416443252904, + "grad_norm": 0.033935546875, + "learning_rate": 0.009172143496545654, + "loss": 0.8039, + "num_input_tokens_seen": 51789048, + "step": 89205 + }, + { + "epoch": 13.287161155793864, + "grad_norm": 0.037353515625, + "learning_rate": 0.009170347060164766, + "loss": 0.7849, + "num_input_tokens_seen": 51791992, + "step": 89210 + }, + { + "epoch": 13.287905868334823, + "grad_norm": 0.0322265625, + "learning_rate": 0.009168550722269934, + "loss": 0.7637, + "num_input_tokens_seen": 51794968, + "step": 89215 + }, + { + "epoch": 13.288650580875782, + "grad_norm": 0.0458984375, + "learning_rate": 0.009166754482891516, + "loss": 0.7834, + "num_input_tokens_seen": 51797720, + "step": 89220 + }, + { + "epoch": 13.28939529341674, + "grad_norm": 0.046142578125, + "learning_rate": 0.00916495834205985, + "loss": 0.792, + "num_input_tokens_seen": 51800472, + "step": 89225 + }, + { + "epoch": 13.290140005957701, + "grad_norm": 0.023681640625, + "learning_rate": 0.009163162299805279, + "loss": 0.802, + "num_input_tokens_seen": 51803448, + "step": 89230 + }, + { + "epoch": 13.29088471849866, + "grad_norm": 0.0439453125, + "learning_rate": 0.00916136635615815, + "loss": 0.7985, + "num_input_tokens_seen": 51806616, + "step": 89235 + }, + { + "epoch": 13.291629431039619, + "grad_norm": 0.030029296875, + "learning_rate": 0.0091595705111488, + "loss": 0.8068, + "num_input_tokens_seen": 51809432, + "step": 89240 + }, + { + "epoch": 13.292374143580577, + "grad_norm": 0.01953125, + "learning_rate": 0.009157774764807577, + "loss": 0.807, + "num_input_tokens_seen": 51812376, + "step": 89245 + }, + { + "epoch": 13.293118856121538, + "grad_norm": 0.0390625, + "learning_rate": 0.009155979117164805, + "loss": 0.7904, + "num_input_tokens_seen": 51815256, + "step": 89250 + }, + { + "epoch": 13.293863568662497, + "grad_norm": 0.039306640625, + "learning_rate": 0.00915418356825083, + "loss": 0.7773, + "num_input_tokens_seen": 51817912, + "step": 89255 + }, + { + "epoch": 13.294608281203455, + "grad_norm": 0.0311279296875, + "learning_rate": 0.00915238811809598, + "loss": 0.7849, + "num_input_tokens_seen": 51820568, + "step": 89260 + }, + { + "epoch": 13.295352993744414, + "grad_norm": 0.038818359375, + "learning_rate": 0.009150592766730591, + "loss": 0.8084, + "num_input_tokens_seen": 51823576, + "step": 89265 + }, + { + "epoch": 13.296097706285375, + "grad_norm": 0.020263671875, + "learning_rate": 0.009148797514184997, + "loss": 0.7808, + "num_input_tokens_seen": 51826552, + "step": 89270 + }, + { + "epoch": 13.296842418826333, + "grad_norm": 0.04443359375, + "learning_rate": 0.009147002360489515, + "loss": 0.8069, + "num_input_tokens_seen": 51829304, + "step": 89275 + }, + { + "epoch": 13.297587131367292, + "grad_norm": 0.03271484375, + "learning_rate": 0.00914520730567448, + "loss": 0.8222, + "num_input_tokens_seen": 51832184, + "step": 89280 + }, + { + "epoch": 13.298331843908251, + "grad_norm": 0.041015625, + "learning_rate": 0.009143412349770218, + "loss": 0.7994, + "num_input_tokens_seen": 51834744, + "step": 89285 + }, + { + "epoch": 13.299076556449211, + "grad_norm": 0.041748046875, + "learning_rate": 0.009141617492807053, + "loss": 0.7965, + "num_input_tokens_seen": 51837720, + "step": 89290 + }, + { + "epoch": 13.29982126899017, + "grad_norm": 0.038818359375, + "learning_rate": 0.009139822734815303, + "loss": 0.8106, + "num_input_tokens_seen": 51840664, + "step": 89295 + }, + { + "epoch": 13.300565981531129, + "grad_norm": 0.043212890625, + "learning_rate": 0.009138028075825297, + "loss": 0.7849, + "num_input_tokens_seen": 51843448, + "step": 89300 + }, + { + "epoch": 13.301310694072088, + "grad_norm": 0.02880859375, + "learning_rate": 0.00913623351586735, + "loss": 0.7828, + "num_input_tokens_seen": 51846584, + "step": 89305 + }, + { + "epoch": 13.302055406613047, + "grad_norm": 0.0673828125, + "learning_rate": 0.009134439054971771, + "loss": 0.8181, + "num_input_tokens_seen": 51849624, + "step": 89310 + }, + { + "epoch": 13.302800119154007, + "grad_norm": 0.03662109375, + "learning_rate": 0.00913264469316889, + "loss": 0.7877, + "num_input_tokens_seen": 51852632, + "step": 89315 + }, + { + "epoch": 13.303544831694966, + "grad_norm": 0.0238037109375, + "learning_rate": 0.009130850430489005, + "loss": 0.7823, + "num_input_tokens_seen": 51855672, + "step": 89320 + }, + { + "epoch": 13.304289544235925, + "grad_norm": 0.033203125, + "learning_rate": 0.009129056266962443, + "loss": 0.7845, + "num_input_tokens_seen": 51858456, + "step": 89325 + }, + { + "epoch": 13.305034256776883, + "grad_norm": 0.053466796875, + "learning_rate": 0.009127262202619503, + "loss": 0.787, + "num_input_tokens_seen": 51861176, + "step": 89330 + }, + { + "epoch": 13.305778969317844, + "grad_norm": 0.04736328125, + "learning_rate": 0.009125468237490506, + "loss": 0.7907, + "num_input_tokens_seen": 51863832, + "step": 89335 + }, + { + "epoch": 13.306523681858803, + "grad_norm": 0.037841796875, + "learning_rate": 0.009123674371605748, + "loss": 0.7926, + "num_input_tokens_seen": 51866872, + "step": 89340 + }, + { + "epoch": 13.307268394399761, + "grad_norm": 0.0458984375, + "learning_rate": 0.009121880604995544, + "loss": 0.794, + "num_input_tokens_seen": 51869976, + "step": 89345 + }, + { + "epoch": 13.30801310694072, + "grad_norm": 0.036376953125, + "learning_rate": 0.00912008693769019, + "loss": 0.8117, + "num_input_tokens_seen": 51872856, + "step": 89350 + }, + { + "epoch": 13.30875781948168, + "grad_norm": 0.04638671875, + "learning_rate": 0.009118293369719989, + "loss": 0.7885, + "num_input_tokens_seen": 51875832, + "step": 89355 + }, + { + "epoch": 13.30950253202264, + "grad_norm": 0.08740234375, + "learning_rate": 0.009116499901115249, + "loss": 0.7805, + "num_input_tokens_seen": 51878680, + "step": 89360 + }, + { + "epoch": 13.310247244563598, + "grad_norm": 0.0242919921875, + "learning_rate": 0.009114706531906255, + "loss": 0.7832, + "num_input_tokens_seen": 51881752, + "step": 89365 + }, + { + "epoch": 13.310991957104557, + "grad_norm": 0.0380859375, + "learning_rate": 0.00911291326212332, + "loss": 0.8092, + "num_input_tokens_seen": 51884760, + "step": 89370 + }, + { + "epoch": 13.311736669645517, + "grad_norm": 0.035888671875, + "learning_rate": 0.009111120091796729, + "loss": 0.7771, + "num_input_tokens_seen": 51887544, + "step": 89375 + }, + { + "epoch": 13.312481382186476, + "grad_norm": 0.051513671875, + "learning_rate": 0.00910932702095678, + "loss": 0.7807, + "num_input_tokens_seen": 51890424, + "step": 89380 + }, + { + "epoch": 13.313226094727435, + "grad_norm": 0.032958984375, + "learning_rate": 0.00910753404963376, + "loss": 0.8088, + "num_input_tokens_seen": 51893464, + "step": 89385 + }, + { + "epoch": 13.313970807268394, + "grad_norm": 0.039306640625, + "learning_rate": 0.009105741177857968, + "loss": 0.8035, + "num_input_tokens_seen": 51896600, + "step": 89390 + }, + { + "epoch": 13.314715519809354, + "grad_norm": 0.035888671875, + "learning_rate": 0.009103948405659689, + "loss": 0.8088, + "num_input_tokens_seen": 51899384, + "step": 89395 + }, + { + "epoch": 13.315460232350313, + "grad_norm": 0.033203125, + "learning_rate": 0.009102155733069205, + "loss": 0.7996, + "num_input_tokens_seen": 51902136, + "step": 89400 + }, + { + "epoch": 13.316204944891272, + "grad_norm": 0.047119140625, + "learning_rate": 0.009100363160116809, + "loss": 0.7979, + "num_input_tokens_seen": 51905016, + "step": 89405 + }, + { + "epoch": 13.31694965743223, + "grad_norm": 0.03271484375, + "learning_rate": 0.00909857068683278, + "loss": 0.8443, + "num_input_tokens_seen": 51907576, + "step": 89410 + }, + { + "epoch": 13.317694369973191, + "grad_norm": 0.04345703125, + "learning_rate": 0.009096778313247404, + "loss": 0.7975, + "num_input_tokens_seen": 51910264, + "step": 89415 + }, + { + "epoch": 13.31843908251415, + "grad_norm": 0.044189453125, + "learning_rate": 0.009094986039390952, + "loss": 0.7974, + "num_input_tokens_seen": 51913208, + "step": 89420 + }, + { + "epoch": 13.319183795055109, + "grad_norm": 0.02978515625, + "learning_rate": 0.009093193865293716, + "loss": 0.807, + "num_input_tokens_seen": 51916120, + "step": 89425 + }, + { + "epoch": 13.319928507596067, + "grad_norm": 0.03515625, + "learning_rate": 0.009091401790985965, + "loss": 0.8121, + "num_input_tokens_seen": 51919704, + "step": 89430 + }, + { + "epoch": 13.320673220137028, + "grad_norm": 0.037109375, + "learning_rate": 0.00908960981649797, + "loss": 0.8071, + "num_input_tokens_seen": 51922456, + "step": 89435 + }, + { + "epoch": 13.321417932677987, + "grad_norm": 0.03271484375, + "learning_rate": 0.009087817941860017, + "loss": 0.7697, + "num_input_tokens_seen": 51925496, + "step": 89440 + }, + { + "epoch": 13.322162645218945, + "grad_norm": 0.03662109375, + "learning_rate": 0.009086026167102367, + "loss": 0.8002, + "num_input_tokens_seen": 51928216, + "step": 89445 + }, + { + "epoch": 13.322907357759904, + "grad_norm": 0.03125, + "learning_rate": 0.009084234492255297, + "loss": 0.7907, + "num_input_tokens_seen": 51931160, + "step": 89450 + }, + { + "epoch": 13.323652070300863, + "grad_norm": 0.056640625, + "learning_rate": 0.009082442917349068, + "loss": 0.8314, + "num_input_tokens_seen": 51934040, + "step": 89455 + }, + { + "epoch": 13.324396782841823, + "grad_norm": 0.04833984375, + "learning_rate": 0.009080651442413955, + "loss": 0.7986, + "num_input_tokens_seen": 51937272, + "step": 89460 + }, + { + "epoch": 13.325141495382782, + "grad_norm": 0.021484375, + "learning_rate": 0.009078860067480217, + "loss": 0.801, + "num_input_tokens_seen": 51939992, + "step": 89465 + }, + { + "epoch": 13.325886207923741, + "grad_norm": 0.0301513671875, + "learning_rate": 0.009077068792578124, + "loss": 0.7955, + "num_input_tokens_seen": 51943096, + "step": 89470 + }, + { + "epoch": 13.3266309204647, + "grad_norm": 0.0228271484375, + "learning_rate": 0.009075277617737935, + "loss": 0.8039, + "num_input_tokens_seen": 51945976, + "step": 89475 + }, + { + "epoch": 13.32737563300566, + "grad_norm": 0.03759765625, + "learning_rate": 0.009073486542989907, + "loss": 0.7914, + "num_input_tokens_seen": 51948856, + "step": 89480 + }, + { + "epoch": 13.328120345546619, + "grad_norm": 0.030517578125, + "learning_rate": 0.009071695568364301, + "loss": 0.8086, + "num_input_tokens_seen": 51951800, + "step": 89485 + }, + { + "epoch": 13.328865058087578, + "grad_norm": 0.049072265625, + "learning_rate": 0.009069904693891368, + "loss": 0.8016, + "num_input_tokens_seen": 51954648, + "step": 89490 + }, + { + "epoch": 13.329609770628537, + "grad_norm": 0.038330078125, + "learning_rate": 0.009068113919601373, + "loss": 0.7819, + "num_input_tokens_seen": 51957496, + "step": 89495 + }, + { + "epoch": 13.330354483169497, + "grad_norm": 0.0218505859375, + "learning_rate": 0.00906632324552456, + "loss": 0.788, + "num_input_tokens_seen": 51960472, + "step": 89500 + }, + { + "epoch": 13.331099195710456, + "grad_norm": 0.039306640625, + "learning_rate": 0.00906453267169119, + "loss": 0.7893, + "num_input_tokens_seen": 51963288, + "step": 89505 + }, + { + "epoch": 13.331843908251415, + "grad_norm": 0.03955078125, + "learning_rate": 0.009062742198131512, + "loss": 0.7908, + "num_input_tokens_seen": 51965976, + "step": 89510 + }, + { + "epoch": 13.332588620792373, + "grad_norm": 0.04150390625, + "learning_rate": 0.009060951824875763, + "loss": 0.7945, + "num_input_tokens_seen": 51969112, + "step": 89515 + }, + { + "epoch": 13.333333333333334, + "grad_norm": 0.039306640625, + "learning_rate": 0.0090591615519542, + "loss": 0.7848, + "num_input_tokens_seen": 51971896, + "step": 89520 + }, + { + "epoch": 13.334078045874293, + "grad_norm": 0.040283203125, + "learning_rate": 0.00905737137939706, + "loss": 0.8178, + "num_input_tokens_seen": 51974712, + "step": 89525 + }, + { + "epoch": 13.334822758415251, + "grad_norm": 0.031982421875, + "learning_rate": 0.009055581307234597, + "loss": 0.8076, + "num_input_tokens_seen": 51977848, + "step": 89530 + }, + { + "epoch": 13.33556747095621, + "grad_norm": 0.0206298828125, + "learning_rate": 0.009053791335497038, + "loss": 0.7894, + "num_input_tokens_seen": 51980792, + "step": 89535 + }, + { + "epoch": 13.33631218349717, + "grad_norm": 0.031494140625, + "learning_rate": 0.009052001464214638, + "loss": 0.7974, + "num_input_tokens_seen": 51983544, + "step": 89540 + }, + { + "epoch": 13.33705689603813, + "grad_norm": 0.052490234375, + "learning_rate": 0.009050211693417624, + "loss": 0.8029, + "num_input_tokens_seen": 51986520, + "step": 89545 + }, + { + "epoch": 13.337801608579088, + "grad_norm": 0.04833984375, + "learning_rate": 0.009048422023136242, + "loss": 0.7972, + "num_input_tokens_seen": 51989272, + "step": 89550 + }, + { + "epoch": 13.338546321120047, + "grad_norm": 0.046875, + "learning_rate": 0.00904663245340072, + "loss": 0.8066, + "num_input_tokens_seen": 51992312, + "step": 89555 + }, + { + "epoch": 13.339291033661008, + "grad_norm": 0.037353515625, + "learning_rate": 0.009044842984241285, + "loss": 0.815, + "num_input_tokens_seen": 51995224, + "step": 89560 + }, + { + "epoch": 13.340035746201966, + "grad_norm": 0.042236328125, + "learning_rate": 0.009043053615688182, + "loss": 0.7866, + "num_input_tokens_seen": 51997880, + "step": 89565 + }, + { + "epoch": 13.340780458742925, + "grad_norm": 0.040771484375, + "learning_rate": 0.009041264347771626, + "loss": 0.8024, + "num_input_tokens_seen": 52000696, + "step": 89570 + }, + { + "epoch": 13.341525171283884, + "grad_norm": 0.06494140625, + "learning_rate": 0.009039475180521861, + "loss": 0.8096, + "num_input_tokens_seen": 52003576, + "step": 89575 + }, + { + "epoch": 13.342269883824844, + "grad_norm": 0.0419921875, + "learning_rate": 0.0090376861139691, + "loss": 0.7948, + "num_input_tokens_seen": 52006232, + "step": 89580 + }, + { + "epoch": 13.343014596365803, + "grad_norm": 0.03857421875, + "learning_rate": 0.009035897148143577, + "loss": 0.7866, + "num_input_tokens_seen": 52009016, + "step": 89585 + }, + { + "epoch": 13.343759308906762, + "grad_norm": 0.02099609375, + "learning_rate": 0.00903410828307551, + "loss": 0.7934, + "num_input_tokens_seen": 52011896, + "step": 89590 + }, + { + "epoch": 13.34450402144772, + "grad_norm": 0.040771484375, + "learning_rate": 0.009032319518795113, + "loss": 0.7839, + "num_input_tokens_seen": 52014872, + "step": 89595 + }, + { + "epoch": 13.345248733988681, + "grad_norm": 0.048583984375, + "learning_rate": 0.009030530855332617, + "loss": 0.7948, + "num_input_tokens_seen": 52017848, + "step": 89600 + }, + { + "epoch": 13.34599344652964, + "grad_norm": 0.0211181640625, + "learning_rate": 0.009028742292718234, + "loss": 0.7864, + "num_input_tokens_seen": 52020664, + "step": 89605 + }, + { + "epoch": 13.346738159070599, + "grad_norm": 0.03662109375, + "learning_rate": 0.009026953830982183, + "loss": 0.7872, + "num_input_tokens_seen": 52023448, + "step": 89610 + }, + { + "epoch": 13.347482871611557, + "grad_norm": 0.0419921875, + "learning_rate": 0.009025165470154674, + "loss": 0.7959, + "num_input_tokens_seen": 52026616, + "step": 89615 + }, + { + "epoch": 13.348227584152518, + "grad_norm": 0.0439453125, + "learning_rate": 0.009023377210265925, + "loss": 0.8303, + "num_input_tokens_seen": 52029464, + "step": 89620 + }, + { + "epoch": 13.348972296693477, + "grad_norm": 0.0302734375, + "learning_rate": 0.009021589051346136, + "loss": 0.8106, + "num_input_tokens_seen": 52032632, + "step": 89625 + }, + { + "epoch": 13.349717009234435, + "grad_norm": 0.03271484375, + "learning_rate": 0.009019800993425533, + "loss": 0.775, + "num_input_tokens_seen": 52035288, + "step": 89630 + }, + { + "epoch": 13.350461721775394, + "grad_norm": 0.052978515625, + "learning_rate": 0.009018013036534312, + "loss": 0.8003, + "num_input_tokens_seen": 52037976, + "step": 89635 + }, + { + "epoch": 13.351206434316353, + "grad_norm": 0.0419921875, + "learning_rate": 0.00901622518070268, + "loss": 0.787, + "num_input_tokens_seen": 52041048, + "step": 89640 + }, + { + "epoch": 13.351951146857314, + "grad_norm": 0.043212890625, + "learning_rate": 0.009014437425960843, + "loss": 0.7981, + "num_input_tokens_seen": 52043704, + "step": 89645 + }, + { + "epoch": 13.352695859398272, + "grad_norm": 0.03369140625, + "learning_rate": 0.009012649772338998, + "loss": 0.7826, + "num_input_tokens_seen": 52046552, + "step": 89650 + }, + { + "epoch": 13.353440571939231, + "grad_norm": 0.03369140625, + "learning_rate": 0.009010862219867353, + "loss": 0.8026, + "num_input_tokens_seen": 52049144, + "step": 89655 + }, + { + "epoch": 13.35418528448019, + "grad_norm": 0.034423828125, + "learning_rate": 0.009009074768576101, + "loss": 0.8046, + "num_input_tokens_seen": 52052056, + "step": 89660 + }, + { + "epoch": 13.35492999702115, + "grad_norm": 0.033203125, + "learning_rate": 0.009007287418495444, + "loss": 0.7929, + "num_input_tokens_seen": 52054872, + "step": 89665 + }, + { + "epoch": 13.35567470956211, + "grad_norm": 0.021728515625, + "learning_rate": 0.009005500169655579, + "loss": 0.8296, + "num_input_tokens_seen": 52057912, + "step": 89670 + }, + { + "epoch": 13.356419422103068, + "grad_norm": 0.03173828125, + "learning_rate": 0.00900371302208669, + "loss": 0.7972, + "num_input_tokens_seen": 52060632, + "step": 89675 + }, + { + "epoch": 13.357164134644027, + "grad_norm": 0.021240234375, + "learning_rate": 0.00900192597581898, + "loss": 0.7905, + "num_input_tokens_seen": 52063256, + "step": 89680 + }, + { + "epoch": 13.357908847184987, + "grad_norm": 0.037353515625, + "learning_rate": 0.009000139030882629, + "loss": 0.7953, + "num_input_tokens_seen": 52066104, + "step": 89685 + }, + { + "epoch": 13.358653559725946, + "grad_norm": 0.041015625, + "learning_rate": 0.008998352187307837, + "loss": 0.8392, + "num_input_tokens_seen": 52068920, + "step": 89690 + }, + { + "epoch": 13.359398272266905, + "grad_norm": 0.0205078125, + "learning_rate": 0.008996565445124777, + "loss": 0.8097, + "num_input_tokens_seen": 52071832, + "step": 89695 + }, + { + "epoch": 13.360142984807863, + "grad_norm": 0.038818359375, + "learning_rate": 0.00899477880436365, + "loss": 0.7935, + "num_input_tokens_seen": 52075000, + "step": 89700 + }, + { + "epoch": 13.360887697348824, + "grad_norm": 0.044677734375, + "learning_rate": 0.008992992265054628, + "loss": 0.8026, + "num_input_tokens_seen": 52077592, + "step": 89705 + }, + { + "epoch": 13.361632409889783, + "grad_norm": 0.0498046875, + "learning_rate": 0.0089912058272279, + "loss": 0.8093, + "num_input_tokens_seen": 52080536, + "step": 89710 + }, + { + "epoch": 13.362377122430741, + "grad_norm": 0.033203125, + "learning_rate": 0.008989419490913643, + "loss": 0.7779, + "num_input_tokens_seen": 52083704, + "step": 89715 + }, + { + "epoch": 13.3631218349717, + "grad_norm": 0.04150390625, + "learning_rate": 0.008987633256142026, + "loss": 0.7882, + "num_input_tokens_seen": 52086872, + "step": 89720 + }, + { + "epoch": 13.36386654751266, + "grad_norm": 0.0272216796875, + "learning_rate": 0.008985847122943244, + "loss": 0.8068, + "num_input_tokens_seen": 52089816, + "step": 89725 + }, + { + "epoch": 13.36461126005362, + "grad_norm": 0.03466796875, + "learning_rate": 0.008984061091347454, + "loss": 0.7965, + "num_input_tokens_seen": 52092664, + "step": 89730 + }, + { + "epoch": 13.365355972594578, + "grad_norm": 0.021728515625, + "learning_rate": 0.008982275161384846, + "loss": 0.7967, + "num_input_tokens_seen": 52095640, + "step": 89735 + }, + { + "epoch": 13.366100685135537, + "grad_norm": 0.042724609375, + "learning_rate": 0.008980489333085578, + "loss": 0.8061, + "num_input_tokens_seen": 52098360, + "step": 89740 + }, + { + "epoch": 13.366845397676498, + "grad_norm": 0.03369140625, + "learning_rate": 0.008978703606479826, + "loss": 0.7829, + "num_input_tokens_seen": 52101400, + "step": 89745 + }, + { + "epoch": 13.367590110217456, + "grad_norm": 0.0322265625, + "learning_rate": 0.00897691798159776, + "loss": 0.8181, + "num_input_tokens_seen": 52104504, + "step": 89750 + }, + { + "epoch": 13.368334822758415, + "grad_norm": 0.076171875, + "learning_rate": 0.008975132458469537, + "loss": 0.8386, + "num_input_tokens_seen": 52108536, + "step": 89755 + }, + { + "epoch": 13.369079535299374, + "grad_norm": 0.0225830078125, + "learning_rate": 0.00897334703712533, + "loss": 0.8044, + "num_input_tokens_seen": 52111288, + "step": 89760 + }, + { + "epoch": 13.369824247840334, + "grad_norm": 0.037109375, + "learning_rate": 0.008971561717595299, + "loss": 0.8214, + "num_input_tokens_seen": 52115224, + "step": 89765 + }, + { + "epoch": 13.370568960381293, + "grad_norm": 0.037109375, + "learning_rate": 0.008969776499909609, + "loss": 0.7814, + "num_input_tokens_seen": 52118104, + "step": 89770 + }, + { + "epoch": 13.371313672922252, + "grad_norm": 0.05078125, + "learning_rate": 0.008967991384098413, + "loss": 0.7952, + "num_input_tokens_seen": 52120888, + "step": 89775 + }, + { + "epoch": 13.37205838546321, + "grad_norm": 0.0302734375, + "learning_rate": 0.008966206370191873, + "loss": 0.7815, + "num_input_tokens_seen": 52123832, + "step": 89780 + }, + { + "epoch": 13.372803098004171, + "grad_norm": 0.04541015625, + "learning_rate": 0.00896442145822014, + "loss": 0.7989, + "num_input_tokens_seen": 52126584, + "step": 89785 + }, + { + "epoch": 13.37354781054513, + "grad_norm": 0.04443359375, + "learning_rate": 0.00896263664821338, + "loss": 0.7924, + "num_input_tokens_seen": 52129560, + "step": 89790 + }, + { + "epoch": 13.374292523086089, + "grad_norm": 0.046630859375, + "learning_rate": 0.008960851940201735, + "loss": 0.803, + "num_input_tokens_seen": 52132280, + "step": 89795 + }, + { + "epoch": 13.375037235627047, + "grad_norm": 0.044921875, + "learning_rate": 0.008959067334215354, + "loss": 0.7872, + "num_input_tokens_seen": 52136056, + "step": 89800 + }, + { + "epoch": 13.375781948168008, + "grad_norm": 0.031494140625, + "learning_rate": 0.008957282830284395, + "loss": 0.7884, + "num_input_tokens_seen": 52139128, + "step": 89805 + }, + { + "epoch": 13.376526660708967, + "grad_norm": 0.020751953125, + "learning_rate": 0.008955498428439, + "loss": 0.8154, + "num_input_tokens_seen": 52141848, + "step": 89810 + }, + { + "epoch": 13.377271373249926, + "grad_norm": 0.048095703125, + "learning_rate": 0.008953714128709319, + "loss": 0.8087, + "num_input_tokens_seen": 52144472, + "step": 89815 + }, + { + "epoch": 13.378016085790884, + "grad_norm": 0.023193359375, + "learning_rate": 0.008951929931125483, + "loss": 0.79, + "num_input_tokens_seen": 52147384, + "step": 89820 + }, + { + "epoch": 13.378760798331843, + "grad_norm": 0.038330078125, + "learning_rate": 0.008950145835717654, + "loss": 0.7808, + "num_input_tokens_seen": 52150136, + "step": 89825 + }, + { + "epoch": 13.379505510872804, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00894836184251596, + "loss": 0.8024, + "num_input_tokens_seen": 52153048, + "step": 89830 + }, + { + "epoch": 13.380250223413762, + "grad_norm": 0.034912109375, + "learning_rate": 0.008946577951550539, + "loss": 0.7947, + "num_input_tokens_seen": 52155992, + "step": 89835 + }, + { + "epoch": 13.380994935954721, + "grad_norm": 0.0322265625, + "learning_rate": 0.008944794162851533, + "loss": 0.8114, + "num_input_tokens_seen": 52158808, + "step": 89840 + }, + { + "epoch": 13.38173964849568, + "grad_norm": 0.0419921875, + "learning_rate": 0.008943010476449074, + "loss": 0.7926, + "num_input_tokens_seen": 52161528, + "step": 89845 + }, + { + "epoch": 13.38248436103664, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0089412268923733, + "loss": 0.8041, + "num_input_tokens_seen": 52164280, + "step": 89850 + }, + { + "epoch": 13.3832290735776, + "grad_norm": 0.0301513671875, + "learning_rate": 0.008939443410654335, + "loss": 0.7935, + "num_input_tokens_seen": 52167480, + "step": 89855 + }, + { + "epoch": 13.383973786118558, + "grad_norm": 0.0301513671875, + "learning_rate": 0.008937660031322318, + "loss": 0.7885, + "num_input_tokens_seen": 52170296, + "step": 89860 + }, + { + "epoch": 13.384718498659517, + "grad_norm": 0.048828125, + "learning_rate": 0.008935876754407369, + "loss": 0.7801, + "num_input_tokens_seen": 52173464, + "step": 89865 + }, + { + "epoch": 13.385463211200477, + "grad_norm": 0.039306640625, + "learning_rate": 0.008934093579939623, + "loss": 0.8254, + "num_input_tokens_seen": 52176248, + "step": 89870 + }, + { + "epoch": 13.386207923741436, + "grad_norm": 0.037109375, + "learning_rate": 0.0089323105079492, + "loss": 0.797, + "num_input_tokens_seen": 52179096, + "step": 89875 + }, + { + "epoch": 13.386952636282395, + "grad_norm": 0.03662109375, + "learning_rate": 0.008930527538466225, + "loss": 0.813, + "num_input_tokens_seen": 52182104, + "step": 89880 + }, + { + "epoch": 13.387697348823353, + "grad_norm": 0.031494140625, + "learning_rate": 0.008928744671520819, + "loss": 0.8191, + "num_input_tokens_seen": 52184888, + "step": 89885 + }, + { + "epoch": 13.388442061364314, + "grad_norm": 0.037841796875, + "learning_rate": 0.008926961907143097, + "loss": 0.791, + "num_input_tokens_seen": 52187960, + "step": 89890 + }, + { + "epoch": 13.389186773905273, + "grad_norm": 0.04443359375, + "learning_rate": 0.008925179245363185, + "loss": 0.8077, + "num_input_tokens_seen": 52190904, + "step": 89895 + }, + { + "epoch": 13.389931486446232, + "grad_norm": 0.04052734375, + "learning_rate": 0.00892339668621119, + "loss": 0.8139, + "num_input_tokens_seen": 52193880, + "step": 89900 + }, + { + "epoch": 13.39067619898719, + "grad_norm": 0.1357421875, + "learning_rate": 0.008921614229717238, + "loss": 0.8309, + "num_input_tokens_seen": 52196760, + "step": 89905 + }, + { + "epoch": 13.39142091152815, + "grad_norm": 0.04296875, + "learning_rate": 0.008919831875911435, + "loss": 0.7883, + "num_input_tokens_seen": 52199832, + "step": 89910 + }, + { + "epoch": 13.39216562406911, + "grad_norm": 0.0322265625, + "learning_rate": 0.008918049624823893, + "loss": 0.8375, + "num_input_tokens_seen": 52202488, + "step": 89915 + }, + { + "epoch": 13.392910336610068, + "grad_norm": 0.04638671875, + "learning_rate": 0.008916267476484722, + "loss": 0.7896, + "num_input_tokens_seen": 52205304, + "step": 89920 + }, + { + "epoch": 13.393655049151027, + "grad_norm": 0.11474609375, + "learning_rate": 0.008914485430924024, + "loss": 0.8292, + "num_input_tokens_seen": 52207960, + "step": 89925 + }, + { + "epoch": 13.394399761691988, + "grad_norm": 0.046630859375, + "learning_rate": 0.008912703488171914, + "loss": 0.7849, + "num_input_tokens_seen": 52210904, + "step": 89930 + }, + { + "epoch": 13.395144474232946, + "grad_norm": 0.044189453125, + "learning_rate": 0.008910921648258489, + "loss": 0.7907, + "num_input_tokens_seen": 52213688, + "step": 89935 + }, + { + "epoch": 13.395889186773905, + "grad_norm": 0.042724609375, + "learning_rate": 0.008909139911213854, + "loss": 0.802, + "num_input_tokens_seen": 52216920, + "step": 89940 + }, + { + "epoch": 13.396633899314864, + "grad_norm": 0.03466796875, + "learning_rate": 0.00890735827706811, + "loss": 0.8051, + "num_input_tokens_seen": 52219736, + "step": 89945 + }, + { + "epoch": 13.397378611855824, + "grad_norm": 0.04541015625, + "learning_rate": 0.008905576745851358, + "loss": 0.7923, + "num_input_tokens_seen": 52222776, + "step": 89950 + }, + { + "epoch": 13.398123324396783, + "grad_norm": 0.037841796875, + "learning_rate": 0.008903795317593693, + "loss": 0.7874, + "num_input_tokens_seen": 52225624, + "step": 89955 + }, + { + "epoch": 13.398868036937742, + "grad_norm": 0.039794921875, + "learning_rate": 0.008902013992325205, + "loss": 0.801, + "num_input_tokens_seen": 52228440, + "step": 89960 + }, + { + "epoch": 13.3996127494787, + "grad_norm": 0.028076171875, + "learning_rate": 0.008900232770075999, + "loss": 0.7907, + "num_input_tokens_seen": 52231256, + "step": 89965 + }, + { + "epoch": 13.400357462019661, + "grad_norm": 0.041748046875, + "learning_rate": 0.008898451650876158, + "loss": 0.7997, + "num_input_tokens_seen": 52234072, + "step": 89970 + }, + { + "epoch": 13.40110217456062, + "grad_norm": 0.030517578125, + "learning_rate": 0.008896670634755778, + "loss": 0.8084, + "num_input_tokens_seen": 52237208, + "step": 89975 + }, + { + "epoch": 13.401846887101579, + "grad_norm": 0.0220947265625, + "learning_rate": 0.008894889721744936, + "loss": 0.8111, + "num_input_tokens_seen": 52239960, + "step": 89980 + }, + { + "epoch": 13.402591599642538, + "grad_norm": 0.0242919921875, + "learning_rate": 0.008893108911873737, + "loss": 0.7994, + "num_input_tokens_seen": 52242744, + "step": 89985 + }, + { + "epoch": 13.403336312183498, + "grad_norm": 0.020751953125, + "learning_rate": 0.008891328205172246, + "loss": 0.7906, + "num_input_tokens_seen": 52245528, + "step": 89990 + }, + { + "epoch": 13.404081024724457, + "grad_norm": 0.03564453125, + "learning_rate": 0.008889547601670565, + "loss": 0.7992, + "num_input_tokens_seen": 52248248, + "step": 89995 + }, + { + "epoch": 13.404825737265416, + "grad_norm": 0.0272216796875, + "learning_rate": 0.008887767101398767, + "loss": 0.7994, + "num_input_tokens_seen": 52251256, + "step": 90000 + }, + { + "epoch": 13.405570449806374, + "grad_norm": 0.0537109375, + "learning_rate": 0.008885986704386927, + "loss": 0.7918, + "num_input_tokens_seen": 52254328, + "step": 90005 + }, + { + "epoch": 13.406315162347333, + "grad_norm": 0.04638671875, + "learning_rate": 0.008884206410665134, + "loss": 0.7996, + "num_input_tokens_seen": 52257528, + "step": 90010 + }, + { + "epoch": 13.407059874888294, + "grad_norm": 0.032958984375, + "learning_rate": 0.008882426220263449, + "loss": 0.8088, + "num_input_tokens_seen": 52260760, + "step": 90015 + }, + { + "epoch": 13.407804587429252, + "grad_norm": 0.040771484375, + "learning_rate": 0.008880646133211961, + "loss": 0.8156, + "num_input_tokens_seen": 52263640, + "step": 90020 + }, + { + "epoch": 13.408549299970211, + "grad_norm": 0.042236328125, + "learning_rate": 0.008878866149540733, + "loss": 0.7971, + "num_input_tokens_seen": 52266648, + "step": 90025 + }, + { + "epoch": 13.40929401251117, + "grad_norm": 0.037109375, + "learning_rate": 0.008877086269279844, + "loss": 0.8134, + "num_input_tokens_seen": 52269720, + "step": 90030 + }, + { + "epoch": 13.41003872505213, + "grad_norm": 0.046875, + "learning_rate": 0.008875306492459363, + "loss": 0.7828, + "num_input_tokens_seen": 52272728, + "step": 90035 + }, + { + "epoch": 13.41078343759309, + "grad_norm": 0.033447265625, + "learning_rate": 0.008873526819109348, + "loss": 0.8104, + "num_input_tokens_seen": 52275480, + "step": 90040 + }, + { + "epoch": 13.411528150134048, + "grad_norm": 0.0203857421875, + "learning_rate": 0.008871747249259874, + "loss": 0.7993, + "num_input_tokens_seen": 52278520, + "step": 90045 + }, + { + "epoch": 13.412272862675007, + "grad_norm": 0.0206298828125, + "learning_rate": 0.008869967782940996, + "loss": 0.7992, + "num_input_tokens_seen": 52281112, + "step": 90050 + }, + { + "epoch": 13.413017575215967, + "grad_norm": 0.035400390625, + "learning_rate": 0.008868188420182789, + "loss": 0.7906, + "num_input_tokens_seen": 52284184, + "step": 90055 + }, + { + "epoch": 13.413762287756926, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0088664091610153, + "loss": 0.7948, + "num_input_tokens_seen": 52287096, + "step": 90060 + }, + { + "epoch": 13.414507000297885, + "grad_norm": 0.034912109375, + "learning_rate": 0.0088646300054686, + "loss": 0.7995, + "num_input_tokens_seen": 52290008, + "step": 90065 + }, + { + "epoch": 13.415251712838844, + "grad_norm": 0.0303955078125, + "learning_rate": 0.008862850953572737, + "loss": 0.7956, + "num_input_tokens_seen": 52292792, + "step": 90070 + }, + { + "epoch": 13.415996425379804, + "grad_norm": 0.038330078125, + "learning_rate": 0.008861072005357773, + "loss": 0.8014, + "num_input_tokens_seen": 52295768, + "step": 90075 + }, + { + "epoch": 13.416741137920763, + "grad_norm": 0.03466796875, + "learning_rate": 0.00885929316085376, + "loss": 0.8059, + "num_input_tokens_seen": 52298456, + "step": 90080 + }, + { + "epoch": 13.417485850461722, + "grad_norm": 0.022216796875, + "learning_rate": 0.008857514420090739, + "loss": 0.8079, + "num_input_tokens_seen": 52301592, + "step": 90085 + }, + { + "epoch": 13.41823056300268, + "grad_norm": 0.039306640625, + "learning_rate": 0.008855735783098775, + "loss": 0.8103, + "num_input_tokens_seen": 52304344, + "step": 90090 + }, + { + "epoch": 13.418975275543641, + "grad_norm": 0.033203125, + "learning_rate": 0.008853957249907904, + "loss": 0.796, + "num_input_tokens_seen": 52307352, + "step": 90095 + }, + { + "epoch": 13.4197199880846, + "grad_norm": 0.0380859375, + "learning_rate": 0.008852178820548185, + "loss": 0.7933, + "num_input_tokens_seen": 52310104, + "step": 90100 + }, + { + "epoch": 13.420464700625558, + "grad_norm": 0.03369140625, + "learning_rate": 0.008850400495049657, + "loss": 0.7926, + "num_input_tokens_seen": 52313240, + "step": 90105 + }, + { + "epoch": 13.421209413166517, + "grad_norm": 0.044921875, + "learning_rate": 0.008848622273442363, + "loss": 0.7996, + "num_input_tokens_seen": 52316088, + "step": 90110 + }, + { + "epoch": 13.421954125707478, + "grad_norm": 0.0322265625, + "learning_rate": 0.008846844155756343, + "loss": 0.8127, + "num_input_tokens_seen": 52318904, + "step": 90115 + }, + { + "epoch": 13.422698838248436, + "grad_norm": 0.03369140625, + "learning_rate": 0.008845066142021631, + "loss": 0.7936, + "num_input_tokens_seen": 52321912, + "step": 90120 + }, + { + "epoch": 13.423443550789395, + "grad_norm": 0.040771484375, + "learning_rate": 0.008843288232268277, + "loss": 0.7999, + "num_input_tokens_seen": 52324408, + "step": 90125 + }, + { + "epoch": 13.424188263330354, + "grad_norm": 0.044189453125, + "learning_rate": 0.008841510426526308, + "loss": 0.7991, + "num_input_tokens_seen": 52327512, + "step": 90130 + }, + { + "epoch": 13.424932975871315, + "grad_norm": 0.046875, + "learning_rate": 0.008839732724825763, + "loss": 0.8001, + "num_input_tokens_seen": 52330392, + "step": 90135 + }, + { + "epoch": 13.425677688412273, + "grad_norm": 0.0439453125, + "learning_rate": 0.008837955127196673, + "loss": 0.7955, + "num_input_tokens_seen": 52333240, + "step": 90140 + }, + { + "epoch": 13.426422400953232, + "grad_norm": 0.0341796875, + "learning_rate": 0.00883617763366907, + "loss": 0.8014, + "num_input_tokens_seen": 52336472, + "step": 90145 + }, + { + "epoch": 13.42716711349419, + "grad_norm": 0.039794921875, + "learning_rate": 0.008834400244272974, + "loss": 0.7939, + "num_input_tokens_seen": 52339480, + "step": 90150 + }, + { + "epoch": 13.42791182603515, + "grad_norm": 0.0296630859375, + "learning_rate": 0.008832622959038425, + "loss": 0.8051, + "num_input_tokens_seen": 52342200, + "step": 90155 + }, + { + "epoch": 13.42865653857611, + "grad_norm": 0.031982421875, + "learning_rate": 0.008830845777995446, + "loss": 0.7998, + "num_input_tokens_seen": 52345336, + "step": 90160 + }, + { + "epoch": 13.429401251117069, + "grad_norm": 0.032958984375, + "learning_rate": 0.00882906870117405, + "loss": 0.8018, + "num_input_tokens_seen": 52348152, + "step": 90165 + }, + { + "epoch": 13.430145963658028, + "grad_norm": 0.0311279296875, + "learning_rate": 0.00882729172860427, + "loss": 0.7856, + "num_input_tokens_seen": 52351096, + "step": 90170 + }, + { + "epoch": 13.430890676198986, + "grad_norm": 0.03857421875, + "learning_rate": 0.008825514860316123, + "loss": 0.7973, + "num_input_tokens_seen": 52354104, + "step": 90175 + }, + { + "epoch": 13.431635388739947, + "grad_norm": 0.039794921875, + "learning_rate": 0.008823738096339628, + "loss": 0.7961, + "num_input_tokens_seen": 52356984, + "step": 90180 + }, + { + "epoch": 13.432380101280906, + "grad_norm": 0.038330078125, + "learning_rate": 0.008821961436704797, + "loss": 0.7835, + "num_input_tokens_seen": 52360248, + "step": 90185 + }, + { + "epoch": 13.433124813821864, + "grad_norm": 0.04296875, + "learning_rate": 0.00882018488144165, + "loss": 0.8061, + "num_input_tokens_seen": 52363288, + "step": 90190 + }, + { + "epoch": 13.433869526362823, + "grad_norm": 0.044189453125, + "learning_rate": 0.008818408430580203, + "loss": 0.7979, + "num_input_tokens_seen": 52366168, + "step": 90195 + }, + { + "epoch": 13.434614238903784, + "grad_norm": 0.044921875, + "learning_rate": 0.008816632084150455, + "loss": 0.8001, + "num_input_tokens_seen": 52369240, + "step": 90200 + }, + { + "epoch": 13.435358951444742, + "grad_norm": 0.03662109375, + "learning_rate": 0.008814855842182428, + "loss": 0.8114, + "num_input_tokens_seen": 52371832, + "step": 90205 + }, + { + "epoch": 13.436103663985701, + "grad_norm": 0.03759765625, + "learning_rate": 0.008813079704706125, + "loss": 0.7985, + "num_input_tokens_seen": 52374936, + "step": 90210 + }, + { + "epoch": 13.43684837652666, + "grad_norm": 0.06787109375, + "learning_rate": 0.008811303671751553, + "loss": 0.8089, + "num_input_tokens_seen": 52377720, + "step": 90215 + }, + { + "epoch": 13.43759308906762, + "grad_norm": 0.032958984375, + "learning_rate": 0.008809527743348712, + "loss": 0.7915, + "num_input_tokens_seen": 52380696, + "step": 90220 + }, + { + "epoch": 13.43833780160858, + "grad_norm": 0.031494140625, + "learning_rate": 0.008807751919527615, + "loss": 0.7929, + "num_input_tokens_seen": 52383704, + "step": 90225 + }, + { + "epoch": 13.439082514149538, + "grad_norm": 0.02587890625, + "learning_rate": 0.008805976200318245, + "loss": 0.8156, + "num_input_tokens_seen": 52386552, + "step": 90230 + }, + { + "epoch": 13.439827226690497, + "grad_norm": 0.04638671875, + "learning_rate": 0.008804200585750621, + "loss": 0.8002, + "num_input_tokens_seen": 52389528, + "step": 90235 + }, + { + "epoch": 13.440571939231457, + "grad_norm": 0.034423828125, + "learning_rate": 0.00880242507585473, + "loss": 0.7888, + "num_input_tokens_seen": 52392472, + "step": 90240 + }, + { + "epoch": 13.441316651772416, + "grad_norm": 0.0245361328125, + "learning_rate": 0.008800649670660564, + "loss": 0.7997, + "num_input_tokens_seen": 52395128, + "step": 90245 + }, + { + "epoch": 13.442061364313375, + "grad_norm": 0.037109375, + "learning_rate": 0.008798874370198129, + "loss": 0.7967, + "num_input_tokens_seen": 52398040, + "step": 90250 + }, + { + "epoch": 13.442806076854334, + "grad_norm": 0.023681640625, + "learning_rate": 0.008797099174497398, + "loss": 0.8055, + "num_input_tokens_seen": 52400696, + "step": 90255 + }, + { + "epoch": 13.443550789395294, + "grad_norm": 0.036376953125, + "learning_rate": 0.00879532408358838, + "loss": 0.8002, + "num_input_tokens_seen": 52403416, + "step": 90260 + }, + { + "epoch": 13.444295501936253, + "grad_norm": 0.027587890625, + "learning_rate": 0.008793549097501052, + "loss": 0.8, + "num_input_tokens_seen": 52406072, + "step": 90265 + }, + { + "epoch": 13.445040214477212, + "grad_norm": 0.029541015625, + "learning_rate": 0.008791774216265405, + "loss": 0.7848, + "num_input_tokens_seen": 52408824, + "step": 90270 + }, + { + "epoch": 13.44578492701817, + "grad_norm": 0.044189453125, + "learning_rate": 0.008789999439911425, + "loss": 0.7949, + "num_input_tokens_seen": 52411704, + "step": 90275 + }, + { + "epoch": 13.446529639559131, + "grad_norm": 0.04345703125, + "learning_rate": 0.008788224768469091, + "loss": 0.7872, + "num_input_tokens_seen": 52414520, + "step": 90280 + }, + { + "epoch": 13.44727435210009, + "grad_norm": 0.03515625, + "learning_rate": 0.008786450201968388, + "loss": 0.8062, + "num_input_tokens_seen": 52417432, + "step": 90285 + }, + { + "epoch": 13.448019064641048, + "grad_norm": 0.03466796875, + "learning_rate": 0.008784675740439287, + "loss": 0.7997, + "num_input_tokens_seen": 52420312, + "step": 90290 + }, + { + "epoch": 13.448763777182007, + "grad_norm": 0.026611328125, + "learning_rate": 0.00878290138391178, + "loss": 0.8022, + "num_input_tokens_seen": 52423096, + "step": 90295 + }, + { + "epoch": 13.449508489722968, + "grad_norm": 0.044677734375, + "learning_rate": 0.008781127132415831, + "loss": 0.7929, + "num_input_tokens_seen": 52426232, + "step": 90300 + }, + { + "epoch": 13.450253202263927, + "grad_norm": 0.0262451171875, + "learning_rate": 0.008779352985981424, + "loss": 0.8132, + "num_input_tokens_seen": 52428888, + "step": 90305 + }, + { + "epoch": 13.450997914804885, + "grad_norm": 0.033447265625, + "learning_rate": 0.008777578944638516, + "loss": 0.7893, + "num_input_tokens_seen": 52431608, + "step": 90310 + }, + { + "epoch": 13.451742627345844, + "grad_norm": 0.0458984375, + "learning_rate": 0.008775805008417096, + "loss": 0.7968, + "num_input_tokens_seen": 52434744, + "step": 90315 + }, + { + "epoch": 13.452487339886805, + "grad_norm": 0.03955078125, + "learning_rate": 0.008774031177347125, + "loss": 0.7941, + "num_input_tokens_seen": 52437656, + "step": 90320 + }, + { + "epoch": 13.453232052427763, + "grad_norm": 0.042236328125, + "learning_rate": 0.008772257451458562, + "loss": 0.7864, + "num_input_tokens_seen": 52440728, + "step": 90325 + }, + { + "epoch": 13.453976764968722, + "grad_norm": 0.028564453125, + "learning_rate": 0.008770483830781385, + "loss": 0.7991, + "num_input_tokens_seen": 52443640, + "step": 90330 + }, + { + "epoch": 13.45472147750968, + "grad_norm": 0.0220947265625, + "learning_rate": 0.008768710315345552, + "loss": 0.8013, + "num_input_tokens_seen": 52446904, + "step": 90335 + }, + { + "epoch": 13.45546619005064, + "grad_norm": 0.0380859375, + "learning_rate": 0.008766936905181027, + "loss": 0.7864, + "num_input_tokens_seen": 52449720, + "step": 90340 + }, + { + "epoch": 13.4562109025916, + "grad_norm": 0.03173828125, + "learning_rate": 0.00876516360031776, + "loss": 0.7787, + "num_input_tokens_seen": 52452344, + "step": 90345 + }, + { + "epoch": 13.456955615132559, + "grad_norm": 0.040283203125, + "learning_rate": 0.008763390400785727, + "loss": 0.7989, + "num_input_tokens_seen": 52455384, + "step": 90350 + }, + { + "epoch": 13.457700327673518, + "grad_norm": 0.0458984375, + "learning_rate": 0.00876161730661487, + "loss": 0.8073, + "num_input_tokens_seen": 52458168, + "step": 90355 + }, + { + "epoch": 13.458445040214476, + "grad_norm": 0.043212890625, + "learning_rate": 0.008759844317835145, + "loss": 0.7918, + "num_input_tokens_seen": 52461048, + "step": 90360 + }, + { + "epoch": 13.459189752755437, + "grad_norm": 0.033203125, + "learning_rate": 0.008758071434476515, + "loss": 0.7781, + "num_input_tokens_seen": 52464056, + "step": 90365 + }, + { + "epoch": 13.459934465296396, + "grad_norm": 0.095703125, + "learning_rate": 0.008756298656568916, + "loss": 0.852, + "num_input_tokens_seen": 52467256, + "step": 90370 + }, + { + "epoch": 13.460679177837354, + "grad_norm": 0.03173828125, + "learning_rate": 0.008754525984142312, + "loss": 0.7959, + "num_input_tokens_seen": 52470232, + "step": 90375 + }, + { + "epoch": 13.461423890378313, + "grad_norm": 0.047119140625, + "learning_rate": 0.008752753417226639, + "loss": 0.7864, + "num_input_tokens_seen": 52473496, + "step": 90380 + }, + { + "epoch": 13.462168602919274, + "grad_norm": 0.03515625, + "learning_rate": 0.008750980955851851, + "loss": 0.7874, + "num_input_tokens_seen": 52476408, + "step": 90385 + }, + { + "epoch": 13.462913315460233, + "grad_norm": 0.034423828125, + "learning_rate": 0.008749208600047891, + "loss": 0.7988, + "num_input_tokens_seen": 52479480, + "step": 90390 + }, + { + "epoch": 13.463658028001191, + "grad_norm": 0.0400390625, + "learning_rate": 0.008747436349844697, + "loss": 0.8098, + "num_input_tokens_seen": 52482296, + "step": 90395 + }, + { + "epoch": 13.46440274054215, + "grad_norm": 0.0439453125, + "learning_rate": 0.008745664205272213, + "loss": 0.824, + "num_input_tokens_seen": 52485400, + "step": 90400 + }, + { + "epoch": 13.46514745308311, + "grad_norm": 0.053466796875, + "learning_rate": 0.008743892166360368, + "loss": 0.7939, + "num_input_tokens_seen": 52488088, + "step": 90405 + }, + { + "epoch": 13.46589216562407, + "grad_norm": 0.031982421875, + "learning_rate": 0.008742120233139113, + "loss": 0.791, + "num_input_tokens_seen": 52491128, + "step": 90410 + }, + { + "epoch": 13.466636878165028, + "grad_norm": 0.0673828125, + "learning_rate": 0.008740348405638369, + "loss": 0.7917, + "num_input_tokens_seen": 52493976, + "step": 90415 + }, + { + "epoch": 13.467381590705987, + "grad_norm": 0.044921875, + "learning_rate": 0.008738576683888085, + "loss": 0.8003, + "num_input_tokens_seen": 52497112, + "step": 90420 + }, + { + "epoch": 13.468126303246947, + "grad_norm": 0.03466796875, + "learning_rate": 0.008736805067918177, + "loss": 0.7865, + "num_input_tokens_seen": 52499992, + "step": 90425 + }, + { + "epoch": 13.468871015787906, + "grad_norm": 0.06640625, + "learning_rate": 0.008735033557758587, + "loss": 0.8474, + "num_input_tokens_seen": 52502872, + "step": 90430 + }, + { + "epoch": 13.469615728328865, + "grad_norm": 0.034423828125, + "learning_rate": 0.008733262153439239, + "loss": 0.7966, + "num_input_tokens_seen": 52506104, + "step": 90435 + }, + { + "epoch": 13.470360440869824, + "grad_norm": 0.044189453125, + "learning_rate": 0.00873149085499005, + "loss": 0.7951, + "num_input_tokens_seen": 52509112, + "step": 90440 + }, + { + "epoch": 13.471105153410784, + "grad_norm": 0.021484375, + "learning_rate": 0.00872971966244096, + "loss": 0.8017, + "num_input_tokens_seen": 52512024, + "step": 90445 + }, + { + "epoch": 13.471849865951743, + "grad_norm": 0.043701171875, + "learning_rate": 0.008727948575821882, + "loss": 0.7996, + "num_input_tokens_seen": 52515256, + "step": 90450 + }, + { + "epoch": 13.472594578492702, + "grad_norm": 0.0208740234375, + "learning_rate": 0.008726177595162733, + "loss": 0.7966, + "num_input_tokens_seen": 52518168, + "step": 90455 + }, + { + "epoch": 13.47333929103366, + "grad_norm": 0.0196533203125, + "learning_rate": 0.008724406720493443, + "loss": 0.7956, + "num_input_tokens_seen": 52521048, + "step": 90460 + }, + { + "epoch": 13.474084003574621, + "grad_norm": 0.0546875, + "learning_rate": 0.008722635951843923, + "loss": 0.7939, + "num_input_tokens_seen": 52523960, + "step": 90465 + }, + { + "epoch": 13.47482871611558, + "grad_norm": 0.04248046875, + "learning_rate": 0.008720865289244081, + "loss": 0.8017, + "num_input_tokens_seen": 52527192, + "step": 90470 + }, + { + "epoch": 13.475573428656539, + "grad_norm": 0.044189453125, + "learning_rate": 0.008719094732723847, + "loss": 0.8059, + "num_input_tokens_seen": 52530104, + "step": 90475 + }, + { + "epoch": 13.476318141197497, + "grad_norm": 0.050537109375, + "learning_rate": 0.008717324282313123, + "loss": 0.8006, + "num_input_tokens_seen": 52533112, + "step": 90480 + }, + { + "epoch": 13.477062853738458, + "grad_norm": 0.0208740234375, + "learning_rate": 0.008715553938041812, + "loss": 0.7931, + "num_input_tokens_seen": 52535768, + "step": 90485 + }, + { + "epoch": 13.477807566279417, + "grad_norm": 0.0458984375, + "learning_rate": 0.00871378369993984, + "loss": 0.7893, + "num_input_tokens_seen": 52538584, + "step": 90490 + }, + { + "epoch": 13.478552278820375, + "grad_norm": 0.054443359375, + "learning_rate": 0.008712013568037092, + "loss": 0.797, + "num_input_tokens_seen": 52541528, + "step": 90495 + }, + { + "epoch": 13.479296991361334, + "grad_norm": 0.046875, + "learning_rate": 0.008710243542363494, + "loss": 0.7881, + "num_input_tokens_seen": 52544504, + "step": 90500 + }, + { + "epoch": 13.480041703902295, + "grad_norm": 0.030029296875, + "learning_rate": 0.008708473622948932, + "loss": 0.7953, + "num_input_tokens_seen": 52547288, + "step": 90505 + }, + { + "epoch": 13.480786416443253, + "grad_norm": 0.044189453125, + "learning_rate": 0.008706703809823317, + "loss": 0.8131, + "num_input_tokens_seen": 52550200, + "step": 90510 + }, + { + "epoch": 13.481531128984212, + "grad_norm": 0.0341796875, + "learning_rate": 0.00870493410301655, + "loss": 0.8021, + "num_input_tokens_seen": 52553368, + "step": 90515 + }, + { + "epoch": 13.482275841525171, + "grad_norm": 0.0341796875, + "learning_rate": 0.008703164502558518, + "loss": 0.7937, + "num_input_tokens_seen": 52556120, + "step": 90520 + }, + { + "epoch": 13.48302055406613, + "grad_norm": 0.049072265625, + "learning_rate": 0.008701395008479126, + "loss": 0.8097, + "num_input_tokens_seen": 52559256, + "step": 90525 + }, + { + "epoch": 13.48376526660709, + "grad_norm": 0.0595703125, + "learning_rate": 0.008699625620808256, + "loss": 0.7784, + "num_input_tokens_seen": 52562328, + "step": 90530 + }, + { + "epoch": 13.484509979148049, + "grad_norm": 0.031005859375, + "learning_rate": 0.008697856339575814, + "loss": 0.7967, + "num_input_tokens_seen": 52565080, + "step": 90535 + }, + { + "epoch": 13.485254691689008, + "grad_norm": 0.044677734375, + "learning_rate": 0.008696087164811676, + "loss": 0.802, + "num_input_tokens_seen": 52567768, + "step": 90540 + }, + { + "epoch": 13.485999404229966, + "grad_norm": 0.0244140625, + "learning_rate": 0.008694318096545747, + "loss": 0.8113, + "num_input_tokens_seen": 52570392, + "step": 90545 + }, + { + "epoch": 13.486744116770927, + "grad_norm": 0.0517578125, + "learning_rate": 0.008692549134807897, + "loss": 0.7818, + "num_input_tokens_seen": 52573176, + "step": 90550 + }, + { + "epoch": 13.487488829311886, + "grad_norm": 0.033935546875, + "learning_rate": 0.008690780279628026, + "loss": 0.8008, + "num_input_tokens_seen": 52575800, + "step": 90555 + }, + { + "epoch": 13.488233541852845, + "grad_norm": 0.044677734375, + "learning_rate": 0.00868901153103601, + "loss": 0.8043, + "num_input_tokens_seen": 52578872, + "step": 90560 + }, + { + "epoch": 13.488978254393803, + "grad_norm": 0.0478515625, + "learning_rate": 0.008687242889061723, + "loss": 0.79, + "num_input_tokens_seen": 52581656, + "step": 90565 + }, + { + "epoch": 13.489722966934764, + "grad_norm": 0.03466796875, + "learning_rate": 0.008685474353735057, + "loss": 0.8051, + "num_input_tokens_seen": 52584632, + "step": 90570 + }, + { + "epoch": 13.490467679475723, + "grad_norm": 0.0211181640625, + "learning_rate": 0.00868370592508588, + "loss": 0.7941, + "num_input_tokens_seen": 52587512, + "step": 90575 + }, + { + "epoch": 13.491212392016681, + "grad_norm": 0.033935546875, + "learning_rate": 0.008681937603144077, + "loss": 0.7885, + "num_input_tokens_seen": 52590488, + "step": 90580 + }, + { + "epoch": 13.49195710455764, + "grad_norm": 0.04931640625, + "learning_rate": 0.008680169387939515, + "loss": 0.7913, + "num_input_tokens_seen": 52593368, + "step": 90585 + }, + { + "epoch": 13.4927018170986, + "grad_norm": 0.06689453125, + "learning_rate": 0.00867840127950207, + "loss": 0.8145, + "num_input_tokens_seen": 52596152, + "step": 90590 + }, + { + "epoch": 13.49344652963956, + "grad_norm": 0.047607421875, + "learning_rate": 0.008676633277861603, + "loss": 0.7953, + "num_input_tokens_seen": 52599064, + "step": 90595 + }, + { + "epoch": 13.494191242180518, + "grad_norm": 0.03466796875, + "learning_rate": 0.008674865383047998, + "loss": 0.8083, + "num_input_tokens_seen": 52602104, + "step": 90600 + }, + { + "epoch": 13.494935954721477, + "grad_norm": 0.048828125, + "learning_rate": 0.008673097595091113, + "loss": 0.789, + "num_input_tokens_seen": 52605208, + "step": 90605 + }, + { + "epoch": 13.495680667262437, + "grad_norm": 0.046630859375, + "learning_rate": 0.008671329914020807, + "loss": 0.7744, + "num_input_tokens_seen": 52608056, + "step": 90610 + }, + { + "epoch": 13.496425379803396, + "grad_norm": 0.032470703125, + "learning_rate": 0.008669562339866956, + "loss": 0.7996, + "num_input_tokens_seen": 52611032, + "step": 90615 + }, + { + "epoch": 13.497170092344355, + "grad_norm": 0.044189453125, + "learning_rate": 0.008667794872659411, + "loss": 0.8105, + "num_input_tokens_seen": 52613944, + "step": 90620 + }, + { + "epoch": 13.497914804885314, + "grad_norm": 0.035400390625, + "learning_rate": 0.00866602751242804, + "loss": 0.7993, + "num_input_tokens_seen": 52617016, + "step": 90625 + }, + { + "epoch": 13.498659517426274, + "grad_norm": 0.043701171875, + "learning_rate": 0.00866426025920269, + "loss": 0.792, + "num_input_tokens_seen": 52619736, + "step": 90630 + }, + { + "epoch": 13.499404229967233, + "grad_norm": 0.06103515625, + "learning_rate": 0.008662493113013231, + "loss": 0.7825, + "num_input_tokens_seen": 52622648, + "step": 90635 + }, + { + "epoch": 13.500148942508192, + "grad_norm": 0.0361328125, + "learning_rate": 0.008660726073889511, + "loss": 0.77, + "num_input_tokens_seen": 52625688, + "step": 90640 + }, + { + "epoch": 13.50089365504915, + "grad_norm": 0.048095703125, + "learning_rate": 0.008658959141861372, + "loss": 0.79, + "num_input_tokens_seen": 52628376, + "step": 90645 + }, + { + "epoch": 13.501638367590111, + "grad_norm": 0.043212890625, + "learning_rate": 0.00865719231695868, + "loss": 0.812, + "num_input_tokens_seen": 52631480, + "step": 90650 + }, + { + "epoch": 13.50238308013107, + "grad_norm": 0.044921875, + "learning_rate": 0.008655425599211279, + "loss": 0.7872, + "num_input_tokens_seen": 52634328, + "step": 90655 + }, + { + "epoch": 13.503127792672029, + "grad_norm": 0.051513671875, + "learning_rate": 0.008653658988649013, + "loss": 0.8165, + "num_input_tokens_seen": 52637144, + "step": 90660 + }, + { + "epoch": 13.503872505212987, + "grad_norm": 0.031494140625, + "learning_rate": 0.008651892485301724, + "loss": 0.7873, + "num_input_tokens_seen": 52640184, + "step": 90665 + }, + { + "epoch": 13.504617217753946, + "grad_norm": 0.049560546875, + "learning_rate": 0.008650126089199265, + "loss": 0.7926, + "num_input_tokens_seen": 52643288, + "step": 90670 + }, + { + "epoch": 13.505361930294907, + "grad_norm": 0.03564453125, + "learning_rate": 0.008648359800371463, + "loss": 0.7767, + "num_input_tokens_seen": 52646232, + "step": 90675 + }, + { + "epoch": 13.506106642835865, + "grad_norm": 0.03515625, + "learning_rate": 0.008646593618848175, + "loss": 0.7762, + "num_input_tokens_seen": 52649336, + "step": 90680 + }, + { + "epoch": 13.506851355376824, + "grad_norm": 0.04736328125, + "learning_rate": 0.00864482754465923, + "loss": 0.7966, + "num_input_tokens_seen": 52651928, + "step": 90685 + }, + { + "epoch": 13.507596067917785, + "grad_norm": 0.134765625, + "learning_rate": 0.008643061577834458, + "loss": 0.8564, + "num_input_tokens_seen": 52654584, + "step": 90690 + }, + { + "epoch": 13.508340780458743, + "grad_norm": 0.032958984375, + "learning_rate": 0.008641295718403708, + "loss": 0.8013, + "num_input_tokens_seen": 52657336, + "step": 90695 + }, + { + "epoch": 13.509085492999702, + "grad_norm": 0.048828125, + "learning_rate": 0.008639529966396796, + "loss": 0.7703, + "num_input_tokens_seen": 52660088, + "step": 90700 + }, + { + "epoch": 13.509830205540661, + "grad_norm": 0.034912109375, + "learning_rate": 0.008637764321843567, + "loss": 0.8143, + "num_input_tokens_seen": 52663288, + "step": 90705 + }, + { + "epoch": 13.51057491808162, + "grad_norm": 0.034423828125, + "learning_rate": 0.008635998784773839, + "loss": 0.7862, + "num_input_tokens_seen": 52666072, + "step": 90710 + }, + { + "epoch": 13.51131963062258, + "grad_norm": 0.045654296875, + "learning_rate": 0.008634233355217447, + "loss": 0.7949, + "num_input_tokens_seen": 52668824, + "step": 90715 + }, + { + "epoch": 13.512064343163539, + "grad_norm": 0.0341796875, + "learning_rate": 0.008632468033204214, + "loss": 0.7934, + "num_input_tokens_seen": 52671416, + "step": 90720 + }, + { + "epoch": 13.512809055704498, + "grad_norm": 0.0341796875, + "learning_rate": 0.008630702818763962, + "loss": 0.7903, + "num_input_tokens_seen": 52674200, + "step": 90725 + }, + { + "epoch": 13.513553768245457, + "grad_norm": 0.035888671875, + "learning_rate": 0.008628937711926512, + "loss": 0.786, + "num_input_tokens_seen": 52677016, + "step": 90730 + }, + { + "epoch": 13.514298480786417, + "grad_norm": 0.07568359375, + "learning_rate": 0.00862717271272168, + "loss": 0.7972, + "num_input_tokens_seen": 52679768, + "step": 90735 + }, + { + "epoch": 13.515043193327376, + "grad_norm": 0.046875, + "learning_rate": 0.008625407821179293, + "loss": 0.7913, + "num_input_tokens_seen": 52682872, + "step": 90740 + }, + { + "epoch": 13.515787905868335, + "grad_norm": 0.046875, + "learning_rate": 0.008623643037329154, + "loss": 0.7828, + "num_input_tokens_seen": 52686072, + "step": 90745 + }, + { + "epoch": 13.516532618409293, + "grad_norm": 0.047607421875, + "learning_rate": 0.008621878361201093, + "loss": 0.7713, + "num_input_tokens_seen": 52688664, + "step": 90750 + }, + { + "epoch": 13.517277330950254, + "grad_norm": 0.055419921875, + "learning_rate": 0.008620113792824909, + "loss": 0.7881, + "num_input_tokens_seen": 52691704, + "step": 90755 + }, + { + "epoch": 13.518022043491213, + "grad_norm": 0.050537109375, + "learning_rate": 0.008618349332230423, + "loss": 0.8002, + "num_input_tokens_seen": 52694648, + "step": 90760 + }, + { + "epoch": 13.518766756032171, + "grad_norm": 0.024658203125, + "learning_rate": 0.008616584979447438, + "loss": 0.8076, + "num_input_tokens_seen": 52697880, + "step": 90765 + }, + { + "epoch": 13.51951146857313, + "grad_norm": 0.056640625, + "learning_rate": 0.008614820734505757, + "loss": 0.7738, + "num_input_tokens_seen": 52700632, + "step": 90770 + }, + { + "epoch": 13.52025618111409, + "grad_norm": 0.0322265625, + "learning_rate": 0.0086130565974352, + "loss": 0.8046, + "num_input_tokens_seen": 52703416, + "step": 90775 + }, + { + "epoch": 13.52100089365505, + "grad_norm": 0.0546875, + "learning_rate": 0.008611292568265554, + "loss": 0.8092, + "num_input_tokens_seen": 52706616, + "step": 90780 + }, + { + "epoch": 13.521745606196008, + "grad_norm": 0.03369140625, + "learning_rate": 0.008609528647026625, + "loss": 0.7834, + "num_input_tokens_seen": 52709400, + "step": 90785 + }, + { + "epoch": 13.522490318736967, + "grad_norm": 0.053466796875, + "learning_rate": 0.008607764833748218, + "loss": 0.7943, + "num_input_tokens_seen": 52712120, + "step": 90790 + }, + { + "epoch": 13.523235031277927, + "grad_norm": 0.03271484375, + "learning_rate": 0.00860600112846013, + "loss": 0.7712, + "num_input_tokens_seen": 52715000, + "step": 90795 + }, + { + "epoch": 13.523979743818886, + "grad_norm": 0.048583984375, + "learning_rate": 0.008604237531192153, + "loss": 0.8012, + "num_input_tokens_seen": 52717880, + "step": 90800 + }, + { + "epoch": 13.524724456359845, + "grad_norm": 0.03271484375, + "learning_rate": 0.008602474041974076, + "loss": 0.7966, + "num_input_tokens_seen": 52720632, + "step": 90805 + }, + { + "epoch": 13.525469168900804, + "grad_norm": 0.049072265625, + "learning_rate": 0.008600710660835705, + "loss": 0.792, + "num_input_tokens_seen": 52723544, + "step": 90810 + }, + { + "epoch": 13.526213881441764, + "grad_norm": 0.047119140625, + "learning_rate": 0.008598947387806817, + "loss": 0.7832, + "num_input_tokens_seen": 52726424, + "step": 90815 + }, + { + "epoch": 13.526958593982723, + "grad_norm": 0.03955078125, + "learning_rate": 0.008597184222917213, + "loss": 0.8021, + "num_input_tokens_seen": 52729048, + "step": 90820 + }, + { + "epoch": 13.527703306523682, + "grad_norm": 0.041015625, + "learning_rate": 0.008595421166196668, + "loss": 0.7757, + "num_input_tokens_seen": 52731896, + "step": 90825 + }, + { + "epoch": 13.52844801906464, + "grad_norm": 0.04736328125, + "learning_rate": 0.008593658217674981, + "loss": 0.8097, + "num_input_tokens_seen": 52734776, + "step": 90830 + }, + { + "epoch": 13.529192731605601, + "grad_norm": 0.027587890625, + "learning_rate": 0.008591895377381918, + "loss": 0.7987, + "num_input_tokens_seen": 52737592, + "step": 90835 + }, + { + "epoch": 13.52993744414656, + "grad_norm": 0.035888671875, + "learning_rate": 0.008590132645347278, + "loss": 0.8007, + "num_input_tokens_seen": 52740632, + "step": 90840 + }, + { + "epoch": 13.530682156687519, + "grad_norm": 0.03662109375, + "learning_rate": 0.008588370021600828, + "loss": 0.809, + "num_input_tokens_seen": 52743864, + "step": 90845 + }, + { + "epoch": 13.531426869228477, + "grad_norm": 0.046142578125, + "learning_rate": 0.008586607506172355, + "loss": 0.821, + "num_input_tokens_seen": 52747288, + "step": 90850 + }, + { + "epoch": 13.532171581769436, + "grad_norm": 0.025146484375, + "learning_rate": 0.00858484509909163, + "loss": 0.7833, + "num_input_tokens_seen": 52750040, + "step": 90855 + }, + { + "epoch": 13.532916294310397, + "grad_norm": 0.046142578125, + "learning_rate": 0.00858308280038842, + "loss": 0.829, + "num_input_tokens_seen": 52752856, + "step": 90860 + }, + { + "epoch": 13.533661006851355, + "grad_norm": 0.0654296875, + "learning_rate": 0.008581320610092508, + "loss": 0.8244, + "num_input_tokens_seen": 52756024, + "step": 90865 + }, + { + "epoch": 13.534405719392314, + "grad_norm": 0.033203125, + "learning_rate": 0.008579558528233657, + "loss": 0.7977, + "num_input_tokens_seen": 52758744, + "step": 90870 + }, + { + "epoch": 13.535150431933273, + "grad_norm": 0.04345703125, + "learning_rate": 0.008577796554841643, + "loss": 0.7957, + "num_input_tokens_seen": 52761432, + "step": 90875 + }, + { + "epoch": 13.535895144474233, + "grad_norm": 0.038330078125, + "learning_rate": 0.00857603468994623, + "loss": 0.7957, + "num_input_tokens_seen": 52764376, + "step": 90880 + }, + { + "epoch": 13.536639857015192, + "grad_norm": 0.032470703125, + "learning_rate": 0.008574272933577176, + "loss": 0.8016, + "num_input_tokens_seen": 52767032, + "step": 90885 + }, + { + "epoch": 13.537384569556151, + "grad_norm": 0.050537109375, + "learning_rate": 0.008572511285764257, + "loss": 0.8048, + "num_input_tokens_seen": 52769976, + "step": 90890 + }, + { + "epoch": 13.53812928209711, + "grad_norm": 0.0286865234375, + "learning_rate": 0.008570749746537222, + "loss": 0.7982, + "num_input_tokens_seen": 52773112, + "step": 90895 + }, + { + "epoch": 13.53887399463807, + "grad_norm": 0.03515625, + "learning_rate": 0.008568988315925839, + "loss": 0.7889, + "num_input_tokens_seen": 52775960, + "step": 90900 + }, + { + "epoch": 13.539618707179029, + "grad_norm": 0.033203125, + "learning_rate": 0.008567226993959858, + "loss": 0.7933, + "num_input_tokens_seen": 52778776, + "step": 90905 + }, + { + "epoch": 13.540363419719988, + "grad_norm": 0.02880859375, + "learning_rate": 0.008565465780669043, + "loss": 0.8073, + "num_input_tokens_seen": 52781464, + "step": 90910 + }, + { + "epoch": 13.541108132260947, + "grad_norm": 0.03564453125, + "learning_rate": 0.008563704676083144, + "loss": 0.8034, + "num_input_tokens_seen": 52784120, + "step": 90915 + }, + { + "epoch": 13.541852844801907, + "grad_norm": 0.055419921875, + "learning_rate": 0.008561943680231917, + "loss": 0.7838, + "num_input_tokens_seen": 52786744, + "step": 90920 + }, + { + "epoch": 13.542597557342866, + "grad_norm": 0.044189453125, + "learning_rate": 0.008560182793145103, + "loss": 0.798, + "num_input_tokens_seen": 52789688, + "step": 90925 + }, + { + "epoch": 13.543342269883825, + "grad_norm": 0.032958984375, + "learning_rate": 0.008558422014852454, + "loss": 0.7852, + "num_input_tokens_seen": 52792472, + "step": 90930 + }, + { + "epoch": 13.544086982424783, + "grad_norm": 0.02099609375, + "learning_rate": 0.00855666134538372, + "loss": 0.8066, + "num_input_tokens_seen": 52795064, + "step": 90935 + }, + { + "epoch": 13.544831694965744, + "grad_norm": 0.056396484375, + "learning_rate": 0.00855490078476864, + "loss": 0.7747, + "num_input_tokens_seen": 52797912, + "step": 90940 + }, + { + "epoch": 13.545576407506703, + "grad_norm": 0.04345703125, + "learning_rate": 0.008553140333036969, + "loss": 0.7829, + "num_input_tokens_seen": 52800792, + "step": 90945 + }, + { + "epoch": 13.546321120047661, + "grad_norm": 0.0380859375, + "learning_rate": 0.008551379990218428, + "loss": 0.7807, + "num_input_tokens_seen": 52803832, + "step": 90950 + }, + { + "epoch": 13.54706583258862, + "grad_norm": 0.0419921875, + "learning_rate": 0.008549619756342778, + "loss": 0.8056, + "num_input_tokens_seen": 52806744, + "step": 90955 + }, + { + "epoch": 13.54781054512958, + "grad_norm": 0.0322265625, + "learning_rate": 0.008547859631439746, + "loss": 0.8028, + "num_input_tokens_seen": 52809592, + "step": 90960 + }, + { + "epoch": 13.54855525767054, + "grad_norm": 0.0498046875, + "learning_rate": 0.008546099615539062, + "loss": 0.8031, + "num_input_tokens_seen": 52812280, + "step": 90965 + }, + { + "epoch": 13.549299970211498, + "grad_norm": 0.0380859375, + "learning_rate": 0.00854433970867047, + "loss": 0.7793, + "num_input_tokens_seen": 52815128, + "step": 90970 + }, + { + "epoch": 13.550044682752457, + "grad_norm": 0.04296875, + "learning_rate": 0.008542579910863693, + "loss": 0.8014, + "num_input_tokens_seen": 52817848, + "step": 90975 + }, + { + "epoch": 13.550789395293418, + "grad_norm": 0.0625, + "learning_rate": 0.008540820222148469, + "loss": 0.7907, + "num_input_tokens_seen": 52820984, + "step": 90980 + }, + { + "epoch": 13.551534107834376, + "grad_norm": 0.055419921875, + "learning_rate": 0.008539060642554526, + "loss": 0.7808, + "num_input_tokens_seen": 52823832, + "step": 90985 + }, + { + "epoch": 13.552278820375335, + "grad_norm": 0.0400390625, + "learning_rate": 0.008537301172111584, + "loss": 0.8123, + "num_input_tokens_seen": 52826936, + "step": 90990 + }, + { + "epoch": 13.553023532916294, + "grad_norm": 0.07958984375, + "learning_rate": 0.008535541810849366, + "loss": 0.8112, + "num_input_tokens_seen": 52830104, + "step": 90995 + }, + { + "epoch": 13.553768245457253, + "grad_norm": 0.0308837890625, + "learning_rate": 0.008533782558797606, + "loss": 0.805, + "num_input_tokens_seen": 52832728, + "step": 91000 + }, + { + "epoch": 13.554512957998213, + "grad_norm": 0.049072265625, + "learning_rate": 0.008532023415986016, + "loss": 0.8073, + "num_input_tokens_seen": 52835640, + "step": 91005 + }, + { + "epoch": 13.555257670539172, + "grad_norm": 0.03369140625, + "learning_rate": 0.00853026438244431, + "loss": 0.7882, + "num_input_tokens_seen": 52838712, + "step": 91010 + }, + { + "epoch": 13.55600238308013, + "grad_norm": 0.023193359375, + "learning_rate": 0.008528505458202218, + "loss": 0.7999, + "num_input_tokens_seen": 52841400, + "step": 91015 + }, + { + "epoch": 13.556747095621091, + "grad_norm": 0.034912109375, + "learning_rate": 0.008526746643289444, + "loss": 0.7964, + "num_input_tokens_seen": 52844056, + "step": 91020 + }, + { + "epoch": 13.55749180816205, + "grad_norm": 0.056884765625, + "learning_rate": 0.008524987937735711, + "loss": 0.7632, + "num_input_tokens_seen": 52846776, + "step": 91025 + }, + { + "epoch": 13.558236520703009, + "grad_norm": 0.05712890625, + "learning_rate": 0.008523229341570722, + "loss": 0.788, + "num_input_tokens_seen": 52849720, + "step": 91030 + }, + { + "epoch": 13.558981233243967, + "grad_norm": 0.0341796875, + "learning_rate": 0.008521470854824194, + "loss": 0.8036, + "num_input_tokens_seen": 52852632, + "step": 91035 + }, + { + "epoch": 13.559725945784926, + "grad_norm": 0.029052734375, + "learning_rate": 0.008519712477525832, + "loss": 0.7965, + "num_input_tokens_seen": 52855512, + "step": 91040 + }, + { + "epoch": 13.560470658325887, + "grad_norm": 0.050537109375, + "learning_rate": 0.008517954209705336, + "loss": 0.7991, + "num_input_tokens_seen": 52858616, + "step": 91045 + }, + { + "epoch": 13.561215370866845, + "grad_norm": 0.058349609375, + "learning_rate": 0.008516196051392422, + "loss": 0.8046, + "num_input_tokens_seen": 52861528, + "step": 91050 + }, + { + "epoch": 13.561960083407804, + "grad_norm": 0.0274658203125, + "learning_rate": 0.008514438002616786, + "loss": 0.7842, + "num_input_tokens_seen": 52864248, + "step": 91055 + }, + { + "epoch": 13.562704795948763, + "grad_norm": 0.0458984375, + "learning_rate": 0.008512680063408128, + "loss": 0.7727, + "num_input_tokens_seen": 52867064, + "step": 91060 + }, + { + "epoch": 13.563449508489724, + "grad_norm": 0.04736328125, + "learning_rate": 0.00851092223379614, + "loss": 0.7948, + "num_input_tokens_seen": 52869976, + "step": 91065 + }, + { + "epoch": 13.564194221030682, + "grad_norm": 0.07080078125, + "learning_rate": 0.008509164513810534, + "loss": 0.7958, + "num_input_tokens_seen": 52872888, + "step": 91070 + }, + { + "epoch": 13.564938933571641, + "grad_norm": 0.035400390625, + "learning_rate": 0.008507406903480988, + "loss": 0.7725, + "num_input_tokens_seen": 52875832, + "step": 91075 + }, + { + "epoch": 13.5656836461126, + "grad_norm": 0.05078125, + "learning_rate": 0.00850564940283721, + "loss": 0.792, + "num_input_tokens_seen": 52878584, + "step": 91080 + }, + { + "epoch": 13.56642835865356, + "grad_norm": 0.0517578125, + "learning_rate": 0.008503892011908884, + "loss": 0.7615, + "num_input_tokens_seen": 52881720, + "step": 91085 + }, + { + "epoch": 13.567173071194519, + "grad_norm": 0.050537109375, + "learning_rate": 0.008502134730725695, + "loss": 0.7869, + "num_input_tokens_seen": 52884504, + "step": 91090 + }, + { + "epoch": 13.567917783735478, + "grad_norm": 0.1943359375, + "learning_rate": 0.008500377559317344, + "loss": 0.8308, + "num_input_tokens_seen": 52887512, + "step": 91095 + }, + { + "epoch": 13.568662496276437, + "grad_norm": 0.0693359375, + "learning_rate": 0.0084986204977135, + "loss": 0.7551, + "num_input_tokens_seen": 52890360, + "step": 91100 + }, + { + "epoch": 13.569407208817397, + "grad_norm": 0.068359375, + "learning_rate": 0.00849686354594386, + "loss": 0.7773, + "num_input_tokens_seen": 52893400, + "step": 91105 + }, + { + "epoch": 13.570151921358356, + "grad_norm": 0.04736328125, + "learning_rate": 0.008495106704038103, + "loss": 0.8737, + "num_input_tokens_seen": 52896344, + "step": 91110 + }, + { + "epoch": 13.570896633899315, + "grad_norm": 0.05517578125, + "learning_rate": 0.0084933499720259, + "loss": 0.778, + "num_input_tokens_seen": 52899352, + "step": 91115 + }, + { + "epoch": 13.571641346440273, + "grad_norm": 0.0625, + "learning_rate": 0.008491593349936941, + "loss": 0.7893, + "num_input_tokens_seen": 52902264, + "step": 91120 + }, + { + "epoch": 13.572386058981234, + "grad_norm": 0.045166015625, + "learning_rate": 0.008489836837800899, + "loss": 0.761, + "num_input_tokens_seen": 52905080, + "step": 91125 + }, + { + "epoch": 13.573130771522193, + "grad_norm": 0.1328125, + "learning_rate": 0.008488080435647447, + "loss": 0.8112, + "num_input_tokens_seen": 52908120, + "step": 91130 + }, + { + "epoch": 13.573875484063151, + "grad_norm": 0.08203125, + "learning_rate": 0.00848632414350625, + "loss": 0.7815, + "num_input_tokens_seen": 52910904, + "step": 91135 + }, + { + "epoch": 13.57462019660411, + "grad_norm": 0.03955078125, + "learning_rate": 0.008484567961406994, + "loss": 0.7939, + "num_input_tokens_seen": 52913624, + "step": 91140 + }, + { + "epoch": 13.57536490914507, + "grad_norm": 0.035888671875, + "learning_rate": 0.008482811889379334, + "loss": 0.762, + "num_input_tokens_seen": 52916504, + "step": 91145 + }, + { + "epoch": 13.57610962168603, + "grad_norm": 0.037841796875, + "learning_rate": 0.00848105592745295, + "loss": 0.7982, + "num_input_tokens_seen": 52919288, + "step": 91150 + }, + { + "epoch": 13.576854334226988, + "grad_norm": 0.04833984375, + "learning_rate": 0.008479300075657495, + "loss": 0.7946, + "num_input_tokens_seen": 52922328, + "step": 91155 + }, + { + "epoch": 13.577599046767947, + "grad_norm": 0.05517578125, + "learning_rate": 0.008477544334022641, + "loss": 0.7904, + "num_input_tokens_seen": 52925240, + "step": 91160 + }, + { + "epoch": 13.578343759308908, + "grad_norm": 0.046630859375, + "learning_rate": 0.00847578870257805, + "loss": 0.7666, + "num_input_tokens_seen": 52928088, + "step": 91165 + }, + { + "epoch": 13.579088471849866, + "grad_norm": 0.037109375, + "learning_rate": 0.008474033181353369, + "loss": 0.7961, + "num_input_tokens_seen": 52931320, + "step": 91170 + }, + { + "epoch": 13.579833184390825, + "grad_norm": 0.0341796875, + "learning_rate": 0.008472277770378272, + "loss": 0.8016, + "num_input_tokens_seen": 52934136, + "step": 91175 + }, + { + "epoch": 13.580577896931784, + "grad_norm": 0.03955078125, + "learning_rate": 0.008470522469682407, + "loss": 0.774, + "num_input_tokens_seen": 52936952, + "step": 91180 + }, + { + "epoch": 13.581322609472743, + "grad_norm": 0.046875, + "learning_rate": 0.008468767279295429, + "loss": 0.773, + "num_input_tokens_seen": 52939672, + "step": 91185 + }, + { + "epoch": 13.582067322013703, + "grad_norm": 0.12060546875, + "learning_rate": 0.008467012199246985, + "loss": 0.7911, + "num_input_tokens_seen": 52942392, + "step": 91190 + }, + { + "epoch": 13.582812034554662, + "grad_norm": 0.05419921875, + "learning_rate": 0.008465257229566734, + "loss": 0.7949, + "num_input_tokens_seen": 52945432, + "step": 91195 + }, + { + "epoch": 13.58355674709562, + "grad_norm": 0.044189453125, + "learning_rate": 0.008463502370284315, + "loss": 0.7809, + "num_input_tokens_seen": 52948632, + "step": 91200 + }, + { + "epoch": 13.584301459636581, + "grad_norm": 0.054443359375, + "learning_rate": 0.008461747621429384, + "loss": 0.7854, + "num_input_tokens_seen": 52951416, + "step": 91205 + }, + { + "epoch": 13.58504617217754, + "grad_norm": 0.031494140625, + "learning_rate": 0.008459992983031586, + "loss": 0.7662, + "num_input_tokens_seen": 52954360, + "step": 91210 + }, + { + "epoch": 13.585790884718499, + "grad_norm": 0.051513671875, + "learning_rate": 0.008458238455120552, + "loss": 0.7481, + "num_input_tokens_seen": 52957144, + "step": 91215 + }, + { + "epoch": 13.586535597259457, + "grad_norm": 0.037353515625, + "learning_rate": 0.008456484037725935, + "loss": 0.8065, + "num_input_tokens_seen": 52960184, + "step": 91220 + }, + { + "epoch": 13.587280309800416, + "grad_norm": 0.05810546875, + "learning_rate": 0.008454729730877365, + "loss": 0.8039, + "num_input_tokens_seen": 52963288, + "step": 91225 + }, + { + "epoch": 13.588025022341377, + "grad_norm": 0.0303955078125, + "learning_rate": 0.008452975534604492, + "loss": 0.7832, + "num_input_tokens_seen": 52966136, + "step": 91230 + }, + { + "epoch": 13.588769734882336, + "grad_norm": 0.0458984375, + "learning_rate": 0.008451221448936933, + "loss": 0.8241, + "num_input_tokens_seen": 52969080, + "step": 91235 + }, + { + "epoch": 13.589514447423294, + "grad_norm": 0.036865234375, + "learning_rate": 0.008449467473904342, + "loss": 0.7814, + "num_input_tokens_seen": 52972280, + "step": 91240 + }, + { + "epoch": 13.590259159964253, + "grad_norm": 0.0869140625, + "learning_rate": 0.008447713609536337, + "loss": 0.8047, + "num_input_tokens_seen": 52975288, + "step": 91245 + }, + { + "epoch": 13.591003872505214, + "grad_norm": 0.06884765625, + "learning_rate": 0.00844595985586255, + "loss": 0.8443, + "num_input_tokens_seen": 52977912, + "step": 91250 + }, + { + "epoch": 13.591748585046172, + "grad_norm": 0.05517578125, + "learning_rate": 0.008444206212912614, + "loss": 0.7931, + "num_input_tokens_seen": 52980856, + "step": 91255 + }, + { + "epoch": 13.592493297587131, + "grad_norm": 0.07763671875, + "learning_rate": 0.008442452680716143, + "loss": 0.7817, + "num_input_tokens_seen": 52983576, + "step": 91260 + }, + { + "epoch": 13.59323801012809, + "grad_norm": 0.0478515625, + "learning_rate": 0.008440699259302775, + "loss": 0.8058, + "num_input_tokens_seen": 52986648, + "step": 91265 + }, + { + "epoch": 13.59398272266905, + "grad_norm": 0.058837890625, + "learning_rate": 0.008438945948702122, + "loss": 0.829, + "num_input_tokens_seen": 52989816, + "step": 91270 + }, + { + "epoch": 13.59472743521001, + "grad_norm": 0.0458984375, + "learning_rate": 0.008437192748943813, + "loss": 0.7807, + "num_input_tokens_seen": 52992568, + "step": 91275 + }, + { + "epoch": 13.595472147750968, + "grad_norm": 0.026611328125, + "learning_rate": 0.008435439660057458, + "loss": 0.7838, + "num_input_tokens_seen": 52995384, + "step": 91280 + }, + { + "epoch": 13.596216860291927, + "grad_norm": 0.036376953125, + "learning_rate": 0.008433686682072682, + "loss": 0.8222, + "num_input_tokens_seen": 52998360, + "step": 91285 + }, + { + "epoch": 13.596961572832887, + "grad_norm": 0.056884765625, + "learning_rate": 0.008431933815019098, + "loss": 0.7922, + "num_input_tokens_seen": 53001240, + "step": 91290 + }, + { + "epoch": 13.597706285373846, + "grad_norm": 0.056396484375, + "learning_rate": 0.008430181058926308, + "loss": 0.8094, + "num_input_tokens_seen": 53004088, + "step": 91295 + }, + { + "epoch": 13.598450997914805, + "grad_norm": 0.045166015625, + "learning_rate": 0.00842842841382394, + "loss": 0.7876, + "num_input_tokens_seen": 53006904, + "step": 91300 + }, + { + "epoch": 13.599195710455763, + "grad_norm": 0.039306640625, + "learning_rate": 0.008426675879741588, + "loss": 0.8021, + "num_input_tokens_seen": 53009752, + "step": 91305 + }, + { + "epoch": 13.599940422996724, + "grad_norm": 0.036865234375, + "learning_rate": 0.008424923456708873, + "loss": 0.7843, + "num_input_tokens_seen": 53012920, + "step": 91310 + }, + { + "epoch": 13.600685135537683, + "grad_norm": 0.052978515625, + "learning_rate": 0.008423171144755392, + "loss": 0.7852, + "num_input_tokens_seen": 53015576, + "step": 91315 + }, + { + "epoch": 13.601429848078642, + "grad_norm": 0.055419921875, + "learning_rate": 0.008421418943910752, + "loss": 0.8091, + "num_input_tokens_seen": 53018680, + "step": 91320 + }, + { + "epoch": 13.6021745606196, + "grad_norm": 0.0703125, + "learning_rate": 0.008419666854204552, + "loss": 0.8106, + "num_input_tokens_seen": 53021912, + "step": 91325 + }, + { + "epoch": 13.60291927316056, + "grad_norm": 0.030517578125, + "learning_rate": 0.008417914875666387, + "loss": 0.8169, + "num_input_tokens_seen": 53024472, + "step": 91330 + }, + { + "epoch": 13.60366398570152, + "grad_norm": 0.0260009765625, + "learning_rate": 0.008416163008325867, + "loss": 0.8041, + "num_input_tokens_seen": 53027352, + "step": 91335 + }, + { + "epoch": 13.604408698242478, + "grad_norm": 0.044189453125, + "learning_rate": 0.008414411252212574, + "loss": 0.8046, + "num_input_tokens_seen": 53030328, + "step": 91340 + }, + { + "epoch": 13.605153410783437, + "grad_norm": 0.045654296875, + "learning_rate": 0.008412659607356114, + "loss": 0.7858, + "num_input_tokens_seen": 53033240, + "step": 91345 + }, + { + "epoch": 13.605898123324398, + "grad_norm": 0.0517578125, + "learning_rate": 0.008410908073786071, + "loss": 0.7981, + "num_input_tokens_seen": 53036536, + "step": 91350 + }, + { + "epoch": 13.606642835865356, + "grad_norm": 0.051025390625, + "learning_rate": 0.008409156651532046, + "loss": 0.8001, + "num_input_tokens_seen": 53039224, + "step": 91355 + }, + { + "epoch": 13.607387548406315, + "grad_norm": 0.048095703125, + "learning_rate": 0.008407405340623612, + "loss": 0.7868, + "num_input_tokens_seen": 53042680, + "step": 91360 + }, + { + "epoch": 13.608132260947274, + "grad_norm": 0.06298828125, + "learning_rate": 0.008405654141090374, + "loss": 0.7808, + "num_input_tokens_seen": 53045560, + "step": 91365 + }, + { + "epoch": 13.608876973488233, + "grad_norm": 0.0245361328125, + "learning_rate": 0.008403903052961905, + "loss": 0.8125, + "num_input_tokens_seen": 53048408, + "step": 91370 + }, + { + "epoch": 13.609621686029193, + "grad_norm": 0.0308837890625, + "learning_rate": 0.008402152076267783, + "loss": 0.8102, + "num_input_tokens_seen": 53051704, + "step": 91375 + }, + { + "epoch": 13.610366398570152, + "grad_norm": 0.03271484375, + "learning_rate": 0.008400401211037604, + "loss": 0.7773, + "num_input_tokens_seen": 53054424, + "step": 91380 + }, + { + "epoch": 13.61111111111111, + "grad_norm": 0.058837890625, + "learning_rate": 0.008398650457300939, + "loss": 0.8137, + "num_input_tokens_seen": 53057464, + "step": 91385 + }, + { + "epoch": 13.61185582365207, + "grad_norm": 0.03759765625, + "learning_rate": 0.008396899815087363, + "loss": 0.7993, + "num_input_tokens_seen": 53060536, + "step": 91390 + }, + { + "epoch": 13.61260053619303, + "grad_norm": 0.035400390625, + "learning_rate": 0.008395149284426449, + "loss": 0.7834, + "num_input_tokens_seen": 53063576, + "step": 91395 + }, + { + "epoch": 13.613345248733989, + "grad_norm": 0.056640625, + "learning_rate": 0.008393398865347781, + "loss": 0.8052, + "num_input_tokens_seen": 53066744, + "step": 91400 + }, + { + "epoch": 13.614089961274948, + "grad_norm": 0.03271484375, + "learning_rate": 0.008391648557880925, + "loss": 0.7799, + "num_input_tokens_seen": 53070008, + "step": 91405 + }, + { + "epoch": 13.614834673815906, + "grad_norm": 0.043212890625, + "learning_rate": 0.008389898362055443, + "loss": 0.7964, + "num_input_tokens_seen": 53073080, + "step": 91410 + }, + { + "epoch": 13.615579386356867, + "grad_norm": 0.059814453125, + "learning_rate": 0.008388148277900916, + "loss": 0.7846, + "num_input_tokens_seen": 53076216, + "step": 91415 + }, + { + "epoch": 13.616324098897826, + "grad_norm": 0.025146484375, + "learning_rate": 0.008386398305446898, + "loss": 0.803, + "num_input_tokens_seen": 53079256, + "step": 91420 + }, + { + "epoch": 13.617068811438784, + "grad_norm": 0.033447265625, + "learning_rate": 0.008384648444722967, + "loss": 0.8087, + "num_input_tokens_seen": 53081976, + "step": 91425 + }, + { + "epoch": 13.617813523979743, + "grad_norm": 0.044921875, + "learning_rate": 0.00838289869575867, + "loss": 0.8007, + "num_input_tokens_seen": 53084632, + "step": 91430 + }, + { + "epoch": 13.618558236520704, + "grad_norm": 0.036865234375, + "learning_rate": 0.008381149058583578, + "loss": 0.8303, + "num_input_tokens_seen": 53087672, + "step": 91435 + }, + { + "epoch": 13.619302949061662, + "grad_norm": 0.0458984375, + "learning_rate": 0.008379399533227246, + "loss": 0.8005, + "num_input_tokens_seen": 53090968, + "step": 91440 + }, + { + "epoch": 13.620047661602621, + "grad_norm": 0.03173828125, + "learning_rate": 0.008377650119719224, + "loss": 0.7932, + "num_input_tokens_seen": 53093912, + "step": 91445 + }, + { + "epoch": 13.62079237414358, + "grad_norm": 0.0322265625, + "learning_rate": 0.008375900818089083, + "loss": 0.8029, + "num_input_tokens_seen": 53096728, + "step": 91450 + }, + { + "epoch": 13.62153708668454, + "grad_norm": 0.03125, + "learning_rate": 0.008374151628366359, + "loss": 0.812, + "num_input_tokens_seen": 53099640, + "step": 91455 + }, + { + "epoch": 13.6222817992255, + "grad_norm": 0.03515625, + "learning_rate": 0.008372402550580613, + "loss": 0.8099, + "num_input_tokens_seen": 53102680, + "step": 91460 + }, + { + "epoch": 13.623026511766458, + "grad_norm": 0.0576171875, + "learning_rate": 0.008370653584761382, + "loss": 0.79, + "num_input_tokens_seen": 53105528, + "step": 91465 + }, + { + "epoch": 13.623771224307417, + "grad_norm": 0.038818359375, + "learning_rate": 0.008368904730938228, + "loss": 0.7898, + "num_input_tokens_seen": 53109656, + "step": 91470 + }, + { + "epoch": 13.624515936848377, + "grad_norm": 0.03369140625, + "learning_rate": 0.008367155989140682, + "loss": 0.7853, + "num_input_tokens_seen": 53112440, + "step": 91475 + }, + { + "epoch": 13.625260649389336, + "grad_norm": 0.037353515625, + "learning_rate": 0.008365407359398301, + "loss": 0.785, + "num_input_tokens_seen": 53115160, + "step": 91480 + }, + { + "epoch": 13.626005361930295, + "grad_norm": 0.036376953125, + "learning_rate": 0.00836365884174062, + "loss": 0.7674, + "num_input_tokens_seen": 53118136, + "step": 91485 + }, + { + "epoch": 13.626750074471254, + "grad_norm": 0.046142578125, + "learning_rate": 0.008361910436197172, + "loss": 0.7968, + "num_input_tokens_seen": 53120952, + "step": 91490 + }, + { + "epoch": 13.627494787012214, + "grad_norm": 0.06396484375, + "learning_rate": 0.008360162142797507, + "loss": 0.7872, + "num_input_tokens_seen": 53124152, + "step": 91495 + }, + { + "epoch": 13.628239499553173, + "grad_norm": 0.053466796875, + "learning_rate": 0.008358413961571148, + "loss": 0.7772, + "num_input_tokens_seen": 53126904, + "step": 91500 + }, + { + "epoch": 13.628984212094132, + "grad_norm": 0.043212890625, + "learning_rate": 0.008356665892547637, + "loss": 0.7852, + "num_input_tokens_seen": 53129816, + "step": 91505 + }, + { + "epoch": 13.62972892463509, + "grad_norm": 0.032958984375, + "learning_rate": 0.008354917935756509, + "loss": 0.7689, + "num_input_tokens_seen": 53132792, + "step": 91510 + }, + { + "epoch": 13.63047363717605, + "grad_norm": 0.033447265625, + "learning_rate": 0.008353170091227284, + "loss": 0.7973, + "num_input_tokens_seen": 53135640, + "step": 91515 + }, + { + "epoch": 13.63121834971701, + "grad_norm": 0.03759765625, + "learning_rate": 0.008351422358989493, + "loss": 0.8207, + "num_input_tokens_seen": 53138584, + "step": 91520 + }, + { + "epoch": 13.631963062257968, + "grad_norm": 0.03466796875, + "learning_rate": 0.008349674739072667, + "loss": 0.7815, + "num_input_tokens_seen": 53141240, + "step": 91525 + }, + { + "epoch": 13.632707774798927, + "grad_norm": 0.048583984375, + "learning_rate": 0.00834792723150633, + "loss": 0.792, + "num_input_tokens_seen": 53144120, + "step": 91530 + }, + { + "epoch": 13.633452487339888, + "grad_norm": 0.03173828125, + "learning_rate": 0.008346179836319995, + "loss": 0.7892, + "num_input_tokens_seen": 53146808, + "step": 91535 + }, + { + "epoch": 13.634197199880846, + "grad_norm": 0.037109375, + "learning_rate": 0.008344432553543195, + "loss": 0.7659, + "num_input_tokens_seen": 53149720, + "step": 91540 + }, + { + "epoch": 13.634941912421805, + "grad_norm": 0.031982421875, + "learning_rate": 0.008342685383205433, + "loss": 0.8016, + "num_input_tokens_seen": 53153016, + "step": 91545 + }, + { + "epoch": 13.635686624962764, + "grad_norm": 0.036376953125, + "learning_rate": 0.008340938325336245, + "loss": 0.7882, + "num_input_tokens_seen": 53156216, + "step": 91550 + }, + { + "epoch": 13.636431337503723, + "grad_norm": 0.04052734375, + "learning_rate": 0.00833919137996513, + "loss": 0.7916, + "num_input_tokens_seen": 53159160, + "step": 91555 + }, + { + "epoch": 13.637176050044683, + "grad_norm": 0.06787109375, + "learning_rate": 0.00833744454712161, + "loss": 0.8141, + "num_input_tokens_seen": 53162136, + "step": 91560 + }, + { + "epoch": 13.637920762585642, + "grad_norm": 0.0400390625, + "learning_rate": 0.008335697826835194, + "loss": 0.782, + "num_input_tokens_seen": 53165112, + "step": 91565 + }, + { + "epoch": 13.6386654751266, + "grad_norm": 0.049072265625, + "learning_rate": 0.008333951219135384, + "loss": 0.7586, + "num_input_tokens_seen": 53167928, + "step": 91570 + }, + { + "epoch": 13.63941018766756, + "grad_norm": 0.04638671875, + "learning_rate": 0.008332204724051702, + "loss": 0.773, + "num_input_tokens_seen": 53171256, + "step": 91575 + }, + { + "epoch": 13.64015490020852, + "grad_norm": 0.045166015625, + "learning_rate": 0.00833045834161364, + "loss": 0.8103, + "num_input_tokens_seen": 53174104, + "step": 91580 + }, + { + "epoch": 13.640899612749479, + "grad_norm": 0.03173828125, + "learning_rate": 0.008328712071850708, + "loss": 0.8185, + "num_input_tokens_seen": 53176920, + "step": 91585 + }, + { + "epoch": 13.641644325290438, + "grad_norm": 0.0284423828125, + "learning_rate": 0.008326965914792398, + "loss": 0.8043, + "num_input_tokens_seen": 53179832, + "step": 91590 + }, + { + "epoch": 13.642389037831396, + "grad_norm": 0.02587890625, + "learning_rate": 0.008325219870468224, + "loss": 0.8232, + "num_input_tokens_seen": 53182616, + "step": 91595 + }, + { + "epoch": 13.643133750372357, + "grad_norm": 0.03857421875, + "learning_rate": 0.008323473938907672, + "loss": 0.7804, + "num_input_tokens_seen": 53185624, + "step": 91600 + }, + { + "epoch": 13.643878462913316, + "grad_norm": 0.0517578125, + "learning_rate": 0.008321728120140245, + "loss": 0.7834, + "num_input_tokens_seen": 53188312, + "step": 91605 + }, + { + "epoch": 13.644623175454274, + "grad_norm": 0.026611328125, + "learning_rate": 0.008319982414195435, + "loss": 0.7888, + "num_input_tokens_seen": 53191128, + "step": 91610 + }, + { + "epoch": 13.645367887995233, + "grad_norm": 0.052490234375, + "learning_rate": 0.008318236821102727, + "loss": 0.8278, + "num_input_tokens_seen": 53193976, + "step": 91615 + }, + { + "epoch": 13.646112600536194, + "grad_norm": 0.03564453125, + "learning_rate": 0.008316491340891623, + "loss": 0.8033, + "num_input_tokens_seen": 53196920, + "step": 91620 + }, + { + "epoch": 13.646857313077152, + "grad_norm": 0.046875, + "learning_rate": 0.008314745973591598, + "loss": 0.7908, + "num_input_tokens_seen": 53199608, + "step": 91625 + }, + { + "epoch": 13.647602025618111, + "grad_norm": 0.1630859375, + "learning_rate": 0.008313000719232152, + "loss": 0.8593, + "num_input_tokens_seen": 53202424, + "step": 91630 + }, + { + "epoch": 13.64834673815907, + "grad_norm": 0.056884765625, + "learning_rate": 0.008311255577842756, + "loss": 0.7968, + "num_input_tokens_seen": 53205080, + "step": 91635 + }, + { + "epoch": 13.64909145070003, + "grad_norm": 0.046875, + "learning_rate": 0.008309510549452906, + "loss": 0.791, + "num_input_tokens_seen": 53207960, + "step": 91640 + }, + { + "epoch": 13.64983616324099, + "grad_norm": 0.038818359375, + "learning_rate": 0.008307765634092074, + "loss": 0.7938, + "num_input_tokens_seen": 53210648, + "step": 91645 + }, + { + "epoch": 13.650580875781948, + "grad_norm": 0.032958984375, + "learning_rate": 0.00830602083178974, + "loss": 0.7817, + "num_input_tokens_seen": 53213752, + "step": 91650 + }, + { + "epoch": 13.651325588322907, + "grad_norm": 0.0498046875, + "learning_rate": 0.008304276142575381, + "loss": 0.8031, + "num_input_tokens_seen": 53216728, + "step": 91655 + }, + { + "epoch": 13.652070300863867, + "grad_norm": 0.03125, + "learning_rate": 0.008302531566478465, + "loss": 0.7732, + "num_input_tokens_seen": 53219448, + "step": 91660 + }, + { + "epoch": 13.652815013404826, + "grad_norm": 0.03564453125, + "learning_rate": 0.00830078710352848, + "loss": 0.8221, + "num_input_tokens_seen": 53222392, + "step": 91665 + }, + { + "epoch": 13.653559725945785, + "grad_norm": 0.039306640625, + "learning_rate": 0.008299042753754877, + "loss": 0.8026, + "num_input_tokens_seen": 53225080, + "step": 91670 + }, + { + "epoch": 13.654304438486744, + "grad_norm": 0.035888671875, + "learning_rate": 0.008297298517187147, + "loss": 0.7865, + "num_input_tokens_seen": 53228088, + "step": 91675 + }, + { + "epoch": 13.655049151027704, + "grad_norm": 0.0238037109375, + "learning_rate": 0.008295554393854737, + "loss": 0.8025, + "num_input_tokens_seen": 53230936, + "step": 91680 + }, + { + "epoch": 13.655793863568663, + "grad_norm": 0.0400390625, + "learning_rate": 0.008293810383787127, + "loss": 0.7988, + "num_input_tokens_seen": 53234008, + "step": 91685 + }, + { + "epoch": 13.656538576109622, + "grad_norm": 0.031005859375, + "learning_rate": 0.008292066487013777, + "loss": 0.8068, + "num_input_tokens_seen": 53236856, + "step": 91690 + }, + { + "epoch": 13.65728328865058, + "grad_norm": 0.047119140625, + "learning_rate": 0.00829032270356414, + "loss": 0.7912, + "num_input_tokens_seen": 53239608, + "step": 91695 + }, + { + "epoch": 13.65802800119154, + "grad_norm": 0.033447265625, + "learning_rate": 0.008288579033467686, + "loss": 0.7857, + "num_input_tokens_seen": 53242488, + "step": 91700 + }, + { + "epoch": 13.6587727137325, + "grad_norm": 0.048583984375, + "learning_rate": 0.008286835476753863, + "loss": 0.8051, + "num_input_tokens_seen": 53245368, + "step": 91705 + }, + { + "epoch": 13.659517426273458, + "grad_norm": 0.04833984375, + "learning_rate": 0.00828509203345214, + "loss": 0.8202, + "num_input_tokens_seen": 53248216, + "step": 91710 + }, + { + "epoch": 13.660262138814417, + "grad_norm": 0.032470703125, + "learning_rate": 0.008283348703591958, + "loss": 0.7879, + "num_input_tokens_seen": 53251096, + "step": 91715 + }, + { + "epoch": 13.661006851355378, + "grad_norm": 0.042236328125, + "learning_rate": 0.008281605487202776, + "loss": 0.777, + "num_input_tokens_seen": 53254104, + "step": 91720 + }, + { + "epoch": 13.661751563896336, + "grad_norm": 0.0294189453125, + "learning_rate": 0.008279862384314038, + "loss": 0.8035, + "num_input_tokens_seen": 53257048, + "step": 91725 + }, + { + "epoch": 13.662496276437295, + "grad_norm": 0.043212890625, + "learning_rate": 0.008278119394955192, + "loss": 0.801, + "num_input_tokens_seen": 53260280, + "step": 91730 + }, + { + "epoch": 13.663240988978254, + "grad_norm": 0.03466796875, + "learning_rate": 0.008276376519155691, + "loss": 0.7967, + "num_input_tokens_seen": 53263320, + "step": 91735 + }, + { + "epoch": 13.663985701519213, + "grad_norm": 0.061279296875, + "learning_rate": 0.00827463375694497, + "loss": 0.7831, + "num_input_tokens_seen": 53266104, + "step": 91740 + }, + { + "epoch": 13.664730414060173, + "grad_norm": 0.028076171875, + "learning_rate": 0.008272891108352482, + "loss": 0.7844, + "num_input_tokens_seen": 53268920, + "step": 91745 + }, + { + "epoch": 13.665475126601132, + "grad_norm": 0.04443359375, + "learning_rate": 0.008271148573407659, + "loss": 0.7879, + "num_input_tokens_seen": 53271736, + "step": 91750 + }, + { + "epoch": 13.66621983914209, + "grad_norm": 0.11962890625, + "learning_rate": 0.008269406152139943, + "loss": 0.8096, + "num_input_tokens_seen": 53274808, + "step": 91755 + }, + { + "epoch": 13.66696455168305, + "grad_norm": 0.058349609375, + "learning_rate": 0.008267663844578766, + "loss": 0.7972, + "num_input_tokens_seen": 53277592, + "step": 91760 + }, + { + "epoch": 13.66770926422401, + "grad_norm": 0.039306640625, + "learning_rate": 0.008265921650753572, + "loss": 0.7898, + "num_input_tokens_seen": 53280568, + "step": 91765 + }, + { + "epoch": 13.668453976764969, + "grad_norm": 0.03369140625, + "learning_rate": 0.008264179570693787, + "loss": 0.8083, + "num_input_tokens_seen": 53283512, + "step": 91770 + }, + { + "epoch": 13.669198689305928, + "grad_norm": 0.044921875, + "learning_rate": 0.008262437604428836, + "loss": 0.7883, + "num_input_tokens_seen": 53286872, + "step": 91775 + }, + { + "epoch": 13.669943401846886, + "grad_norm": 0.041748046875, + "learning_rate": 0.008260695751988158, + "loss": 0.796, + "num_input_tokens_seen": 53290168, + "step": 91780 + }, + { + "epoch": 13.670688114387847, + "grad_norm": 0.043212890625, + "learning_rate": 0.008258954013401177, + "loss": 0.78, + "num_input_tokens_seen": 53293464, + "step": 91785 + }, + { + "epoch": 13.671432826928806, + "grad_norm": 0.07958984375, + "learning_rate": 0.008257212388697319, + "loss": 0.7852, + "num_input_tokens_seen": 53296344, + "step": 91790 + }, + { + "epoch": 13.672177539469764, + "grad_norm": 0.04345703125, + "learning_rate": 0.008255470877905997, + "loss": 0.7931, + "num_input_tokens_seen": 53299480, + "step": 91795 + }, + { + "epoch": 13.672922252010723, + "grad_norm": 0.1474609375, + "learning_rate": 0.008253729481056644, + "loss": 0.8821, + "num_input_tokens_seen": 53302104, + "step": 91800 + }, + { + "epoch": 13.673666964551684, + "grad_norm": 0.04296875, + "learning_rate": 0.00825198819817867, + "loss": 0.8036, + "num_input_tokens_seen": 53305144, + "step": 91805 + }, + { + "epoch": 13.674411677092642, + "grad_norm": 0.04248046875, + "learning_rate": 0.008250247029301504, + "loss": 0.7659, + "num_input_tokens_seen": 53307864, + "step": 91810 + }, + { + "epoch": 13.675156389633601, + "grad_norm": 0.031005859375, + "learning_rate": 0.008248505974454555, + "loss": 0.8119, + "num_input_tokens_seen": 53310616, + "step": 91815 + }, + { + "epoch": 13.67590110217456, + "grad_norm": 0.0419921875, + "learning_rate": 0.008246765033667227, + "loss": 0.8057, + "num_input_tokens_seen": 53313432, + "step": 91820 + }, + { + "epoch": 13.67664581471552, + "grad_norm": 0.041259765625, + "learning_rate": 0.008245024206968947, + "loss": 0.8033, + "num_input_tokens_seen": 53316056, + "step": 91825 + }, + { + "epoch": 13.67739052725648, + "grad_norm": 0.03271484375, + "learning_rate": 0.008243283494389114, + "loss": 0.7797, + "num_input_tokens_seen": 53318680, + "step": 91830 + }, + { + "epoch": 13.678135239797438, + "grad_norm": 0.04638671875, + "learning_rate": 0.008241542895957141, + "loss": 0.7929, + "num_input_tokens_seen": 53321688, + "step": 91835 + }, + { + "epoch": 13.678879952338397, + "grad_norm": 0.040283203125, + "learning_rate": 0.008239802411702435, + "loss": 0.7917, + "num_input_tokens_seen": 53324152, + "step": 91840 + }, + { + "epoch": 13.679624664879357, + "grad_norm": 0.039306640625, + "learning_rate": 0.008238062041654396, + "loss": 0.7903, + "num_input_tokens_seen": 53327096, + "step": 91845 + }, + { + "epoch": 13.680369377420316, + "grad_norm": 0.040771484375, + "learning_rate": 0.008236321785842418, + "loss": 0.7961, + "num_input_tokens_seen": 53329912, + "step": 91850 + }, + { + "epoch": 13.681114089961275, + "grad_norm": 0.0291748046875, + "learning_rate": 0.008234581644295918, + "loss": 0.7972, + "num_input_tokens_seen": 53332632, + "step": 91855 + }, + { + "epoch": 13.681858802502234, + "grad_norm": 0.0224609375, + "learning_rate": 0.008232841617044285, + "loss": 0.7927, + "num_input_tokens_seen": 53335544, + "step": 91860 + }, + { + "epoch": 13.682603515043194, + "grad_norm": 0.039794921875, + "learning_rate": 0.008231101704116908, + "loss": 0.7951, + "num_input_tokens_seen": 53338456, + "step": 91865 + }, + { + "epoch": 13.683348227584153, + "grad_norm": 0.022216796875, + "learning_rate": 0.008229361905543194, + "loss": 0.8003, + "num_input_tokens_seen": 53341464, + "step": 91870 + }, + { + "epoch": 13.684092940125112, + "grad_norm": 0.0267333984375, + "learning_rate": 0.008227622221352522, + "loss": 0.7916, + "num_input_tokens_seen": 53344568, + "step": 91875 + }, + { + "epoch": 13.68483765266607, + "grad_norm": 0.044189453125, + "learning_rate": 0.008225882651574297, + "loss": 0.7988, + "num_input_tokens_seen": 53347096, + "step": 91880 + }, + { + "epoch": 13.68558236520703, + "grad_norm": 0.05029296875, + "learning_rate": 0.008224143196237894, + "loss": 0.7955, + "num_input_tokens_seen": 53350008, + "step": 91885 + }, + { + "epoch": 13.68632707774799, + "grad_norm": 0.036376953125, + "learning_rate": 0.00822240385537271, + "loss": 0.8002, + "num_input_tokens_seen": 53353208, + "step": 91890 + }, + { + "epoch": 13.687071790288948, + "grad_norm": 0.028076171875, + "learning_rate": 0.008220664629008124, + "loss": 0.8181, + "num_input_tokens_seen": 53356120, + "step": 91895 + }, + { + "epoch": 13.687816502829907, + "grad_norm": 0.03271484375, + "learning_rate": 0.008218925517173514, + "loss": 0.8157, + "num_input_tokens_seen": 53358840, + "step": 91900 + }, + { + "epoch": 13.688561215370868, + "grad_norm": 0.040771484375, + "learning_rate": 0.00821718651989827, + "loss": 0.7869, + "num_input_tokens_seen": 53361880, + "step": 91905 + }, + { + "epoch": 13.689305927911827, + "grad_norm": 0.034423828125, + "learning_rate": 0.008215447637211768, + "loss": 0.8012, + "num_input_tokens_seen": 53365016, + "step": 91910 + }, + { + "epoch": 13.690050640452785, + "grad_norm": 0.032470703125, + "learning_rate": 0.008213708869143382, + "loss": 0.8063, + "num_input_tokens_seen": 53367864, + "step": 91915 + }, + { + "epoch": 13.690795352993744, + "grad_norm": 0.0235595703125, + "learning_rate": 0.008211970215722482, + "loss": 0.7829, + "num_input_tokens_seen": 53370776, + "step": 91920 + }, + { + "epoch": 13.691540065534703, + "grad_norm": 0.05029296875, + "learning_rate": 0.008210231676978449, + "loss": 0.7992, + "num_input_tokens_seen": 53373848, + "step": 91925 + }, + { + "epoch": 13.692284778075663, + "grad_norm": 0.041259765625, + "learning_rate": 0.008208493252940655, + "loss": 0.8094, + "num_input_tokens_seen": 53376856, + "step": 91930 + }, + { + "epoch": 13.693029490616622, + "grad_norm": 0.0546875, + "learning_rate": 0.008206754943638457, + "loss": 0.7763, + "num_input_tokens_seen": 53379928, + "step": 91935 + }, + { + "epoch": 13.69377420315758, + "grad_norm": 0.023193359375, + "learning_rate": 0.008205016749101237, + "loss": 0.806, + "num_input_tokens_seen": 53383032, + "step": 91940 + }, + { + "epoch": 13.69451891569854, + "grad_norm": 0.048583984375, + "learning_rate": 0.008203278669358344, + "loss": 0.7505, + "num_input_tokens_seen": 53385784, + "step": 91945 + }, + { + "epoch": 13.6952636282395, + "grad_norm": 0.04443359375, + "learning_rate": 0.00820154070443916, + "loss": 0.7881, + "num_input_tokens_seen": 53388856, + "step": 91950 + }, + { + "epoch": 13.696008340780459, + "grad_norm": 0.0238037109375, + "learning_rate": 0.00819980285437303, + "loss": 0.7945, + "num_input_tokens_seen": 53391672, + "step": 91955 + }, + { + "epoch": 13.696753053321418, + "grad_norm": 0.0203857421875, + "learning_rate": 0.008198065119189323, + "loss": 0.7982, + "num_input_tokens_seen": 53394392, + "step": 91960 + }, + { + "epoch": 13.697497765862376, + "grad_norm": 0.046630859375, + "learning_rate": 0.008196327498917385, + "loss": 0.793, + "num_input_tokens_seen": 53397336, + "step": 91965 + }, + { + "epoch": 13.698242478403337, + "grad_norm": 0.0218505859375, + "learning_rate": 0.008194589993586588, + "loss": 0.7963, + "num_input_tokens_seen": 53400280, + "step": 91970 + }, + { + "epoch": 13.698987190944296, + "grad_norm": 0.0198974609375, + "learning_rate": 0.008192852603226275, + "loss": 0.799, + "num_input_tokens_seen": 53403224, + "step": 91975 + }, + { + "epoch": 13.699731903485254, + "grad_norm": 0.036376953125, + "learning_rate": 0.008191115327865801, + "loss": 0.7939, + "num_input_tokens_seen": 53406168, + "step": 91980 + }, + { + "epoch": 13.700476616026213, + "grad_norm": 0.0294189453125, + "learning_rate": 0.008189378167534511, + "loss": 0.8009, + "num_input_tokens_seen": 53409528, + "step": 91985 + }, + { + "epoch": 13.701221328567174, + "grad_norm": 0.051513671875, + "learning_rate": 0.008187641122261748, + "loss": 0.8016, + "num_input_tokens_seen": 53412504, + "step": 91990 + }, + { + "epoch": 13.701966041108133, + "grad_norm": 0.053466796875, + "learning_rate": 0.008185904192076873, + "loss": 0.7616, + "num_input_tokens_seen": 53415416, + "step": 91995 + }, + { + "epoch": 13.702710753649091, + "grad_norm": 0.064453125, + "learning_rate": 0.008184167377009213, + "loss": 0.8036, + "num_input_tokens_seen": 53418232, + "step": 92000 + }, + { + "epoch": 13.70345546619005, + "grad_norm": 0.034912109375, + "learning_rate": 0.008182430677088126, + "loss": 0.7938, + "num_input_tokens_seen": 53421112, + "step": 92005 + }, + { + "epoch": 13.70420017873101, + "grad_norm": 0.03271484375, + "learning_rate": 0.008180694092342943, + "loss": 0.7932, + "num_input_tokens_seen": 53424120, + "step": 92010 + }, + { + "epoch": 13.70494489127197, + "grad_norm": 0.04345703125, + "learning_rate": 0.008178957622802996, + "loss": 0.7752, + "num_input_tokens_seen": 53426968, + "step": 92015 + }, + { + "epoch": 13.705689603812928, + "grad_norm": 0.04541015625, + "learning_rate": 0.008177221268497632, + "loss": 0.7811, + "num_input_tokens_seen": 53429656, + "step": 92020 + }, + { + "epoch": 13.706434316353887, + "grad_norm": 0.0284423828125, + "learning_rate": 0.008175485029456177, + "loss": 0.7931, + "num_input_tokens_seen": 53432696, + "step": 92025 + }, + { + "epoch": 13.707179028894847, + "grad_norm": 0.042236328125, + "learning_rate": 0.00817374890570797, + "loss": 0.8222, + "num_input_tokens_seen": 53435512, + "step": 92030 + }, + { + "epoch": 13.707923741435806, + "grad_norm": 0.0400390625, + "learning_rate": 0.008172012897282333, + "loss": 0.786, + "num_input_tokens_seen": 53438392, + "step": 92035 + }, + { + "epoch": 13.708668453976765, + "grad_norm": 0.042724609375, + "learning_rate": 0.008170277004208604, + "loss": 0.7801, + "num_input_tokens_seen": 53441176, + "step": 92040 + }, + { + "epoch": 13.709413166517724, + "grad_norm": 0.03857421875, + "learning_rate": 0.008168541226516104, + "loss": 0.833, + "num_input_tokens_seen": 53444088, + "step": 92045 + }, + { + "epoch": 13.710157879058684, + "grad_norm": 0.03857421875, + "learning_rate": 0.008166805564234155, + "loss": 0.8124, + "num_input_tokens_seen": 53447096, + "step": 92050 + }, + { + "epoch": 13.710902591599643, + "grad_norm": 0.072265625, + "learning_rate": 0.008165070017392083, + "loss": 0.8171, + "num_input_tokens_seen": 53449912, + "step": 92055 + }, + { + "epoch": 13.711647304140602, + "grad_norm": 0.03515625, + "learning_rate": 0.0081633345860192, + "loss": 0.7799, + "num_input_tokens_seen": 53452920, + "step": 92060 + }, + { + "epoch": 13.71239201668156, + "grad_norm": 0.064453125, + "learning_rate": 0.008161599270144834, + "loss": 0.7955, + "num_input_tokens_seen": 53455704, + "step": 92065 + }, + { + "epoch": 13.71313672922252, + "grad_norm": 0.0341796875, + "learning_rate": 0.008159864069798295, + "loss": 0.7841, + "num_input_tokens_seen": 53458584, + "step": 92070 + }, + { + "epoch": 13.71388144176348, + "grad_norm": 0.044921875, + "learning_rate": 0.008158128985008908, + "loss": 0.7979, + "num_input_tokens_seen": 53461784, + "step": 92075 + }, + { + "epoch": 13.714626154304439, + "grad_norm": 0.024658203125, + "learning_rate": 0.00815639401580597, + "loss": 0.7955, + "num_input_tokens_seen": 53464376, + "step": 92080 + }, + { + "epoch": 13.715370866845397, + "grad_norm": 0.0849609375, + "learning_rate": 0.008154659162218804, + "loss": 0.8116, + "num_input_tokens_seen": 53467544, + "step": 92085 + }, + { + "epoch": 13.716115579386356, + "grad_norm": 0.048095703125, + "learning_rate": 0.008152924424276715, + "loss": 0.7918, + "num_input_tokens_seen": 53470296, + "step": 92090 + }, + { + "epoch": 13.716860291927317, + "grad_norm": 0.03564453125, + "learning_rate": 0.008151189802009004, + "loss": 0.8277, + "num_input_tokens_seen": 53473432, + "step": 92095 + }, + { + "epoch": 13.717605004468275, + "grad_norm": 0.03662109375, + "learning_rate": 0.008149455295444988, + "loss": 0.8028, + "num_input_tokens_seen": 53476600, + "step": 92100 + }, + { + "epoch": 13.718349717009234, + "grad_norm": 0.05029296875, + "learning_rate": 0.008147720904613952, + "loss": 0.7703, + "num_input_tokens_seen": 53479448, + "step": 92105 + }, + { + "epoch": 13.719094429550193, + "grad_norm": 0.0306396484375, + "learning_rate": 0.008145986629545215, + "loss": 0.8438, + "num_input_tokens_seen": 53482136, + "step": 92110 + }, + { + "epoch": 13.719839142091153, + "grad_norm": 0.042236328125, + "learning_rate": 0.008144252470268067, + "loss": 0.7732, + "num_input_tokens_seen": 53484984, + "step": 92115 + }, + { + "epoch": 13.720583854632112, + "grad_norm": 0.034423828125, + "learning_rate": 0.00814251842681181, + "loss": 0.8438, + "num_input_tokens_seen": 53487800, + "step": 92120 + }, + { + "epoch": 13.721328567173071, + "grad_norm": 0.03271484375, + "learning_rate": 0.008140784499205724, + "loss": 0.8073, + "num_input_tokens_seen": 53490648, + "step": 92125 + }, + { + "epoch": 13.72207327971403, + "grad_norm": 0.052734375, + "learning_rate": 0.008139050687479119, + "loss": 0.7998, + "num_input_tokens_seen": 53493304, + "step": 92130 + }, + { + "epoch": 13.72281799225499, + "grad_norm": 0.041015625, + "learning_rate": 0.00813731699166128, + "loss": 0.781, + "num_input_tokens_seen": 53496248, + "step": 92135 + }, + { + "epoch": 13.723562704795949, + "grad_norm": 0.0311279296875, + "learning_rate": 0.008135583411781487, + "loss": 0.8017, + "num_input_tokens_seen": 53499256, + "step": 92140 + }, + { + "epoch": 13.724307417336908, + "grad_norm": 0.0419921875, + "learning_rate": 0.008133849947869046, + "loss": 0.7659, + "num_input_tokens_seen": 53502072, + "step": 92145 + }, + { + "epoch": 13.725052129877866, + "grad_norm": 0.034423828125, + "learning_rate": 0.008132116599953224, + "loss": 0.8005, + "num_input_tokens_seen": 53505176, + "step": 92150 + }, + { + "epoch": 13.725796842418827, + "grad_norm": 0.0262451171875, + "learning_rate": 0.008130383368063319, + "loss": 0.7954, + "num_input_tokens_seen": 53508056, + "step": 92155 + }, + { + "epoch": 13.726541554959786, + "grad_norm": 0.032470703125, + "learning_rate": 0.008128650252228597, + "loss": 0.8023, + "num_input_tokens_seen": 53510808, + "step": 92160 + }, + { + "epoch": 13.727286267500745, + "grad_norm": 0.02978515625, + "learning_rate": 0.008126917252478353, + "loss": 0.7833, + "num_input_tokens_seen": 53513784, + "step": 92165 + }, + { + "epoch": 13.728030980041703, + "grad_norm": 0.0390625, + "learning_rate": 0.008125184368841853, + "loss": 0.8044, + "num_input_tokens_seen": 53516472, + "step": 92170 + }, + { + "epoch": 13.728775692582664, + "grad_norm": 0.033203125, + "learning_rate": 0.00812345160134838, + "loss": 0.7771, + "num_input_tokens_seen": 53519288, + "step": 92175 + }, + { + "epoch": 13.729520405123623, + "grad_norm": 0.033447265625, + "learning_rate": 0.008121718950027196, + "loss": 0.7901, + "num_input_tokens_seen": 53522136, + "step": 92180 + }, + { + "epoch": 13.730265117664581, + "grad_norm": 0.0281982421875, + "learning_rate": 0.008119986414907583, + "loss": 0.8097, + "num_input_tokens_seen": 53525272, + "step": 92185 + }, + { + "epoch": 13.73100983020554, + "grad_norm": 0.034912109375, + "learning_rate": 0.00811825399601881, + "loss": 0.7845, + "num_input_tokens_seen": 53527928, + "step": 92190 + }, + { + "epoch": 13.7317545427465, + "grad_norm": 0.03662109375, + "learning_rate": 0.008116521693390135, + "loss": 0.7679, + "num_input_tokens_seen": 53530712, + "step": 92195 + }, + { + "epoch": 13.73249925528746, + "grad_norm": 0.040771484375, + "learning_rate": 0.008114789507050834, + "loss": 0.8268, + "num_input_tokens_seen": 53533592, + "step": 92200 + }, + { + "epoch": 13.733243967828418, + "grad_norm": 0.04541015625, + "learning_rate": 0.008113057437030162, + "loss": 0.8609, + "num_input_tokens_seen": 53536568, + "step": 92205 + }, + { + "epoch": 13.733988680369377, + "grad_norm": 0.03369140625, + "learning_rate": 0.008111325483357391, + "loss": 0.7982, + "num_input_tokens_seen": 53539320, + "step": 92210 + }, + { + "epoch": 13.734733392910336, + "grad_norm": 0.03125, + "learning_rate": 0.008109593646061774, + "loss": 0.8068, + "num_input_tokens_seen": 53542200, + "step": 92215 + }, + { + "epoch": 13.735478105451296, + "grad_norm": 0.053955078125, + "learning_rate": 0.008107861925172564, + "loss": 0.823, + "num_input_tokens_seen": 53544952, + "step": 92220 + }, + { + "epoch": 13.736222817992255, + "grad_norm": 0.0223388671875, + "learning_rate": 0.008106130320719029, + "loss": 0.7975, + "num_input_tokens_seen": 53547576, + "step": 92225 + }, + { + "epoch": 13.736967530533214, + "grad_norm": 0.033447265625, + "learning_rate": 0.008104398832730408, + "loss": 0.8071, + "num_input_tokens_seen": 53550520, + "step": 92230 + }, + { + "epoch": 13.737712243074174, + "grad_norm": 0.0380859375, + "learning_rate": 0.008102667461235967, + "loss": 0.8217, + "num_input_tokens_seen": 53553560, + "step": 92235 + }, + { + "epoch": 13.738456955615133, + "grad_norm": 0.04638671875, + "learning_rate": 0.008100936206264949, + "loss": 0.7881, + "num_input_tokens_seen": 53556312, + "step": 92240 + }, + { + "epoch": 13.739201668156092, + "grad_norm": 0.043701171875, + "learning_rate": 0.008099205067846602, + "loss": 0.798, + "num_input_tokens_seen": 53559576, + "step": 92245 + }, + { + "epoch": 13.73994638069705, + "grad_norm": 0.06005859375, + "learning_rate": 0.00809747404601017, + "loss": 0.8057, + "num_input_tokens_seen": 53562616, + "step": 92250 + }, + { + "epoch": 13.74069109323801, + "grad_norm": 0.032470703125, + "learning_rate": 0.008095743140784895, + "loss": 0.7975, + "num_input_tokens_seen": 53565624, + "step": 92255 + }, + { + "epoch": 13.74143580577897, + "grad_norm": 0.0400390625, + "learning_rate": 0.008094012352200027, + "loss": 0.8119, + "num_input_tokens_seen": 53568728, + "step": 92260 + }, + { + "epoch": 13.742180518319929, + "grad_norm": 0.0380859375, + "learning_rate": 0.008092281680284795, + "loss": 0.8077, + "num_input_tokens_seen": 53571992, + "step": 92265 + }, + { + "epoch": 13.742925230860887, + "grad_norm": 0.044921875, + "learning_rate": 0.00809055112506845, + "loss": 0.8036, + "num_input_tokens_seen": 53574968, + "step": 92270 + }, + { + "epoch": 13.743669943401846, + "grad_norm": 0.06005859375, + "learning_rate": 0.008088820686580216, + "loss": 0.794, + "num_input_tokens_seen": 53577752, + "step": 92275 + }, + { + "epoch": 13.744414655942807, + "grad_norm": 0.0274658203125, + "learning_rate": 0.008087090364849338, + "loss": 0.7834, + "num_input_tokens_seen": 53581016, + "step": 92280 + }, + { + "epoch": 13.745159368483765, + "grad_norm": 0.041015625, + "learning_rate": 0.008085360159905037, + "loss": 0.7892, + "num_input_tokens_seen": 53583832, + "step": 92285 + }, + { + "epoch": 13.745904081024724, + "grad_norm": 0.03857421875, + "learning_rate": 0.008083630071776551, + "loss": 0.8111, + "num_input_tokens_seen": 53586712, + "step": 92290 + }, + { + "epoch": 13.746648793565683, + "grad_norm": 0.036376953125, + "learning_rate": 0.008081900100493108, + "loss": 0.8009, + "num_input_tokens_seen": 53589400, + "step": 92295 + }, + { + "epoch": 13.747393506106643, + "grad_norm": 0.043212890625, + "learning_rate": 0.008080170246083927, + "loss": 0.8015, + "num_input_tokens_seen": 53592088, + "step": 92300 + }, + { + "epoch": 13.748138218647602, + "grad_norm": 0.0361328125, + "learning_rate": 0.008078440508578241, + "loss": 0.8161, + "num_input_tokens_seen": 53594904, + "step": 92305 + }, + { + "epoch": 13.748882931188561, + "grad_norm": 0.0306396484375, + "learning_rate": 0.008076710888005267, + "loss": 0.8068, + "num_input_tokens_seen": 53597784, + "step": 92310 + }, + { + "epoch": 13.74962764372952, + "grad_norm": 0.040283203125, + "learning_rate": 0.008074981384394228, + "loss": 0.812, + "num_input_tokens_seen": 53600664, + "step": 92315 + }, + { + "epoch": 13.75037235627048, + "grad_norm": 0.03369140625, + "learning_rate": 0.008073251997774334, + "loss": 0.8017, + "num_input_tokens_seen": 53603800, + "step": 92320 + }, + { + "epoch": 13.751117068811439, + "grad_norm": 0.0322265625, + "learning_rate": 0.008071522728174811, + "loss": 0.7806, + "num_input_tokens_seen": 53606840, + "step": 92325 + }, + { + "epoch": 13.751861781352398, + "grad_norm": 0.06298828125, + "learning_rate": 0.008069793575624873, + "loss": 0.7973, + "num_input_tokens_seen": 53609784, + "step": 92330 + }, + { + "epoch": 13.752606493893357, + "grad_norm": 0.043212890625, + "learning_rate": 0.008068064540153722, + "loss": 0.7812, + "num_input_tokens_seen": 53612856, + "step": 92335 + }, + { + "epoch": 13.753351206434317, + "grad_norm": 0.033203125, + "learning_rate": 0.00806633562179058, + "loss": 0.7929, + "num_input_tokens_seen": 53615512, + "step": 92340 + }, + { + "epoch": 13.754095918975276, + "grad_norm": 0.031494140625, + "learning_rate": 0.008064606820564644, + "loss": 0.7977, + "num_input_tokens_seen": 53618360, + "step": 92345 + }, + { + "epoch": 13.754840631516235, + "grad_norm": 0.0380859375, + "learning_rate": 0.008062878136505136, + "loss": 0.8055, + "num_input_tokens_seen": 53621464, + "step": 92350 + }, + { + "epoch": 13.755585344057193, + "grad_norm": 0.0306396484375, + "learning_rate": 0.008061149569641244, + "loss": 0.8028, + "num_input_tokens_seen": 53624344, + "step": 92355 + }, + { + "epoch": 13.756330056598154, + "grad_norm": 0.048095703125, + "learning_rate": 0.00805942112000218, + "loss": 0.8163, + "num_input_tokens_seen": 53627256, + "step": 92360 + }, + { + "epoch": 13.757074769139113, + "grad_norm": 0.130859375, + "learning_rate": 0.008057692787617142, + "loss": 0.86, + "num_input_tokens_seen": 53630168, + "step": 92365 + }, + { + "epoch": 13.757819481680071, + "grad_norm": 0.031494140625, + "learning_rate": 0.008055964572515333, + "loss": 0.784, + "num_input_tokens_seen": 53632888, + "step": 92370 + }, + { + "epoch": 13.75856419422103, + "grad_norm": 0.033203125, + "learning_rate": 0.008054236474725944, + "loss": 0.7863, + "num_input_tokens_seen": 53635480, + "step": 92375 + }, + { + "epoch": 13.75930890676199, + "grad_norm": 0.03271484375, + "learning_rate": 0.008052508494278169, + "loss": 0.7992, + "num_input_tokens_seen": 53638776, + "step": 92380 + }, + { + "epoch": 13.76005361930295, + "grad_norm": 0.042724609375, + "learning_rate": 0.008050780631201203, + "loss": 0.797, + "num_input_tokens_seen": 53641592, + "step": 92385 + }, + { + "epoch": 13.760798331843908, + "grad_norm": 0.048095703125, + "learning_rate": 0.00804905288552423, + "loss": 0.8016, + "num_input_tokens_seen": 53645048, + "step": 92390 + }, + { + "epoch": 13.761543044384867, + "grad_norm": 0.03466796875, + "learning_rate": 0.008047325257276448, + "loss": 0.7918, + "num_input_tokens_seen": 53647928, + "step": 92395 + }, + { + "epoch": 13.762287756925826, + "grad_norm": 0.03662109375, + "learning_rate": 0.008045597746487033, + "loss": 0.7965, + "num_input_tokens_seen": 53650776, + "step": 92400 + }, + { + "epoch": 13.763032469466786, + "grad_norm": 0.034423828125, + "learning_rate": 0.008043870353185186, + "loss": 0.8306, + "num_input_tokens_seen": 53653720, + "step": 92405 + }, + { + "epoch": 13.763777182007745, + "grad_norm": 0.0189208984375, + "learning_rate": 0.00804214307740007, + "loss": 0.7874, + "num_input_tokens_seen": 53656600, + "step": 92410 + }, + { + "epoch": 13.764521894548704, + "grad_norm": 0.05078125, + "learning_rate": 0.008040415919160881, + "loss": 0.8045, + "num_input_tokens_seen": 53659416, + "step": 92415 + }, + { + "epoch": 13.765266607089664, + "grad_norm": 0.038818359375, + "learning_rate": 0.008038688878496795, + "loss": 0.8036, + "num_input_tokens_seen": 53662168, + "step": 92420 + }, + { + "epoch": 13.766011319630623, + "grad_norm": 0.02978515625, + "learning_rate": 0.008036961955436977, + "loss": 0.8099, + "num_input_tokens_seen": 53665048, + "step": 92425 + }, + { + "epoch": 13.766756032171582, + "grad_norm": 0.03466796875, + "learning_rate": 0.008035235150010617, + "loss": 0.783, + "num_input_tokens_seen": 53667928, + "step": 92430 + }, + { + "epoch": 13.76750074471254, + "grad_norm": 0.051025390625, + "learning_rate": 0.008033508462246873, + "loss": 0.7842, + "num_input_tokens_seen": 53671256, + "step": 92435 + }, + { + "epoch": 13.7682454572535, + "grad_norm": 0.045654296875, + "learning_rate": 0.00803178189217493, + "loss": 0.791, + "num_input_tokens_seen": 53673912, + "step": 92440 + }, + { + "epoch": 13.76899016979446, + "grad_norm": 0.0380859375, + "learning_rate": 0.008030055439823948, + "loss": 0.7992, + "num_input_tokens_seen": 53676632, + "step": 92445 + }, + { + "epoch": 13.769734882335419, + "grad_norm": 0.04931640625, + "learning_rate": 0.008028329105223097, + "loss": 0.7916, + "num_input_tokens_seen": 53679256, + "step": 92450 + }, + { + "epoch": 13.770479594876377, + "grad_norm": 0.0380859375, + "learning_rate": 0.00802660288840154, + "loss": 0.7877, + "num_input_tokens_seen": 53682168, + "step": 92455 + }, + { + "epoch": 13.771224307417336, + "grad_norm": 0.0361328125, + "learning_rate": 0.008024876789388432, + "loss": 0.8021, + "num_input_tokens_seen": 53684952, + "step": 92460 + }, + { + "epoch": 13.771969019958297, + "grad_norm": 0.042724609375, + "learning_rate": 0.00802315080821295, + "loss": 0.808, + "num_input_tokens_seen": 53687800, + "step": 92465 + }, + { + "epoch": 13.772713732499255, + "grad_norm": 0.0400390625, + "learning_rate": 0.008021424944904237, + "loss": 0.7905, + "num_input_tokens_seen": 53690584, + "step": 92470 + }, + { + "epoch": 13.773458445040214, + "grad_norm": 0.046875, + "learning_rate": 0.008019699199491463, + "loss": 0.7868, + "num_input_tokens_seen": 53693496, + "step": 92475 + }, + { + "epoch": 13.774203157581173, + "grad_norm": 0.0218505859375, + "learning_rate": 0.008017973572003772, + "loss": 0.7936, + "num_input_tokens_seen": 53696216, + "step": 92480 + }, + { + "epoch": 13.774947870122134, + "grad_norm": 0.03662109375, + "learning_rate": 0.008016248062470324, + "loss": 0.8131, + "num_input_tokens_seen": 53698872, + "step": 92485 + }, + { + "epoch": 13.775692582663092, + "grad_norm": 0.037109375, + "learning_rate": 0.008014522670920262, + "loss": 0.7836, + "num_input_tokens_seen": 53701688, + "step": 92490 + }, + { + "epoch": 13.776437295204051, + "grad_norm": 0.038818359375, + "learning_rate": 0.008012797397382747, + "loss": 0.7926, + "num_input_tokens_seen": 53704664, + "step": 92495 + }, + { + "epoch": 13.77718200774501, + "grad_norm": 0.0225830078125, + "learning_rate": 0.00801107224188692, + "loss": 0.7986, + "num_input_tokens_seen": 53707480, + "step": 92500 + }, + { + "epoch": 13.77792672028597, + "grad_norm": 0.039306640625, + "learning_rate": 0.008009347204461921, + "loss": 0.7866, + "num_input_tokens_seen": 53710296, + "step": 92505 + }, + { + "epoch": 13.778671432826929, + "grad_norm": 0.037109375, + "learning_rate": 0.008007622285136897, + "loss": 0.7833, + "num_input_tokens_seen": 53713400, + "step": 92510 + }, + { + "epoch": 13.779416145367888, + "grad_norm": 0.048095703125, + "learning_rate": 0.008005897483940982, + "loss": 0.782, + "num_input_tokens_seen": 53716120, + "step": 92515 + }, + { + "epoch": 13.780160857908847, + "grad_norm": 0.0303955078125, + "learning_rate": 0.008004172800903327, + "loss": 0.7885, + "num_input_tokens_seen": 53718904, + "step": 92520 + }, + { + "epoch": 13.780905570449807, + "grad_norm": 0.020751953125, + "learning_rate": 0.008002448236053056, + "loss": 0.8015, + "num_input_tokens_seen": 53721784, + "step": 92525 + }, + { + "epoch": 13.781650282990766, + "grad_norm": 0.044677734375, + "learning_rate": 0.008000723789419316, + "loss": 0.8199, + "num_input_tokens_seen": 53724760, + "step": 92530 + }, + { + "epoch": 13.782394995531725, + "grad_norm": 0.031982421875, + "learning_rate": 0.007998999461031232, + "loss": 0.7912, + "num_input_tokens_seen": 53727672, + "step": 92535 + }, + { + "epoch": 13.783139708072683, + "grad_norm": 0.0274658203125, + "learning_rate": 0.007997275250917934, + "loss": 0.8037, + "num_input_tokens_seen": 53730360, + "step": 92540 + }, + { + "epoch": 13.783884420613644, + "grad_norm": 0.0322265625, + "learning_rate": 0.007995551159108558, + "loss": 0.7858, + "num_input_tokens_seen": 53733048, + "step": 92545 + }, + { + "epoch": 13.784629133154603, + "grad_norm": 0.0400390625, + "learning_rate": 0.00799382718563222, + "loss": 0.7975, + "num_input_tokens_seen": 53735896, + "step": 92550 + }, + { + "epoch": 13.785373845695561, + "grad_norm": 0.04296875, + "learning_rate": 0.007992103330518056, + "loss": 0.7822, + "num_input_tokens_seen": 53738808, + "step": 92555 + }, + { + "epoch": 13.78611855823652, + "grad_norm": 0.04443359375, + "learning_rate": 0.007990379593795179, + "loss": 0.7844, + "num_input_tokens_seen": 53741688, + "step": 92560 + }, + { + "epoch": 13.78686327077748, + "grad_norm": 0.049072265625, + "learning_rate": 0.007988655975492719, + "loss": 0.7922, + "num_input_tokens_seen": 53744280, + "step": 92565 + }, + { + "epoch": 13.78760798331844, + "grad_norm": 0.1005859375, + "learning_rate": 0.007986932475639794, + "loss": 0.8178, + "num_input_tokens_seen": 53747000, + "step": 92570 + }, + { + "epoch": 13.788352695859398, + "grad_norm": 0.03857421875, + "learning_rate": 0.007985209094265512, + "loss": 0.7881, + "num_input_tokens_seen": 53749976, + "step": 92575 + }, + { + "epoch": 13.789097408400357, + "grad_norm": 0.020263671875, + "learning_rate": 0.007983485831398993, + "loss": 0.7736, + "num_input_tokens_seen": 53752760, + "step": 92580 + }, + { + "epoch": 13.789842120941316, + "grad_norm": 0.021728515625, + "learning_rate": 0.00798176268706935, + "loss": 0.7952, + "num_input_tokens_seen": 53755704, + "step": 92585 + }, + { + "epoch": 13.790586833482276, + "grad_norm": 0.049072265625, + "learning_rate": 0.007980039661305694, + "loss": 0.7975, + "num_input_tokens_seen": 53758840, + "step": 92590 + }, + { + "epoch": 13.791331546023235, + "grad_norm": 0.05322265625, + "learning_rate": 0.007978316754137127, + "loss": 0.7682, + "num_input_tokens_seen": 53761624, + "step": 92595 + }, + { + "epoch": 13.792076258564194, + "grad_norm": 0.0322265625, + "learning_rate": 0.007976593965592771, + "loss": 0.7946, + "num_input_tokens_seen": 53764504, + "step": 92600 + }, + { + "epoch": 13.792820971105153, + "grad_norm": 0.0380859375, + "learning_rate": 0.007974871295701711, + "loss": 0.8049, + "num_input_tokens_seen": 53767224, + "step": 92605 + }, + { + "epoch": 13.793565683646113, + "grad_norm": 0.0458984375, + "learning_rate": 0.00797314874449307, + "loss": 0.7943, + "num_input_tokens_seen": 53770104, + "step": 92610 + }, + { + "epoch": 13.794310396187072, + "grad_norm": 0.0228271484375, + "learning_rate": 0.00797142631199594, + "loss": 0.8043, + "num_input_tokens_seen": 53772728, + "step": 92615 + }, + { + "epoch": 13.79505510872803, + "grad_norm": 0.0361328125, + "learning_rate": 0.007969703998239413, + "loss": 0.8095, + "num_input_tokens_seen": 53776024, + "step": 92620 + }, + { + "epoch": 13.79579982126899, + "grad_norm": 0.044921875, + "learning_rate": 0.007967981803252595, + "loss": 0.8051, + "num_input_tokens_seen": 53779064, + "step": 92625 + }, + { + "epoch": 13.79654453380995, + "grad_norm": 0.037841796875, + "learning_rate": 0.007966259727064572, + "loss": 0.804, + "num_input_tokens_seen": 53782200, + "step": 92630 + }, + { + "epoch": 13.797289246350909, + "grad_norm": 0.031494140625, + "learning_rate": 0.007964537769704448, + "loss": 0.8027, + "num_input_tokens_seen": 53785464, + "step": 92635 + }, + { + "epoch": 13.798033958891867, + "grad_norm": 0.041748046875, + "learning_rate": 0.007962815931201309, + "loss": 0.7832, + "num_input_tokens_seen": 53788536, + "step": 92640 + }, + { + "epoch": 13.798778671432826, + "grad_norm": 0.052001953125, + "learning_rate": 0.007961094211584243, + "loss": 0.7847, + "num_input_tokens_seen": 53791512, + "step": 92645 + }, + { + "epoch": 13.799523383973787, + "grad_norm": 0.034423828125, + "learning_rate": 0.00795937261088233, + "loss": 0.7856, + "num_input_tokens_seen": 53794264, + "step": 92650 + }, + { + "epoch": 13.800268096514746, + "grad_norm": 0.023681640625, + "learning_rate": 0.007957651129124665, + "loss": 0.8088, + "num_input_tokens_seen": 53797176, + "step": 92655 + }, + { + "epoch": 13.801012809055704, + "grad_norm": 0.0322265625, + "learning_rate": 0.007955929766340328, + "loss": 0.7857, + "num_input_tokens_seen": 53799928, + "step": 92660 + }, + { + "epoch": 13.801757521596663, + "grad_norm": 0.036376953125, + "learning_rate": 0.007954208522558393, + "loss": 0.8117, + "num_input_tokens_seen": 53803096, + "step": 92665 + }, + { + "epoch": 13.802502234137624, + "grad_norm": 0.04248046875, + "learning_rate": 0.00795248739780795, + "loss": 0.8159, + "num_input_tokens_seen": 53805880, + "step": 92670 + }, + { + "epoch": 13.803246946678582, + "grad_norm": 0.04052734375, + "learning_rate": 0.007950766392118063, + "loss": 0.762, + "num_input_tokens_seen": 53808824, + "step": 92675 + }, + { + "epoch": 13.803991659219541, + "grad_norm": 0.041748046875, + "learning_rate": 0.007949045505517819, + "loss": 0.8401, + "num_input_tokens_seen": 53811640, + "step": 92680 + }, + { + "epoch": 13.8047363717605, + "grad_norm": 0.039306640625, + "learning_rate": 0.007947324738036278, + "loss": 0.8193, + "num_input_tokens_seen": 53814616, + "step": 92685 + }, + { + "epoch": 13.80548108430146, + "grad_norm": 0.02880859375, + "learning_rate": 0.007945604089702527, + "loss": 0.7966, + "num_input_tokens_seen": 53817624, + "step": 92690 + }, + { + "epoch": 13.80622579684242, + "grad_norm": 0.044189453125, + "learning_rate": 0.007943883560545623, + "loss": 0.789, + "num_input_tokens_seen": 53820440, + "step": 92695 + }, + { + "epoch": 13.806970509383378, + "grad_norm": 0.08740234375, + "learning_rate": 0.007942163150594629, + "loss": 0.7767, + "num_input_tokens_seen": 53823288, + "step": 92700 + }, + { + "epoch": 13.807715221924337, + "grad_norm": 0.033203125, + "learning_rate": 0.00794044285987862, + "loss": 0.803, + "num_input_tokens_seen": 53826424, + "step": 92705 + }, + { + "epoch": 13.808459934465297, + "grad_norm": 0.05322265625, + "learning_rate": 0.007938722688426653, + "loss": 0.816, + "num_input_tokens_seen": 53829208, + "step": 92710 + }, + { + "epoch": 13.809204647006256, + "grad_norm": 0.03662109375, + "learning_rate": 0.007937002636267792, + "loss": 0.8277, + "num_input_tokens_seen": 53831864, + "step": 92715 + }, + { + "epoch": 13.809949359547215, + "grad_norm": 0.028076171875, + "learning_rate": 0.007935282703431086, + "loss": 0.7954, + "num_input_tokens_seen": 53835096, + "step": 92720 + }, + { + "epoch": 13.810694072088173, + "grad_norm": 0.028076171875, + "learning_rate": 0.007933562889945607, + "loss": 0.7852, + "num_input_tokens_seen": 53838104, + "step": 92725 + }, + { + "epoch": 13.811438784629132, + "grad_norm": 0.044921875, + "learning_rate": 0.007931843195840392, + "loss": 0.8038, + "num_input_tokens_seen": 53840728, + "step": 92730 + }, + { + "epoch": 13.812183497170093, + "grad_norm": 0.08984375, + "learning_rate": 0.007930123621144508, + "loss": 0.8094, + "num_input_tokens_seen": 53843512, + "step": 92735 + }, + { + "epoch": 13.812928209711052, + "grad_norm": 0.0225830078125, + "learning_rate": 0.007928404165887, + "loss": 0.7864, + "num_input_tokens_seen": 53846648, + "step": 92740 + }, + { + "epoch": 13.81367292225201, + "grad_norm": 0.044921875, + "learning_rate": 0.007926684830096913, + "loss": 0.7847, + "num_input_tokens_seen": 53849496, + "step": 92745 + }, + { + "epoch": 13.81441763479297, + "grad_norm": 0.0498046875, + "learning_rate": 0.007924965613803299, + "loss": 0.7726, + "num_input_tokens_seen": 53852216, + "step": 92750 + }, + { + "epoch": 13.81516234733393, + "grad_norm": 0.032958984375, + "learning_rate": 0.007923246517035198, + "loss": 0.8147, + "num_input_tokens_seen": 53855160, + "step": 92755 + }, + { + "epoch": 13.815907059874888, + "grad_norm": 0.0380859375, + "learning_rate": 0.007921527539821659, + "loss": 0.7807, + "num_input_tokens_seen": 53857784, + "step": 92760 + }, + { + "epoch": 13.816651772415847, + "grad_norm": 0.04052734375, + "learning_rate": 0.007919808682191712, + "loss": 0.8031, + "num_input_tokens_seen": 53860760, + "step": 92765 + }, + { + "epoch": 13.817396484956806, + "grad_norm": 0.032958984375, + "learning_rate": 0.007918089944174407, + "loss": 0.7836, + "num_input_tokens_seen": 53864088, + "step": 92770 + }, + { + "epoch": 13.818141197497766, + "grad_norm": 0.0478515625, + "learning_rate": 0.007916371325798775, + "loss": 0.811, + "num_input_tokens_seen": 53867096, + "step": 92775 + }, + { + "epoch": 13.818885910038725, + "grad_norm": 0.0380859375, + "learning_rate": 0.007914652827093851, + "loss": 0.7944, + "num_input_tokens_seen": 53869752, + "step": 92780 + }, + { + "epoch": 13.819630622579684, + "grad_norm": 0.0693359375, + "learning_rate": 0.007912934448088665, + "loss": 0.7967, + "num_input_tokens_seen": 53872760, + "step": 92785 + }, + { + "epoch": 13.820375335120643, + "grad_norm": 0.050048828125, + "learning_rate": 0.007911216188812242, + "loss": 0.8281, + "num_input_tokens_seen": 53875544, + "step": 92790 + }, + { + "epoch": 13.821120047661603, + "grad_norm": 0.038818359375, + "learning_rate": 0.007909498049293624, + "loss": 0.7798, + "num_input_tokens_seen": 53878424, + "step": 92795 + }, + { + "epoch": 13.821864760202562, + "grad_norm": 0.034912109375, + "learning_rate": 0.007907780029561823, + "loss": 0.8099, + "num_input_tokens_seen": 53881240, + "step": 92800 + }, + { + "epoch": 13.82260947274352, + "grad_norm": 0.0439453125, + "learning_rate": 0.007906062129645876, + "loss": 0.799, + "num_input_tokens_seen": 53884216, + "step": 92805 + }, + { + "epoch": 13.82335418528448, + "grad_norm": 0.03271484375, + "learning_rate": 0.007904344349574795, + "loss": 0.7987, + "num_input_tokens_seen": 53887160, + "step": 92810 + }, + { + "epoch": 13.82409889782544, + "grad_norm": 0.0264892578125, + "learning_rate": 0.007902626689377608, + "loss": 0.8381, + "num_input_tokens_seen": 53890680, + "step": 92815 + }, + { + "epoch": 13.824843610366399, + "grad_norm": 0.02880859375, + "learning_rate": 0.007900909149083332, + "loss": 0.7872, + "num_input_tokens_seen": 53893688, + "step": 92820 + }, + { + "epoch": 13.825588322907358, + "grad_norm": 0.04248046875, + "learning_rate": 0.007899191728720974, + "loss": 0.7848, + "num_input_tokens_seen": 53896920, + "step": 92825 + }, + { + "epoch": 13.826333035448316, + "grad_norm": 0.0380859375, + "learning_rate": 0.00789747442831956, + "loss": 0.7881, + "num_input_tokens_seen": 53899896, + "step": 92830 + }, + { + "epoch": 13.827077747989277, + "grad_norm": 0.051025390625, + "learning_rate": 0.007895757247908098, + "loss": 0.8071, + "num_input_tokens_seen": 53902744, + "step": 92835 + }, + { + "epoch": 13.827822460530236, + "grad_norm": 0.037841796875, + "learning_rate": 0.007894040187515597, + "loss": 0.7847, + "num_input_tokens_seen": 53905784, + "step": 92840 + }, + { + "epoch": 13.828567173071194, + "grad_norm": 0.03564453125, + "learning_rate": 0.007892323247171056, + "loss": 0.8079, + "num_input_tokens_seen": 53908760, + "step": 92845 + }, + { + "epoch": 13.829311885612153, + "grad_norm": 0.040283203125, + "learning_rate": 0.0078906064269035, + "loss": 0.79, + "num_input_tokens_seen": 53911736, + "step": 92850 + }, + { + "epoch": 13.830056598153114, + "grad_norm": 0.03271484375, + "learning_rate": 0.00788888972674192, + "loss": 0.7809, + "num_input_tokens_seen": 53914552, + "step": 92855 + }, + { + "epoch": 13.830801310694072, + "grad_norm": 0.058349609375, + "learning_rate": 0.007887173146715316, + "loss": 0.8003, + "num_input_tokens_seen": 53917400, + "step": 92860 + }, + { + "epoch": 13.831546023235031, + "grad_norm": 0.0250244140625, + "learning_rate": 0.007885456686852699, + "loss": 0.8017, + "num_input_tokens_seen": 53920440, + "step": 92865 + }, + { + "epoch": 13.83229073577599, + "grad_norm": 0.045654296875, + "learning_rate": 0.007883740347183053, + "loss": 0.8106, + "num_input_tokens_seen": 53923640, + "step": 92870 + }, + { + "epoch": 13.83303544831695, + "grad_norm": 0.0546875, + "learning_rate": 0.007882024127735388, + "loss": 0.7735, + "num_input_tokens_seen": 53926584, + "step": 92875 + }, + { + "epoch": 13.83378016085791, + "grad_norm": 0.050048828125, + "learning_rate": 0.007880308028538684, + "loss": 0.7898, + "num_input_tokens_seen": 53929432, + "step": 92880 + }, + { + "epoch": 13.834524873398868, + "grad_norm": 0.034912109375, + "learning_rate": 0.007878592049621946, + "loss": 0.7925, + "num_input_tokens_seen": 53932248, + "step": 92885 + }, + { + "epoch": 13.835269585939827, + "grad_norm": 0.05029296875, + "learning_rate": 0.007876876191014152, + "loss": 0.8032, + "num_input_tokens_seen": 53935064, + "step": 92890 + }, + { + "epoch": 13.836014298480787, + "grad_norm": 0.064453125, + "learning_rate": 0.007875160452744301, + "loss": 0.8144, + "num_input_tokens_seen": 53938232, + "step": 92895 + }, + { + "epoch": 13.836759011021746, + "grad_norm": 0.033935546875, + "learning_rate": 0.007873444834841372, + "loss": 0.7946, + "num_input_tokens_seen": 53941176, + "step": 92900 + }, + { + "epoch": 13.837503723562705, + "grad_norm": 0.04150390625, + "learning_rate": 0.00787172933733435, + "loss": 0.7826, + "num_input_tokens_seen": 53944120, + "step": 92905 + }, + { + "epoch": 13.838248436103664, + "grad_norm": 0.04345703125, + "learning_rate": 0.007870013960252217, + "loss": 0.7595, + "num_input_tokens_seen": 53946968, + "step": 92910 + }, + { + "epoch": 13.838993148644622, + "grad_norm": 0.034912109375, + "learning_rate": 0.007868298703623945, + "loss": 0.7785, + "num_input_tokens_seen": 53950008, + "step": 92915 + }, + { + "epoch": 13.839737861185583, + "grad_norm": 0.03515625, + "learning_rate": 0.007866583567478522, + "loss": 0.7865, + "num_input_tokens_seen": 53953144, + "step": 92920 + }, + { + "epoch": 13.840482573726542, + "grad_norm": 0.048583984375, + "learning_rate": 0.007864868551844916, + "loss": 0.8027, + "num_input_tokens_seen": 53956312, + "step": 92925 + }, + { + "epoch": 13.8412272862675, + "grad_norm": 0.0361328125, + "learning_rate": 0.007863153656752109, + "loss": 0.7941, + "num_input_tokens_seen": 53959448, + "step": 92930 + }, + { + "epoch": 13.84197199880846, + "grad_norm": 0.0380859375, + "learning_rate": 0.00786143888222907, + "loss": 0.8314, + "num_input_tokens_seen": 53962488, + "step": 92935 + }, + { + "epoch": 13.84271671134942, + "grad_norm": 0.035400390625, + "learning_rate": 0.007859724228304756, + "loss": 0.7973, + "num_input_tokens_seen": 53965592, + "step": 92940 + }, + { + "epoch": 13.843461423890378, + "grad_norm": 0.061767578125, + "learning_rate": 0.007858009695008149, + "loss": 0.8185, + "num_input_tokens_seen": 53968312, + "step": 92945 + }, + { + "epoch": 13.844206136431337, + "grad_norm": 0.044189453125, + "learning_rate": 0.007856295282368208, + "loss": 0.7838, + "num_input_tokens_seen": 53971192, + "step": 92950 + }, + { + "epoch": 13.844950848972296, + "grad_norm": 0.03515625, + "learning_rate": 0.007854580990413902, + "loss": 0.8153, + "num_input_tokens_seen": 53974008, + "step": 92955 + }, + { + "epoch": 13.845695561513256, + "grad_norm": 0.041015625, + "learning_rate": 0.00785286681917418, + "loss": 0.7705, + "num_input_tokens_seen": 53976696, + "step": 92960 + }, + { + "epoch": 13.846440274054215, + "grad_norm": 0.0322265625, + "learning_rate": 0.007851152768678017, + "loss": 0.7708, + "num_input_tokens_seen": 53979480, + "step": 92965 + }, + { + "epoch": 13.847184986595174, + "grad_norm": 0.033447265625, + "learning_rate": 0.007849438838954363, + "loss": 0.7937, + "num_input_tokens_seen": 53982360, + "step": 92970 + }, + { + "epoch": 13.847929699136133, + "grad_norm": 0.0322265625, + "learning_rate": 0.00784772503003217, + "loss": 0.7997, + "num_input_tokens_seen": 53984952, + "step": 92975 + }, + { + "epoch": 13.848674411677093, + "grad_norm": 0.032470703125, + "learning_rate": 0.007846011341940394, + "loss": 0.8259, + "num_input_tokens_seen": 53987576, + "step": 92980 + }, + { + "epoch": 13.849419124218052, + "grad_norm": 0.032958984375, + "learning_rate": 0.00784429777470798, + "loss": 0.8197, + "num_input_tokens_seen": 53990136, + "step": 92985 + }, + { + "epoch": 13.85016383675901, + "grad_norm": 0.04833984375, + "learning_rate": 0.00784258432836389, + "loss": 0.7833, + "num_input_tokens_seen": 53993304, + "step": 92990 + }, + { + "epoch": 13.85090854929997, + "grad_norm": 0.0272216796875, + "learning_rate": 0.007840871002937054, + "loss": 0.8123, + "num_input_tokens_seen": 53996440, + "step": 92995 + }, + { + "epoch": 13.85165326184093, + "grad_norm": 0.038330078125, + "learning_rate": 0.007839157798456434, + "loss": 0.8368, + "num_input_tokens_seen": 53999224, + "step": 93000 + }, + { + "epoch": 13.852397974381889, + "grad_norm": 0.06689453125, + "learning_rate": 0.007837444714950959, + "loss": 0.8604, + "num_input_tokens_seen": 54002264, + "step": 93005 + }, + { + "epoch": 13.853142686922848, + "grad_norm": 0.04150390625, + "learning_rate": 0.00783573175244958, + "loss": 0.7977, + "num_input_tokens_seen": 54004920, + "step": 93010 + }, + { + "epoch": 13.853887399463806, + "grad_norm": 0.04833984375, + "learning_rate": 0.007834018910981228, + "loss": 0.8276, + "num_input_tokens_seen": 54007704, + "step": 93015 + }, + { + "epoch": 13.854632112004767, + "grad_norm": 0.05419921875, + "learning_rate": 0.007832306190574848, + "loss": 0.7816, + "num_input_tokens_seen": 54010552, + "step": 93020 + }, + { + "epoch": 13.855376824545726, + "grad_norm": 0.0223388671875, + "learning_rate": 0.007830593591259372, + "loss": 0.7988, + "num_input_tokens_seen": 54013304, + "step": 93025 + }, + { + "epoch": 13.856121537086684, + "grad_norm": 0.04541015625, + "learning_rate": 0.007828881113063721, + "loss": 0.7986, + "num_input_tokens_seen": 54016184, + "step": 93030 + }, + { + "epoch": 13.856866249627643, + "grad_norm": 0.036376953125, + "learning_rate": 0.007827168756016844, + "loss": 0.796, + "num_input_tokens_seen": 54019128, + "step": 93035 + }, + { + "epoch": 13.857610962168604, + "grad_norm": 0.0311279296875, + "learning_rate": 0.00782545652014766, + "loss": 0.7866, + "num_input_tokens_seen": 54022072, + "step": 93040 + }, + { + "epoch": 13.858355674709562, + "grad_norm": 0.05810546875, + "learning_rate": 0.007823744405485097, + "loss": 0.7953, + "num_input_tokens_seen": 54025048, + "step": 93045 + }, + { + "epoch": 13.859100387250521, + "grad_norm": 0.033203125, + "learning_rate": 0.007822032412058071, + "loss": 0.7925, + "num_input_tokens_seen": 54027928, + "step": 93050 + }, + { + "epoch": 13.85984509979148, + "grad_norm": 0.037353515625, + "learning_rate": 0.00782032053989552, + "loss": 0.778, + "num_input_tokens_seen": 54030936, + "step": 93055 + }, + { + "epoch": 13.86058981233244, + "grad_norm": 0.04541015625, + "learning_rate": 0.007818608789026356, + "loss": 0.8169, + "num_input_tokens_seen": 54034072, + "step": 93060 + }, + { + "epoch": 13.8613345248734, + "grad_norm": 0.041015625, + "learning_rate": 0.007816897159479488, + "loss": 0.7868, + "num_input_tokens_seen": 54037144, + "step": 93065 + }, + { + "epoch": 13.862079237414358, + "grad_norm": 0.0230712890625, + "learning_rate": 0.007815185651283852, + "loss": 0.7811, + "num_input_tokens_seen": 54040056, + "step": 93070 + }, + { + "epoch": 13.862823949955317, + "grad_norm": 0.037841796875, + "learning_rate": 0.007813474264468346, + "loss": 0.7746, + "num_input_tokens_seen": 54042904, + "step": 93075 + }, + { + "epoch": 13.863568662496277, + "grad_norm": 0.0296630859375, + "learning_rate": 0.007811762999061894, + "loss": 0.7963, + "num_input_tokens_seen": 54045816, + "step": 93080 + }, + { + "epoch": 13.864313375037236, + "grad_norm": 0.029541015625, + "learning_rate": 0.007810051855093391, + "loss": 0.7834, + "num_input_tokens_seen": 54048280, + "step": 93085 + }, + { + "epoch": 13.865058087578195, + "grad_norm": 0.0625, + "learning_rate": 0.007808340832591764, + "loss": 0.8051, + "num_input_tokens_seen": 54051224, + "step": 93090 + }, + { + "epoch": 13.865802800119154, + "grad_norm": 0.05126953125, + "learning_rate": 0.007806629931585903, + "loss": 0.7732, + "num_input_tokens_seen": 54054200, + "step": 93095 + }, + { + "epoch": 13.866547512660112, + "grad_norm": 0.051025390625, + "learning_rate": 0.007804919152104724, + "loss": 0.8018, + "num_input_tokens_seen": 54056984, + "step": 93100 + }, + { + "epoch": 13.867292225201073, + "grad_norm": 0.049072265625, + "learning_rate": 0.007803208494177123, + "loss": 0.8099, + "num_input_tokens_seen": 54059640, + "step": 93105 + }, + { + "epoch": 13.868036937742032, + "grad_norm": 0.039306640625, + "learning_rate": 0.007801497957831999, + "loss": 0.7837, + "num_input_tokens_seen": 54062264, + "step": 93110 + }, + { + "epoch": 13.86878165028299, + "grad_norm": 0.0308837890625, + "learning_rate": 0.007799787543098252, + "loss": 0.7877, + "num_input_tokens_seen": 54065016, + "step": 93115 + }, + { + "epoch": 13.86952636282395, + "grad_norm": 0.034423828125, + "learning_rate": 0.00779807725000477, + "loss": 0.7754, + "num_input_tokens_seen": 54067928, + "step": 93120 + }, + { + "epoch": 13.87027107536491, + "grad_norm": 0.05126953125, + "learning_rate": 0.007796367078580459, + "loss": 0.7972, + "num_input_tokens_seen": 54070840, + "step": 93125 + }, + { + "epoch": 13.871015787905868, + "grad_norm": 0.04296875, + "learning_rate": 0.0077946570288541985, + "loss": 0.7998, + "num_input_tokens_seen": 54073496, + "step": 93130 + }, + { + "epoch": 13.871760500446827, + "grad_norm": 0.053466796875, + "learning_rate": 0.00779294710085489, + "loss": 0.7693, + "num_input_tokens_seen": 54076408, + "step": 93135 + }, + { + "epoch": 13.872505212987786, + "grad_norm": 0.03564453125, + "learning_rate": 0.007791237294611416, + "loss": 0.7911, + "num_input_tokens_seen": 54079096, + "step": 93140 + }, + { + "epoch": 13.873249925528746, + "grad_norm": 0.044677734375, + "learning_rate": 0.007789527610152655, + "loss": 0.7764, + "num_input_tokens_seen": 54081848, + "step": 93145 + }, + { + "epoch": 13.873994638069705, + "grad_norm": 0.03173828125, + "learning_rate": 0.007787818047507503, + "loss": 0.8083, + "num_input_tokens_seen": 54084984, + "step": 93150 + }, + { + "epoch": 13.874739350610664, + "grad_norm": 0.050048828125, + "learning_rate": 0.0077861086067048295, + "loss": 0.7801, + "num_input_tokens_seen": 54087736, + "step": 93155 + }, + { + "epoch": 13.875484063151623, + "grad_norm": 0.03369140625, + "learning_rate": 0.007784399287773525, + "loss": 0.8174, + "num_input_tokens_seen": 54090456, + "step": 93160 + }, + { + "epoch": 13.876228775692583, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0077826900907424605, + "loss": 0.7853, + "num_input_tokens_seen": 54093336, + "step": 93165 + }, + { + "epoch": 13.876973488233542, + "grad_norm": 0.03857421875, + "learning_rate": 0.007780981015640511, + "loss": 0.8424, + "num_input_tokens_seen": 54096376, + "step": 93170 + }, + { + "epoch": 13.8777182007745, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0077792720624965455, + "loss": 0.8106, + "num_input_tokens_seen": 54099128, + "step": 93175 + }, + { + "epoch": 13.87846291331546, + "grad_norm": 0.033447265625, + "learning_rate": 0.007777563231339444, + "loss": 0.7879, + "num_input_tokens_seen": 54102200, + "step": 93180 + }, + { + "epoch": 13.87920762585642, + "grad_norm": 0.03759765625, + "learning_rate": 0.0077758545221980726, + "loss": 0.7938, + "num_input_tokens_seen": 54104984, + "step": 93185 + }, + { + "epoch": 13.879952338397379, + "grad_norm": 0.05322265625, + "learning_rate": 0.0077741459351012895, + "loss": 0.7703, + "num_input_tokens_seen": 54107704, + "step": 93190 + }, + { + "epoch": 13.880697050938338, + "grad_norm": 0.035888671875, + "learning_rate": 0.007772437470077974, + "loss": 0.7887, + "num_input_tokens_seen": 54110680, + "step": 93195 + }, + { + "epoch": 13.881441763479296, + "grad_norm": 0.039794921875, + "learning_rate": 0.007770729127156974, + "loss": 0.8043, + "num_input_tokens_seen": 54113656, + "step": 93200 + }, + { + "epoch": 13.882186476020257, + "grad_norm": 0.03955078125, + "learning_rate": 0.007769020906367168, + "loss": 0.8297, + "num_input_tokens_seen": 54116760, + "step": 93205 + }, + { + "epoch": 13.882931188561216, + "grad_norm": 0.033447265625, + "learning_rate": 0.007767312807737393, + "loss": 0.8064, + "num_input_tokens_seen": 54119768, + "step": 93210 + }, + { + "epoch": 13.883675901102174, + "grad_norm": 0.042236328125, + "learning_rate": 0.007765604831296527, + "loss": 0.8173, + "num_input_tokens_seen": 54122712, + "step": 93215 + }, + { + "epoch": 13.884420613643133, + "grad_norm": 0.05126953125, + "learning_rate": 0.0077638969770734145, + "loss": 0.8069, + "num_input_tokens_seen": 54125464, + "step": 93220 + }, + { + "epoch": 13.885165326184094, + "grad_norm": 0.031494140625, + "learning_rate": 0.0077621892450969, + "loss": 0.7999, + "num_input_tokens_seen": 54128376, + "step": 93225 + }, + { + "epoch": 13.885910038725052, + "grad_norm": 0.05224609375, + "learning_rate": 0.007760481635395848, + "loss": 0.8099, + "num_input_tokens_seen": 54131512, + "step": 93230 + }, + { + "epoch": 13.886654751266011, + "grad_norm": 0.0301513671875, + "learning_rate": 0.007758774147999103, + "loss": 0.7938, + "num_input_tokens_seen": 54134648, + "step": 93235 + }, + { + "epoch": 13.88739946380697, + "grad_norm": 0.038330078125, + "learning_rate": 0.007757066782935508, + "loss": 0.7799, + "num_input_tokens_seen": 54137848, + "step": 93240 + }, + { + "epoch": 13.88814417634793, + "grad_norm": 0.041259765625, + "learning_rate": 0.0077553595402339035, + "loss": 0.7832, + "num_input_tokens_seen": 54140568, + "step": 93245 + }, + { + "epoch": 13.88888888888889, + "grad_norm": 0.032470703125, + "learning_rate": 0.007753652419923141, + "loss": 0.7901, + "num_input_tokens_seen": 54143736, + "step": 93250 + }, + { + "epoch": 13.889633601429848, + "grad_norm": 0.052490234375, + "learning_rate": 0.0077519454220320505, + "loss": 0.7916, + "num_input_tokens_seen": 54146488, + "step": 93255 + }, + { + "epoch": 13.890378313970807, + "grad_norm": 0.0439453125, + "learning_rate": 0.007750238546589481, + "loss": 0.8099, + "num_input_tokens_seen": 54149368, + "step": 93260 + }, + { + "epoch": 13.891123026511767, + "grad_norm": 0.0245361328125, + "learning_rate": 0.007748531793624262, + "loss": 0.8014, + "num_input_tokens_seen": 54152312, + "step": 93265 + }, + { + "epoch": 13.891867739052726, + "grad_norm": 0.034912109375, + "learning_rate": 0.007746825163165224, + "loss": 0.8087, + "num_input_tokens_seen": 54155032, + "step": 93270 + }, + { + "epoch": 13.892612451593685, + "grad_norm": 0.03271484375, + "learning_rate": 0.007745118655241208, + "loss": 0.7837, + "num_input_tokens_seen": 54158104, + "step": 93275 + }, + { + "epoch": 13.893357164134644, + "grad_norm": 0.030517578125, + "learning_rate": 0.007743412269881033, + "loss": 0.8012, + "num_input_tokens_seen": 54160728, + "step": 93280 + }, + { + "epoch": 13.894101876675602, + "grad_norm": 0.0400390625, + "learning_rate": 0.007741706007113541, + "loss": 0.7813, + "num_input_tokens_seen": 54163672, + "step": 93285 + }, + { + "epoch": 13.894846589216563, + "grad_norm": 0.035400390625, + "learning_rate": 0.00773999986696754, + "loss": 0.7743, + "num_input_tokens_seen": 54166520, + "step": 93290 + }, + { + "epoch": 13.895591301757522, + "grad_norm": 0.05615234375, + "learning_rate": 0.007738293849471869, + "loss": 0.8155, + "num_input_tokens_seen": 54169432, + "step": 93295 + }, + { + "epoch": 13.89633601429848, + "grad_norm": 0.0458984375, + "learning_rate": 0.007736587954655342, + "loss": 0.7971, + "num_input_tokens_seen": 54172472, + "step": 93300 + }, + { + "epoch": 13.89708072683944, + "grad_norm": 0.03271484375, + "learning_rate": 0.007734882182546782, + "loss": 0.7854, + "num_input_tokens_seen": 54175736, + "step": 93305 + }, + { + "epoch": 13.8978254393804, + "grad_norm": 0.134765625, + "learning_rate": 0.007733176533175002, + "loss": 0.8284, + "num_input_tokens_seen": 54178680, + "step": 93310 + }, + { + "epoch": 13.898570151921358, + "grad_norm": 0.038330078125, + "learning_rate": 0.007731471006568813, + "loss": 0.7915, + "num_input_tokens_seen": 54181784, + "step": 93315 + }, + { + "epoch": 13.899314864462317, + "grad_norm": 0.05029296875, + "learning_rate": 0.007729765602757039, + "loss": 0.7677, + "num_input_tokens_seen": 54184664, + "step": 93320 + }, + { + "epoch": 13.900059577003276, + "grad_norm": 0.031982421875, + "learning_rate": 0.007728060321768482, + "loss": 0.8173, + "num_input_tokens_seen": 54187480, + "step": 93325 + }, + { + "epoch": 13.900804289544237, + "grad_norm": 0.0235595703125, + "learning_rate": 0.007726355163631959, + "loss": 0.8104, + "num_input_tokens_seen": 54190520, + "step": 93330 + }, + { + "epoch": 13.901549002085195, + "grad_norm": 0.037841796875, + "learning_rate": 0.007724650128376269, + "loss": 0.7871, + "num_input_tokens_seen": 54193304, + "step": 93335 + }, + { + "epoch": 13.902293714626154, + "grad_norm": 0.056640625, + "learning_rate": 0.007722945216030224, + "loss": 0.7844, + "num_input_tokens_seen": 54196088, + "step": 93340 + }, + { + "epoch": 13.903038427167113, + "grad_norm": 0.03515625, + "learning_rate": 0.007721240426622626, + "loss": 0.7741, + "num_input_tokens_seen": 54198904, + "step": 93345 + }, + { + "epoch": 13.903783139708073, + "grad_norm": 0.04345703125, + "learning_rate": 0.007719535760182267, + "loss": 0.7899, + "num_input_tokens_seen": 54201752, + "step": 93350 + }, + { + "epoch": 13.904527852249032, + "grad_norm": 0.0390625, + "learning_rate": 0.007717831216737957, + "loss": 0.7853, + "num_input_tokens_seen": 54204696, + "step": 93355 + }, + { + "epoch": 13.90527256478999, + "grad_norm": 0.035400390625, + "learning_rate": 0.007716126796318481, + "loss": 0.7795, + "num_input_tokens_seen": 54207640, + "step": 93360 + }, + { + "epoch": 13.90601727733095, + "grad_norm": 0.052490234375, + "learning_rate": 0.007714422498952646, + "loss": 0.8436, + "num_input_tokens_seen": 54210584, + "step": 93365 + }, + { + "epoch": 13.90676198987191, + "grad_norm": 0.038818359375, + "learning_rate": 0.00771271832466924, + "loss": 0.7681, + "num_input_tokens_seen": 54213432, + "step": 93370 + }, + { + "epoch": 13.907506702412869, + "grad_norm": 0.03662109375, + "learning_rate": 0.0077110142734970475, + "loss": 0.78, + "num_input_tokens_seen": 54215992, + "step": 93375 + }, + { + "epoch": 13.908251414953828, + "grad_norm": 0.02294921875, + "learning_rate": 0.007709310345464864, + "loss": 0.7969, + "num_input_tokens_seen": 54218712, + "step": 93380 + }, + { + "epoch": 13.908996127494786, + "grad_norm": 0.0322265625, + "learning_rate": 0.007707606540601464, + "loss": 0.7907, + "num_input_tokens_seen": 54221592, + "step": 93385 + }, + { + "epoch": 13.909740840035747, + "grad_norm": 0.048583984375, + "learning_rate": 0.007705902858935648, + "loss": 0.7744, + "num_input_tokens_seen": 54224248, + "step": 93390 + }, + { + "epoch": 13.910485552576706, + "grad_norm": 0.058349609375, + "learning_rate": 0.007704199300496184, + "loss": 0.7991, + "num_input_tokens_seen": 54227352, + "step": 93395 + }, + { + "epoch": 13.911230265117664, + "grad_norm": 0.042724609375, + "learning_rate": 0.007702495865311862, + "loss": 0.8059, + "num_input_tokens_seen": 54230680, + "step": 93400 + }, + { + "epoch": 13.911974977658623, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00770079255341145, + "loss": 0.8198, + "num_input_tokens_seen": 54233400, + "step": 93405 + }, + { + "epoch": 13.912719690199584, + "grad_norm": 0.036376953125, + "learning_rate": 0.0076990893648237356, + "loss": 0.8177, + "num_input_tokens_seen": 54236120, + "step": 93410 + }, + { + "epoch": 13.913464402740543, + "grad_norm": 0.03369140625, + "learning_rate": 0.0076973862995774805, + "loss": 0.7935, + "num_input_tokens_seen": 54239000, + "step": 93415 + }, + { + "epoch": 13.914209115281501, + "grad_norm": 0.04541015625, + "learning_rate": 0.007695683357701467, + "loss": 0.8069, + "num_input_tokens_seen": 54241976, + "step": 93420 + }, + { + "epoch": 13.91495382782246, + "grad_norm": 0.047607421875, + "learning_rate": 0.007693980539224459, + "loss": 0.7988, + "num_input_tokens_seen": 54245208, + "step": 93425 + }, + { + "epoch": 13.915698540363419, + "grad_norm": 0.039794921875, + "learning_rate": 0.007692277844175219, + "loss": 0.809, + "num_input_tokens_seen": 54248056, + "step": 93430 + }, + { + "epoch": 13.91644325290438, + "grad_norm": 0.03369140625, + "learning_rate": 0.007690575272582522, + "loss": 0.8071, + "num_input_tokens_seen": 54250840, + "step": 93435 + }, + { + "epoch": 13.917187965445338, + "grad_norm": 0.037841796875, + "learning_rate": 0.007688872824475128, + "loss": 0.8236, + "num_input_tokens_seen": 54253912, + "step": 93440 + }, + { + "epoch": 13.917932677986297, + "grad_norm": 0.04833984375, + "learning_rate": 0.007687170499881799, + "loss": 0.7892, + "num_input_tokens_seen": 54256728, + "step": 93445 + }, + { + "epoch": 13.918677390527257, + "grad_norm": 0.031494140625, + "learning_rate": 0.007685468298831283, + "loss": 0.8008, + "num_input_tokens_seen": 54259416, + "step": 93450 + }, + { + "epoch": 13.919422103068216, + "grad_norm": 0.035888671875, + "learning_rate": 0.007683766221352352, + "loss": 0.8052, + "num_input_tokens_seen": 54262232, + "step": 93455 + }, + { + "epoch": 13.920166815609175, + "grad_norm": 0.0341796875, + "learning_rate": 0.0076820642674737556, + "loss": 0.7869, + "num_input_tokens_seen": 54265272, + "step": 93460 + }, + { + "epoch": 13.920911528150134, + "grad_norm": 0.052978515625, + "learning_rate": 0.007680362437224241, + "loss": 0.8131, + "num_input_tokens_seen": 54268408, + "step": 93465 + }, + { + "epoch": 13.921656240691092, + "grad_norm": 0.04150390625, + "learning_rate": 0.007678660730632569, + "loss": 0.7846, + "num_input_tokens_seen": 54271288, + "step": 93470 + }, + { + "epoch": 13.922400953232053, + "grad_norm": 0.042724609375, + "learning_rate": 0.007676959147727476, + "loss": 0.802, + "num_input_tokens_seen": 54274104, + "step": 93475 + }, + { + "epoch": 13.923145665773012, + "grad_norm": 0.032470703125, + "learning_rate": 0.007675257688537722, + "loss": 0.8053, + "num_input_tokens_seen": 54276920, + "step": 93480 + }, + { + "epoch": 13.92389037831397, + "grad_norm": 0.034423828125, + "learning_rate": 0.00767355635309204, + "loss": 0.7997, + "num_input_tokens_seen": 54279704, + "step": 93485 + }, + { + "epoch": 13.92463509085493, + "grad_norm": 0.034423828125, + "learning_rate": 0.00767185514141918, + "loss": 0.8086, + "num_input_tokens_seen": 54282616, + "step": 93490 + }, + { + "epoch": 13.92537980339589, + "grad_norm": 0.09765625, + "learning_rate": 0.0076701540535478825, + "loss": 0.8241, + "num_input_tokens_seen": 54285336, + "step": 93495 + }, + { + "epoch": 13.926124515936849, + "grad_norm": 0.037109375, + "learning_rate": 0.0076684530895068815, + "loss": 0.7877, + "num_input_tokens_seen": 54288312, + "step": 93500 + }, + { + "epoch": 13.926869228477807, + "grad_norm": 0.04296875, + "learning_rate": 0.007666752249324908, + "loss": 0.7944, + "num_input_tokens_seen": 54291512, + "step": 93505 + }, + { + "epoch": 13.927613941018766, + "grad_norm": 0.04150390625, + "learning_rate": 0.007665051533030707, + "loss": 0.8017, + "num_input_tokens_seen": 54294200, + "step": 93510 + }, + { + "epoch": 13.928358653559727, + "grad_norm": 0.0220947265625, + "learning_rate": 0.007663350940653006, + "loss": 0.8005, + "num_input_tokens_seen": 54296600, + "step": 93515 + }, + { + "epoch": 13.929103366100685, + "grad_norm": 0.045654296875, + "learning_rate": 0.007661650472220529, + "loss": 0.8076, + "num_input_tokens_seen": 54299320, + "step": 93520 + }, + { + "epoch": 13.929848078641644, + "grad_norm": 0.03466796875, + "learning_rate": 0.007659950127762016, + "loss": 0.7706, + "num_input_tokens_seen": 54302136, + "step": 93525 + }, + { + "epoch": 13.930592791182603, + "grad_norm": 0.045654296875, + "learning_rate": 0.007658249907306178, + "loss": 0.8039, + "num_input_tokens_seen": 54305304, + "step": 93530 + }, + { + "epoch": 13.931337503723563, + "grad_norm": 0.044677734375, + "learning_rate": 0.007656549810881752, + "loss": 0.7724, + "num_input_tokens_seen": 54308280, + "step": 93535 + }, + { + "epoch": 13.932082216264522, + "grad_norm": 0.03173828125, + "learning_rate": 0.0076548498385174525, + "loss": 0.793, + "num_input_tokens_seen": 54311160, + "step": 93540 + }, + { + "epoch": 13.932826928805481, + "grad_norm": 0.0294189453125, + "learning_rate": 0.007653149990241996, + "loss": 0.7636, + "num_input_tokens_seen": 54313944, + "step": 93545 + }, + { + "epoch": 13.93357164134644, + "grad_norm": 0.034423828125, + "learning_rate": 0.007651450266084108, + "loss": 0.8212, + "num_input_tokens_seen": 54316792, + "step": 93550 + }, + { + "epoch": 13.9343163538874, + "grad_norm": 0.0439453125, + "learning_rate": 0.007649750666072492, + "loss": 0.8043, + "num_input_tokens_seen": 54319992, + "step": 93555 + }, + { + "epoch": 13.935061066428359, + "grad_norm": 0.044677734375, + "learning_rate": 0.007648051190235875, + "loss": 0.7991, + "num_input_tokens_seen": 54322840, + "step": 93560 + }, + { + "epoch": 13.935805778969318, + "grad_norm": 0.04443359375, + "learning_rate": 0.007646351838602962, + "loss": 0.8076, + "num_input_tokens_seen": 54325848, + "step": 93565 + }, + { + "epoch": 13.936550491510276, + "grad_norm": 0.03564453125, + "learning_rate": 0.007644652611202459, + "loss": 0.7868, + "num_input_tokens_seen": 54328600, + "step": 93570 + }, + { + "epoch": 13.937295204051237, + "grad_norm": 0.032958984375, + "learning_rate": 0.00764295350806307, + "loss": 0.7977, + "num_input_tokens_seen": 54331224, + "step": 93575 + }, + { + "epoch": 13.938039916592196, + "grad_norm": 0.03564453125, + "learning_rate": 0.00764125452921351, + "loss": 0.7678, + "num_input_tokens_seen": 54334200, + "step": 93580 + }, + { + "epoch": 13.938784629133155, + "grad_norm": 0.043212890625, + "learning_rate": 0.007639555674682475, + "loss": 0.7868, + "num_input_tokens_seen": 54337016, + "step": 93585 + }, + { + "epoch": 13.939529341674113, + "grad_norm": 0.041259765625, + "learning_rate": 0.00763785694449866, + "loss": 0.7832, + "num_input_tokens_seen": 54339928, + "step": 93590 + }, + { + "epoch": 13.940274054215074, + "grad_norm": 0.033203125, + "learning_rate": 0.007636158338690776, + "loss": 0.7919, + "num_input_tokens_seen": 54342776, + "step": 93595 + }, + { + "epoch": 13.941018766756033, + "grad_norm": 0.052734375, + "learning_rate": 0.0076344598572875046, + "loss": 0.8139, + "num_input_tokens_seen": 54345752, + "step": 93600 + }, + { + "epoch": 13.941763479296991, + "grad_norm": 0.04638671875, + "learning_rate": 0.007632761500317555, + "loss": 0.8167, + "num_input_tokens_seen": 54348376, + "step": 93605 + }, + { + "epoch": 13.94250819183795, + "grad_norm": 0.0400390625, + "learning_rate": 0.0076310632678096065, + "loss": 0.7979, + "num_input_tokens_seen": 54351448, + "step": 93610 + }, + { + "epoch": 13.943252904378909, + "grad_norm": 0.040771484375, + "learning_rate": 0.007629365159792362, + "loss": 0.809, + "num_input_tokens_seen": 54354456, + "step": 93615 + }, + { + "epoch": 13.94399761691987, + "grad_norm": 0.03759765625, + "learning_rate": 0.007627667176294493, + "loss": 0.8016, + "num_input_tokens_seen": 54357432, + "step": 93620 + }, + { + "epoch": 13.944742329460828, + "grad_norm": 0.049072265625, + "learning_rate": 0.007625969317344702, + "loss": 0.8016, + "num_input_tokens_seen": 54360536, + "step": 93625 + }, + { + "epoch": 13.945487042001787, + "grad_norm": 0.0303955078125, + "learning_rate": 0.007624271582971666, + "loss": 0.8003, + "num_input_tokens_seen": 54363352, + "step": 93630 + }, + { + "epoch": 13.946231754542747, + "grad_norm": 0.051025390625, + "learning_rate": 0.007622573973204063, + "loss": 0.8224, + "num_input_tokens_seen": 54366200, + "step": 93635 + }, + { + "epoch": 13.946976467083706, + "grad_norm": 0.05029296875, + "learning_rate": 0.007620876488070576, + "loss": 0.7959, + "num_input_tokens_seen": 54368792, + "step": 93640 + }, + { + "epoch": 13.947721179624665, + "grad_norm": 0.031982421875, + "learning_rate": 0.0076191791275998745, + "loss": 0.7865, + "num_input_tokens_seen": 54371480, + "step": 93645 + }, + { + "epoch": 13.948465892165624, + "grad_norm": 0.05126953125, + "learning_rate": 0.007617481891820646, + "loss": 0.8002, + "num_input_tokens_seen": 54374488, + "step": 93650 + }, + { + "epoch": 13.949210604706582, + "grad_norm": 0.0517578125, + "learning_rate": 0.007615784780761551, + "loss": 0.83, + "num_input_tokens_seen": 54377208, + "step": 93655 + }, + { + "epoch": 13.949955317247543, + "grad_norm": 0.04931640625, + "learning_rate": 0.007614087794451274, + "loss": 0.805, + "num_input_tokens_seen": 54380088, + "step": 93660 + }, + { + "epoch": 13.950700029788502, + "grad_norm": 0.041748046875, + "learning_rate": 0.007612390932918478, + "loss": 0.7819, + "num_input_tokens_seen": 54383064, + "step": 93665 + }, + { + "epoch": 13.95144474232946, + "grad_norm": 0.04443359375, + "learning_rate": 0.007610694196191822, + "loss": 0.7964, + "num_input_tokens_seen": 54385912, + "step": 93670 + }, + { + "epoch": 13.95218945487042, + "grad_norm": 0.06396484375, + "learning_rate": 0.007608997584299981, + "loss": 0.8025, + "num_input_tokens_seen": 54388728, + "step": 93675 + }, + { + "epoch": 13.95293416741138, + "grad_norm": 0.025390625, + "learning_rate": 0.00760730109727161, + "loss": 0.8054, + "num_input_tokens_seen": 54391640, + "step": 93680 + }, + { + "epoch": 13.953678879952339, + "grad_norm": 0.05517578125, + "learning_rate": 0.007605604735135379, + "loss": 0.7982, + "num_input_tokens_seen": 54394680, + "step": 93685 + }, + { + "epoch": 13.954423592493297, + "grad_norm": 0.027587890625, + "learning_rate": 0.007603908497919936, + "loss": 0.8141, + "num_input_tokens_seen": 54397560, + "step": 93690 + }, + { + "epoch": 13.955168305034256, + "grad_norm": 0.062255859375, + "learning_rate": 0.007602212385653946, + "loss": 0.802, + "num_input_tokens_seen": 54400408, + "step": 93695 + }, + { + "epoch": 13.955913017575217, + "grad_norm": 0.04345703125, + "learning_rate": 0.0076005163983660605, + "loss": 0.8128, + "num_input_tokens_seen": 54403256, + "step": 93700 + }, + { + "epoch": 13.956657730116175, + "grad_norm": 0.044921875, + "learning_rate": 0.00759882053608493, + "loss": 0.8097, + "num_input_tokens_seen": 54406040, + "step": 93705 + }, + { + "epoch": 13.957402442657134, + "grad_norm": 0.0230712890625, + "learning_rate": 0.007597124798839202, + "loss": 0.7792, + "num_input_tokens_seen": 54408856, + "step": 93710 + }, + { + "epoch": 13.958147155198093, + "grad_norm": 0.0546875, + "learning_rate": 0.007595429186657523, + "loss": 0.8134, + "num_input_tokens_seen": 54411544, + "step": 93715 + }, + { + "epoch": 13.958891867739053, + "grad_norm": 0.04638671875, + "learning_rate": 0.007593733699568546, + "loss": 0.7927, + "num_input_tokens_seen": 54414296, + "step": 93720 + }, + { + "epoch": 13.959636580280012, + "grad_norm": 0.046630859375, + "learning_rate": 0.007592038337600907, + "loss": 0.7944, + "num_input_tokens_seen": 54416984, + "step": 93725 + }, + { + "epoch": 13.960381292820971, + "grad_norm": 0.046630859375, + "learning_rate": 0.007590343100783258, + "loss": 0.7975, + "num_input_tokens_seen": 54419864, + "step": 93730 + }, + { + "epoch": 13.96112600536193, + "grad_norm": 0.041259765625, + "learning_rate": 0.007588647989144223, + "loss": 0.7996, + "num_input_tokens_seen": 54422808, + "step": 93735 + }, + { + "epoch": 13.96187071790289, + "grad_norm": 0.05224609375, + "learning_rate": 0.007586953002712453, + "loss": 0.8115, + "num_input_tokens_seen": 54426136, + "step": 93740 + }, + { + "epoch": 13.962615430443849, + "grad_norm": 0.05126953125, + "learning_rate": 0.00758525814151658, + "loss": 0.791, + "num_input_tokens_seen": 54428984, + "step": 93745 + }, + { + "epoch": 13.963360142984808, + "grad_norm": 0.040283203125, + "learning_rate": 0.0075835634055852275, + "loss": 0.8081, + "num_input_tokens_seen": 54432088, + "step": 93750 + }, + { + "epoch": 13.964104855525767, + "grad_norm": 0.044677734375, + "learning_rate": 0.00758186879494704, + "loss": 0.805, + "num_input_tokens_seen": 54434904, + "step": 93755 + }, + { + "epoch": 13.964849568066727, + "grad_norm": 0.042724609375, + "learning_rate": 0.007580174309630634, + "loss": 0.805, + "num_input_tokens_seen": 54437720, + "step": 93760 + }, + { + "epoch": 13.965594280607686, + "grad_norm": 0.0439453125, + "learning_rate": 0.007578479949664647, + "loss": 0.7979, + "num_input_tokens_seen": 54440792, + "step": 93765 + }, + { + "epoch": 13.966338993148645, + "grad_norm": 0.03369140625, + "learning_rate": 0.007576785715077698, + "loss": 0.7993, + "num_input_tokens_seen": 54443704, + "step": 93770 + }, + { + "epoch": 13.967083705689603, + "grad_norm": 0.034912109375, + "learning_rate": 0.007575091605898411, + "loss": 0.7809, + "num_input_tokens_seen": 54446392, + "step": 93775 + }, + { + "epoch": 13.967828418230564, + "grad_norm": 0.04638671875, + "learning_rate": 0.0075733976221554, + "loss": 0.8077, + "num_input_tokens_seen": 54449304, + "step": 93780 + }, + { + "epoch": 13.968573130771523, + "grad_norm": 0.03125, + "learning_rate": 0.007571703763877291, + "loss": 0.8047, + "num_input_tokens_seen": 54452056, + "step": 93785 + }, + { + "epoch": 13.969317843312481, + "grad_norm": 0.04736328125, + "learning_rate": 0.007570010031092698, + "loss": 0.8132, + "num_input_tokens_seen": 54454968, + "step": 93790 + }, + { + "epoch": 13.97006255585344, + "grad_norm": 0.035400390625, + "learning_rate": 0.00756831642383023, + "loss": 0.8152, + "num_input_tokens_seen": 54457752, + "step": 93795 + }, + { + "epoch": 13.970807268394399, + "grad_norm": 0.06396484375, + "learning_rate": 0.007566622942118508, + "loss": 0.7952, + "num_input_tokens_seen": 54460440, + "step": 93800 + }, + { + "epoch": 13.97155198093536, + "grad_norm": 0.0341796875, + "learning_rate": 0.007564929585986129, + "loss": 0.7742, + "num_input_tokens_seen": 54463256, + "step": 93805 + }, + { + "epoch": 13.972296693476318, + "grad_norm": 0.0322265625, + "learning_rate": 0.007563236355461715, + "loss": 0.7834, + "num_input_tokens_seen": 54465912, + "step": 93810 + }, + { + "epoch": 13.973041406017277, + "grad_norm": 0.037109375, + "learning_rate": 0.007561543250573859, + "loss": 0.8085, + "num_input_tokens_seen": 54468568, + "step": 93815 + }, + { + "epoch": 13.973786118558236, + "grad_norm": 0.037109375, + "learning_rate": 0.007559850271351175, + "loss": 0.8036, + "num_input_tokens_seen": 54471192, + "step": 93820 + }, + { + "epoch": 13.974530831099196, + "grad_norm": 0.041259765625, + "learning_rate": 0.007558157417822259, + "loss": 0.7786, + "num_input_tokens_seen": 54474072, + "step": 93825 + }, + { + "epoch": 13.975275543640155, + "grad_norm": 0.05078125, + "learning_rate": 0.00755646469001571, + "loss": 0.7821, + "num_input_tokens_seen": 54477080, + "step": 93830 + }, + { + "epoch": 13.976020256181114, + "grad_norm": 0.02490234375, + "learning_rate": 0.00755477208796012, + "loss": 0.8173, + "num_input_tokens_seen": 54480152, + "step": 93835 + }, + { + "epoch": 13.976764968722073, + "grad_norm": 0.034912109375, + "learning_rate": 0.007553079611684092, + "loss": 0.8095, + "num_input_tokens_seen": 54483288, + "step": 93840 + }, + { + "epoch": 13.977509681263033, + "grad_norm": 0.0341796875, + "learning_rate": 0.007551387261216216, + "loss": 0.7941, + "num_input_tokens_seen": 54486456, + "step": 93845 + }, + { + "epoch": 13.978254393803992, + "grad_norm": 0.04150390625, + "learning_rate": 0.0075496950365850765, + "loss": 0.7801, + "num_input_tokens_seen": 54489208, + "step": 93850 + }, + { + "epoch": 13.97899910634495, + "grad_norm": 0.040283203125, + "learning_rate": 0.007548002937819273, + "loss": 0.7962, + "num_input_tokens_seen": 54492408, + "step": 93855 + }, + { + "epoch": 13.97974381888591, + "grad_norm": 0.0263671875, + "learning_rate": 0.00754631096494738, + "loss": 0.7734, + "num_input_tokens_seen": 54495352, + "step": 93860 + }, + { + "epoch": 13.98048853142687, + "grad_norm": 0.038818359375, + "learning_rate": 0.007544619117997993, + "loss": 0.7951, + "num_input_tokens_seen": 54498232, + "step": 93865 + }, + { + "epoch": 13.981233243967829, + "grad_norm": 0.041015625, + "learning_rate": 0.00754292739699969, + "loss": 0.806, + "num_input_tokens_seen": 54501304, + "step": 93870 + }, + { + "epoch": 13.981977956508787, + "grad_norm": 0.0400390625, + "learning_rate": 0.007541235801981043, + "loss": 0.819, + "num_input_tokens_seen": 54503960, + "step": 93875 + }, + { + "epoch": 13.982722669049746, + "grad_norm": 0.039794921875, + "learning_rate": 0.007539544332970642, + "loss": 0.8019, + "num_input_tokens_seen": 54506872, + "step": 93880 + }, + { + "epoch": 13.983467381590707, + "grad_norm": 0.035888671875, + "learning_rate": 0.007537852989997051, + "loss": 0.798, + "num_input_tokens_seen": 54510008, + "step": 93885 + }, + { + "epoch": 13.984212094131665, + "grad_norm": 0.032958984375, + "learning_rate": 0.0075361617730888555, + "loss": 0.7816, + "num_input_tokens_seen": 54513048, + "step": 93890 + }, + { + "epoch": 13.984956806672624, + "grad_norm": 0.041015625, + "learning_rate": 0.007534470682274622, + "loss": 0.8184, + "num_input_tokens_seen": 54515768, + "step": 93895 + }, + { + "epoch": 13.985701519213583, + "grad_norm": 0.03466796875, + "learning_rate": 0.007532779717582917, + "loss": 0.8043, + "num_input_tokens_seen": 54518712, + "step": 93900 + }, + { + "epoch": 13.986446231754543, + "grad_norm": 0.2021484375, + "learning_rate": 0.00753108887904231, + "loss": 0.8086, + "num_input_tokens_seen": 54521944, + "step": 93905 + }, + { + "epoch": 13.987190944295502, + "grad_norm": 0.05419921875, + "learning_rate": 0.00752939816668136, + "loss": 0.7938, + "num_input_tokens_seen": 54524696, + "step": 93910 + }, + { + "epoch": 13.987935656836461, + "grad_norm": 0.041015625, + "learning_rate": 0.0075277075805286415, + "loss": 0.8103, + "num_input_tokens_seen": 54527448, + "step": 93915 + }, + { + "epoch": 13.98868036937742, + "grad_norm": 0.023193359375, + "learning_rate": 0.007526017120612701, + "loss": 0.8001, + "num_input_tokens_seen": 54530456, + "step": 93920 + }, + { + "epoch": 13.98942508191838, + "grad_norm": 0.050048828125, + "learning_rate": 0.007524326786962114, + "loss": 0.8003, + "num_input_tokens_seen": 54533240, + "step": 93925 + }, + { + "epoch": 13.990169794459339, + "grad_norm": 0.04736328125, + "learning_rate": 0.007522636579605419, + "loss": 0.7835, + "num_input_tokens_seen": 54535864, + "step": 93930 + }, + { + "epoch": 13.990914507000298, + "grad_norm": 0.047119140625, + "learning_rate": 0.007520946498571187, + "loss": 0.7874, + "num_input_tokens_seen": 54538616, + "step": 93935 + }, + { + "epoch": 13.991659219541257, + "grad_norm": 0.0244140625, + "learning_rate": 0.007519256543887956, + "loss": 0.7907, + "num_input_tokens_seen": 54541304, + "step": 93940 + }, + { + "epoch": 13.992403932082215, + "grad_norm": 0.02734375, + "learning_rate": 0.007517566715584288, + "loss": 0.7839, + "num_input_tokens_seen": 54544152, + "step": 93945 + }, + { + "epoch": 13.993148644623176, + "grad_norm": 0.052734375, + "learning_rate": 0.007515877013688726, + "loss": 0.7868, + "num_input_tokens_seen": 54547032, + "step": 93950 + }, + { + "epoch": 13.993893357164135, + "grad_norm": 0.032470703125, + "learning_rate": 0.00751418743822981, + "loss": 0.7984, + "num_input_tokens_seen": 54549816, + "step": 93955 + }, + { + "epoch": 13.994638069705093, + "grad_norm": 0.046630859375, + "learning_rate": 0.007512497989236097, + "loss": 0.8068, + "num_input_tokens_seen": 54552760, + "step": 93960 + }, + { + "epoch": 13.995382782246054, + "grad_norm": 0.04296875, + "learning_rate": 0.007510808666736116, + "loss": 0.7956, + "num_input_tokens_seen": 54555736, + "step": 93965 + }, + { + "epoch": 13.996127494787013, + "grad_norm": 0.03466796875, + "learning_rate": 0.007509119470758413, + "loss": 0.8256, + "num_input_tokens_seen": 54558680, + "step": 93970 + }, + { + "epoch": 13.996872207327971, + "grad_norm": 0.025390625, + "learning_rate": 0.007507430401331517, + "loss": 0.7962, + "num_input_tokens_seen": 54561656, + "step": 93975 + }, + { + "epoch": 13.99761691986893, + "grad_norm": 0.048095703125, + "learning_rate": 0.007505741458483976, + "loss": 0.7933, + "num_input_tokens_seen": 54564408, + "step": 93980 + }, + { + "epoch": 13.998361632409889, + "grad_norm": 0.048828125, + "learning_rate": 0.0075040526422443165, + "loss": 0.8037, + "num_input_tokens_seen": 54567736, + "step": 93985 + }, + { + "epoch": 13.99910634495085, + "grad_norm": 0.0299072265625, + "learning_rate": 0.007502363952641061, + "loss": 0.7968, + "num_input_tokens_seen": 54570552, + "step": 93990 + }, + { + "epoch": 13.999851057491808, + "grad_norm": 0.05078125, + "learning_rate": 0.0075006753897027545, + "loss": 0.8153, + "num_input_tokens_seen": 54573752, + "step": 93995 + }, + { + "epoch": 14.0, + "eval_loss": 0.7991399168968201, + "eval_runtime": 70.5269, + "eval_samples_per_second": 42.31, + "eval_steps_per_second": 10.578, + "num_input_tokens_seen": 54573896, + "step": 93996 + }, + { + "epoch": 14.000595770032767, + "grad_norm": 0.049560546875, + "learning_rate": 0.007498986953457908, + "loss": 0.7965, + "num_input_tokens_seen": 54576136, + "step": 94000 + }, + { + "epoch": 14.001340482573726, + "grad_norm": 0.0289306640625, + "learning_rate": 0.007497298643935058, + "loss": 0.7961, + "num_input_tokens_seen": 54579176, + "step": 94005 + }, + { + "epoch": 14.002085195114686, + "grad_norm": 0.045654296875, + "learning_rate": 0.007495610461162719, + "loss": 0.7897, + "num_input_tokens_seen": 54581896, + "step": 94010 + }, + { + "epoch": 14.002829907655645, + "grad_norm": 0.050048828125, + "learning_rate": 0.007493922405169418, + "loss": 0.8156, + "num_input_tokens_seen": 54584648, + "step": 94015 + }, + { + "epoch": 14.003574620196604, + "grad_norm": 0.035888671875, + "learning_rate": 0.007492234475983665, + "loss": 0.7974, + "num_input_tokens_seen": 54587400, + "step": 94020 + }, + { + "epoch": 14.004319332737563, + "grad_norm": 0.057373046875, + "learning_rate": 0.0074905466736339835, + "loss": 0.8129, + "num_input_tokens_seen": 54590728, + "step": 94025 + }, + { + "epoch": 14.005064045278523, + "grad_norm": 0.04150390625, + "learning_rate": 0.007488858998148884, + "loss": 0.7934, + "num_input_tokens_seen": 54593704, + "step": 94030 + }, + { + "epoch": 14.005808757819482, + "grad_norm": 0.0216064453125, + "learning_rate": 0.0074871714495568795, + "loss": 0.7936, + "num_input_tokens_seen": 54596584, + "step": 94035 + }, + { + "epoch": 14.00655347036044, + "grad_norm": 0.04443359375, + "learning_rate": 0.007485484027886476, + "loss": 0.8029, + "num_input_tokens_seen": 54599208, + "step": 94040 + }, + { + "epoch": 14.0072981829014, + "grad_norm": 0.058349609375, + "learning_rate": 0.007483796733166178, + "loss": 0.802, + "num_input_tokens_seen": 54602152, + "step": 94045 + }, + { + "epoch": 14.00804289544236, + "grad_norm": 0.0419921875, + "learning_rate": 0.0074821095654244996, + "loss": 0.8196, + "num_input_tokens_seen": 54604776, + "step": 94050 + }, + { + "epoch": 14.008787607983319, + "grad_norm": 0.0223388671875, + "learning_rate": 0.007480422524689934, + "loss": 0.8165, + "num_input_tokens_seen": 54607528, + "step": 94055 + }, + { + "epoch": 14.009532320524277, + "grad_norm": 0.03662109375, + "learning_rate": 0.007478735610990994, + "loss": 0.8165, + "num_input_tokens_seen": 54610664, + "step": 94060 + }, + { + "epoch": 14.010277033065236, + "grad_norm": 0.060302734375, + "learning_rate": 0.007477048824356172, + "loss": 0.7776, + "num_input_tokens_seen": 54613672, + "step": 94065 + }, + { + "epoch": 14.011021745606197, + "grad_norm": 0.0361328125, + "learning_rate": 0.007475362164813956, + "loss": 0.7931, + "num_input_tokens_seen": 54616520, + "step": 94070 + }, + { + "epoch": 14.011766458147155, + "grad_norm": 0.042724609375, + "learning_rate": 0.007473675632392855, + "loss": 0.812, + "num_input_tokens_seen": 54619208, + "step": 94075 + }, + { + "epoch": 14.012511170688114, + "grad_norm": 0.03564453125, + "learning_rate": 0.0074719892271213504, + "loss": 0.7731, + "num_input_tokens_seen": 54622152, + "step": 94080 + }, + { + "epoch": 14.013255883229073, + "grad_norm": 0.0849609375, + "learning_rate": 0.007470302949027942, + "loss": 0.7773, + "num_input_tokens_seen": 54625096, + "step": 94085 + }, + { + "epoch": 14.014000595770034, + "grad_norm": 0.0400390625, + "learning_rate": 0.007468616798141105, + "loss": 0.793, + "num_input_tokens_seen": 54627880, + "step": 94090 + }, + { + "epoch": 14.014745308310992, + "grad_norm": 0.03125, + "learning_rate": 0.00746693077448934, + "loss": 0.7814, + "num_input_tokens_seen": 54630696, + "step": 94095 + }, + { + "epoch": 14.015490020851951, + "grad_norm": 0.046875, + "learning_rate": 0.007465244878101123, + "loss": 0.7874, + "num_input_tokens_seen": 54633768, + "step": 94100 + }, + { + "epoch": 14.01623473339291, + "grad_norm": 0.06787109375, + "learning_rate": 0.007463559109004936, + "loss": 0.7799, + "num_input_tokens_seen": 54636616, + "step": 94105 + }, + { + "epoch": 14.01697944593387, + "grad_norm": 0.052490234375, + "learning_rate": 0.007461873467229258, + "loss": 0.765, + "num_input_tokens_seen": 54639464, + "step": 94110 + }, + { + "epoch": 14.017724158474829, + "grad_norm": 0.0478515625, + "learning_rate": 0.007460187952802559, + "loss": 0.7889, + "num_input_tokens_seen": 54642600, + "step": 94115 + }, + { + "epoch": 14.018468871015788, + "grad_norm": 0.053466796875, + "learning_rate": 0.007458502565753329, + "loss": 0.77, + "num_input_tokens_seen": 54645480, + "step": 94120 + }, + { + "epoch": 14.019213583556747, + "grad_norm": 0.04541015625, + "learning_rate": 0.007456817306110028, + "loss": 0.7874, + "num_input_tokens_seen": 54648776, + "step": 94125 + }, + { + "epoch": 14.019958296097707, + "grad_norm": 0.046875, + "learning_rate": 0.007455132173901137, + "loss": 0.7556, + "num_input_tokens_seen": 54651432, + "step": 94130 + }, + { + "epoch": 14.020703008638666, + "grad_norm": 0.058349609375, + "learning_rate": 0.007453447169155113, + "loss": 0.8188, + "num_input_tokens_seen": 54654504, + "step": 94135 + }, + { + "epoch": 14.021447721179625, + "grad_norm": 0.05615234375, + "learning_rate": 0.007451762291900436, + "loss": 0.7931, + "num_input_tokens_seen": 54657256, + "step": 94140 + }, + { + "epoch": 14.022192433720583, + "grad_norm": 0.039306640625, + "learning_rate": 0.007450077542165564, + "loss": 0.7905, + "num_input_tokens_seen": 54660424, + "step": 94145 + }, + { + "epoch": 14.022937146261542, + "grad_norm": 0.035400390625, + "learning_rate": 0.0074483929199789505, + "loss": 0.8206, + "num_input_tokens_seen": 54663592, + "step": 94150 + }, + { + "epoch": 14.023681858802503, + "grad_norm": 0.05712890625, + "learning_rate": 0.007446708425369071, + "loss": 0.8091, + "num_input_tokens_seen": 54666472, + "step": 94155 + }, + { + "epoch": 14.024426571343461, + "grad_norm": 0.030517578125, + "learning_rate": 0.0074450240583643764, + "loss": 0.7886, + "num_input_tokens_seen": 54669704, + "step": 94160 + }, + { + "epoch": 14.02517128388442, + "grad_norm": 0.04443359375, + "learning_rate": 0.007443339818993314, + "loss": 0.8084, + "num_input_tokens_seen": 54672904, + "step": 94165 + }, + { + "epoch": 14.025915996425379, + "grad_norm": 0.03857421875, + "learning_rate": 0.007441655707284352, + "loss": 0.7798, + "num_input_tokens_seen": 54675976, + "step": 94170 + }, + { + "epoch": 14.02666070896634, + "grad_norm": 0.0308837890625, + "learning_rate": 0.007439971723265935, + "loss": 0.8033, + "num_input_tokens_seen": 54678952, + "step": 94175 + }, + { + "epoch": 14.027405421507298, + "grad_norm": 0.037109375, + "learning_rate": 0.007438287866966507, + "loss": 0.8012, + "num_input_tokens_seen": 54681672, + "step": 94180 + }, + { + "epoch": 14.028150134048257, + "grad_norm": 0.034423828125, + "learning_rate": 0.007436604138414525, + "loss": 0.7705, + "num_input_tokens_seen": 54684520, + "step": 94185 + }, + { + "epoch": 14.028894846589216, + "grad_norm": 0.03662109375, + "learning_rate": 0.00743492053763843, + "loss": 0.7949, + "num_input_tokens_seen": 54687208, + "step": 94190 + }, + { + "epoch": 14.029639559130176, + "grad_norm": 0.04541015625, + "learning_rate": 0.007433237064666656, + "loss": 0.7804, + "num_input_tokens_seen": 54689768, + "step": 94195 + }, + { + "epoch": 14.030384271671135, + "grad_norm": 0.044921875, + "learning_rate": 0.007431553719527658, + "loss": 0.7831, + "num_input_tokens_seen": 54692776, + "step": 94200 + }, + { + "epoch": 14.031128984212094, + "grad_norm": 0.029296875, + "learning_rate": 0.007429870502249861, + "loss": 0.7673, + "num_input_tokens_seen": 54695464, + "step": 94205 + }, + { + "epoch": 14.031873696753053, + "grad_norm": 0.045166015625, + "learning_rate": 0.0074281874128617155, + "loss": 0.7595, + "num_input_tokens_seen": 54698312, + "step": 94210 + }, + { + "epoch": 14.032618409294013, + "grad_norm": 0.0390625, + "learning_rate": 0.0074265044513916415, + "loss": 0.7753, + "num_input_tokens_seen": 54701480, + "step": 94215 + }, + { + "epoch": 14.033363121834972, + "grad_norm": 0.046630859375, + "learning_rate": 0.007424821617868082, + "loss": 0.8011, + "num_input_tokens_seen": 54704456, + "step": 94220 + }, + { + "epoch": 14.03410783437593, + "grad_norm": 0.05029296875, + "learning_rate": 0.007423138912319464, + "loss": 0.7846, + "num_input_tokens_seen": 54707240, + "step": 94225 + }, + { + "epoch": 14.03485254691689, + "grad_norm": 0.06298828125, + "learning_rate": 0.0074214563347742134, + "loss": 0.7721, + "num_input_tokens_seen": 54710120, + "step": 94230 + }, + { + "epoch": 14.03559725945785, + "grad_norm": 0.0458984375, + "learning_rate": 0.007419773885260755, + "loss": 0.79, + "num_input_tokens_seen": 54712808, + "step": 94235 + }, + { + "epoch": 14.036341971998809, + "grad_norm": 0.046875, + "learning_rate": 0.007418091563807505, + "loss": 0.7879, + "num_input_tokens_seen": 54715848, + "step": 94240 + }, + { + "epoch": 14.037086684539767, + "grad_norm": 0.0595703125, + "learning_rate": 0.0074164093704429, + "loss": 0.7942, + "num_input_tokens_seen": 54718920, + "step": 94245 + }, + { + "epoch": 14.037831397080726, + "grad_norm": 0.052490234375, + "learning_rate": 0.007414727305195345, + "loss": 0.7854, + "num_input_tokens_seen": 54721928, + "step": 94250 + }, + { + "epoch": 14.038576109621687, + "grad_norm": 0.08837890625, + "learning_rate": 0.007413045368093268, + "loss": 0.8018, + "num_input_tokens_seen": 54724712, + "step": 94255 + }, + { + "epoch": 14.039320822162646, + "grad_norm": 0.046142578125, + "learning_rate": 0.0074113635591650726, + "loss": 0.782, + "num_input_tokens_seen": 54727496, + "step": 94260 + }, + { + "epoch": 14.040065534703604, + "grad_norm": 0.072265625, + "learning_rate": 0.007409681878439184, + "loss": 0.8522, + "num_input_tokens_seen": 54730312, + "step": 94265 + }, + { + "epoch": 14.040810247244563, + "grad_norm": 0.043701171875, + "learning_rate": 0.007408000325944005, + "loss": 0.7897, + "num_input_tokens_seen": 54733512, + "step": 94270 + }, + { + "epoch": 14.041554959785524, + "grad_norm": 0.039306640625, + "learning_rate": 0.0074063189017079395, + "loss": 0.8376, + "num_input_tokens_seen": 54736488, + "step": 94275 + }, + { + "epoch": 14.042299672326482, + "grad_norm": 0.033935546875, + "learning_rate": 0.007404637605759402, + "loss": 0.8029, + "num_input_tokens_seen": 54739368, + "step": 94280 + }, + { + "epoch": 14.043044384867441, + "grad_norm": 0.0625, + "learning_rate": 0.007402956438126789, + "loss": 0.7946, + "num_input_tokens_seen": 54742152, + "step": 94285 + }, + { + "epoch": 14.0437890974084, + "grad_norm": 0.04443359375, + "learning_rate": 0.00740127539883851, + "loss": 0.798, + "num_input_tokens_seen": 54745224, + "step": 94290 + }, + { + "epoch": 14.04453380994936, + "grad_norm": 0.057861328125, + "learning_rate": 0.00739959448792296, + "loss": 0.8069, + "num_input_tokens_seen": 54748008, + "step": 94295 + }, + { + "epoch": 14.04527852249032, + "grad_norm": 0.049072265625, + "learning_rate": 0.007397913705408537, + "loss": 0.7838, + "num_input_tokens_seen": 54750888, + "step": 94300 + }, + { + "epoch": 14.046023235031278, + "grad_norm": 0.037109375, + "learning_rate": 0.007396233051323629, + "loss": 0.7802, + "num_input_tokens_seen": 54753928, + "step": 94305 + }, + { + "epoch": 14.046767947572237, + "grad_norm": 0.040283203125, + "learning_rate": 0.007394552525696642, + "loss": 0.8282, + "num_input_tokens_seen": 54757160, + "step": 94310 + }, + { + "epoch": 14.047512660113195, + "grad_norm": 0.039794921875, + "learning_rate": 0.00739287212855596, + "loss": 0.8006, + "num_input_tokens_seen": 54760040, + "step": 94315 + }, + { + "epoch": 14.048257372654156, + "grad_norm": 0.039794921875, + "learning_rate": 0.007391191859929967, + "loss": 0.807, + "num_input_tokens_seen": 54763304, + "step": 94320 + }, + { + "epoch": 14.049002085195115, + "grad_norm": 0.07080078125, + "learning_rate": 0.00738951171984706, + "loss": 0.8084, + "num_input_tokens_seen": 54766248, + "step": 94325 + }, + { + "epoch": 14.049746797736073, + "grad_norm": 0.056640625, + "learning_rate": 0.00738783170833561, + "loss": 0.7982, + "num_input_tokens_seen": 54769000, + "step": 94330 + }, + { + "epoch": 14.050491510277032, + "grad_norm": 0.049560546875, + "learning_rate": 0.0073861518254240135, + "loss": 0.8144, + "num_input_tokens_seen": 54771816, + "step": 94335 + }, + { + "epoch": 14.051236222817993, + "grad_norm": 0.043701171875, + "learning_rate": 0.007384472071140637, + "loss": 0.8175, + "num_input_tokens_seen": 54774536, + "step": 94340 + }, + { + "epoch": 14.051980935358952, + "grad_norm": 0.07958984375, + "learning_rate": 0.0073827924455138715, + "loss": 0.7961, + "num_input_tokens_seen": 54777416, + "step": 94345 + }, + { + "epoch": 14.05272564789991, + "grad_norm": 0.05322265625, + "learning_rate": 0.007381112948572087, + "loss": 0.7782, + "num_input_tokens_seen": 54780168, + "step": 94350 + }, + { + "epoch": 14.053470360440869, + "grad_norm": 0.03857421875, + "learning_rate": 0.0073794335803436485, + "loss": 0.7915, + "num_input_tokens_seen": 54783144, + "step": 94355 + }, + { + "epoch": 14.05421507298183, + "grad_norm": 0.03759765625, + "learning_rate": 0.007377754340856938, + "loss": 0.8069, + "num_input_tokens_seen": 54786024, + "step": 94360 + }, + { + "epoch": 14.054959785522788, + "grad_norm": 0.04150390625, + "learning_rate": 0.007376075230140325, + "loss": 0.8074, + "num_input_tokens_seen": 54789000, + "step": 94365 + }, + { + "epoch": 14.055704498063747, + "grad_norm": 0.03125, + "learning_rate": 0.007374396248222171, + "loss": 0.8173, + "num_input_tokens_seen": 54791912, + "step": 94370 + }, + { + "epoch": 14.056449210604706, + "grad_norm": 0.0517578125, + "learning_rate": 0.007372717395130836, + "loss": 0.7855, + "num_input_tokens_seen": 54794856, + "step": 94375 + }, + { + "epoch": 14.057193923145666, + "grad_norm": 0.0625, + "learning_rate": 0.007371038670894694, + "loss": 0.7882, + "num_input_tokens_seen": 54797768, + "step": 94380 + }, + { + "epoch": 14.057938635686625, + "grad_norm": 0.059326171875, + "learning_rate": 0.0073693600755420936, + "loss": 0.8133, + "num_input_tokens_seen": 54800712, + "step": 94385 + }, + { + "epoch": 14.058683348227584, + "grad_norm": 0.05322265625, + "learning_rate": 0.0073676816091014065, + "loss": 0.793, + "num_input_tokens_seen": 54803400, + "step": 94390 + }, + { + "epoch": 14.059428060768543, + "grad_norm": 0.07470703125, + "learning_rate": 0.007366003271600982, + "loss": 0.7839, + "num_input_tokens_seen": 54806312, + "step": 94395 + }, + { + "epoch": 14.060172773309503, + "grad_norm": 0.053955078125, + "learning_rate": 0.007364325063069165, + "loss": 0.8147, + "num_input_tokens_seen": 54809480, + "step": 94400 + }, + { + "epoch": 14.060917485850462, + "grad_norm": 0.044921875, + "learning_rate": 0.007362646983534324, + "loss": 0.7869, + "num_input_tokens_seen": 54812392, + "step": 94405 + }, + { + "epoch": 14.06166219839142, + "grad_norm": 0.052734375, + "learning_rate": 0.007360969033024792, + "loss": 0.7919, + "num_input_tokens_seen": 54815304, + "step": 94410 + }, + { + "epoch": 14.06240691093238, + "grad_norm": 0.0281982421875, + "learning_rate": 0.007359291211568931, + "loss": 0.7895, + "num_input_tokens_seen": 54818216, + "step": 94415 + }, + { + "epoch": 14.06315162347334, + "grad_norm": 0.053466796875, + "learning_rate": 0.007357613519195075, + "loss": 0.7877, + "num_input_tokens_seen": 54820872, + "step": 94420 + }, + { + "epoch": 14.063896336014299, + "grad_norm": 0.052001953125, + "learning_rate": 0.007355935955931575, + "loss": 0.8089, + "num_input_tokens_seen": 54823720, + "step": 94425 + }, + { + "epoch": 14.064641048555258, + "grad_norm": 0.046630859375, + "learning_rate": 0.007354258521806768, + "loss": 0.7914, + "num_input_tokens_seen": 54826536, + "step": 94430 + }, + { + "epoch": 14.065385761096216, + "grad_norm": 0.037353515625, + "learning_rate": 0.007352581216848994, + "loss": 0.7857, + "num_input_tokens_seen": 54829352, + "step": 94435 + }, + { + "epoch": 14.066130473637177, + "grad_norm": 0.025390625, + "learning_rate": 0.007350904041086586, + "loss": 0.7935, + "num_input_tokens_seen": 54832200, + "step": 94440 + }, + { + "epoch": 14.066875186178136, + "grad_norm": 0.0546875, + "learning_rate": 0.007349226994547875, + "loss": 0.7851, + "num_input_tokens_seen": 54835144, + "step": 94445 + }, + { + "epoch": 14.067619898719094, + "grad_norm": 0.037109375, + "learning_rate": 0.007347550077261204, + "loss": 0.7992, + "num_input_tokens_seen": 54838120, + "step": 94450 + }, + { + "epoch": 14.068364611260053, + "grad_norm": 0.0380859375, + "learning_rate": 0.00734587328925489, + "loss": 0.8079, + "num_input_tokens_seen": 54840840, + "step": 94455 + }, + { + "epoch": 14.069109323801014, + "grad_norm": 0.035888671875, + "learning_rate": 0.007344196630557275, + "loss": 0.7959, + "num_input_tokens_seen": 54843848, + "step": 94460 + }, + { + "epoch": 14.069854036341972, + "grad_norm": 0.09765625, + "learning_rate": 0.007342520101196671, + "loss": 0.8178, + "num_input_tokens_seen": 54846728, + "step": 94465 + }, + { + "epoch": 14.070598748882931, + "grad_norm": 0.03662109375, + "learning_rate": 0.007340843701201412, + "loss": 0.7881, + "num_input_tokens_seen": 54849768, + "step": 94470 + }, + { + "epoch": 14.07134346142389, + "grad_norm": 0.061767578125, + "learning_rate": 0.007339167430599813, + "loss": 0.801, + "num_input_tokens_seen": 54852680, + "step": 94475 + }, + { + "epoch": 14.07208817396485, + "grad_norm": 0.027587890625, + "learning_rate": 0.007337491289420192, + "loss": 0.8199, + "num_input_tokens_seen": 54856040, + "step": 94480 + }, + { + "epoch": 14.07283288650581, + "grad_norm": 0.047607421875, + "learning_rate": 0.00733581527769087, + "loss": 0.7754, + "num_input_tokens_seen": 54859208, + "step": 94485 + }, + { + "epoch": 14.073577599046768, + "grad_norm": 0.042236328125, + "learning_rate": 0.007334139395440162, + "loss": 0.7959, + "num_input_tokens_seen": 54862056, + "step": 94490 + }, + { + "epoch": 14.074322311587727, + "grad_norm": 0.05029296875, + "learning_rate": 0.00733246364269637, + "loss": 0.8156, + "num_input_tokens_seen": 54865128, + "step": 94495 + }, + { + "epoch": 14.075067024128685, + "grad_norm": 0.034912109375, + "learning_rate": 0.00733078801948782, + "loss": 0.7776, + "num_input_tokens_seen": 54867976, + "step": 94500 + }, + { + "epoch": 14.075811736669646, + "grad_norm": 0.03857421875, + "learning_rate": 0.007329112525842809, + "loss": 0.784, + "num_input_tokens_seen": 54870824, + "step": 94505 + }, + { + "epoch": 14.076556449210605, + "grad_norm": 0.0439453125, + "learning_rate": 0.007327437161789647, + "loss": 0.7697, + "num_input_tokens_seen": 54874056, + "step": 94510 + }, + { + "epoch": 14.077301161751564, + "grad_norm": 0.06396484375, + "learning_rate": 0.00732576192735663, + "loss": 0.8058, + "num_input_tokens_seen": 54876744, + "step": 94515 + }, + { + "epoch": 14.078045874292522, + "grad_norm": 0.0537109375, + "learning_rate": 0.007324086822572071, + "loss": 0.7792, + "num_input_tokens_seen": 54879656, + "step": 94520 + }, + { + "epoch": 14.078790586833483, + "grad_norm": 0.06103515625, + "learning_rate": 0.0073224118474642585, + "loss": 0.7835, + "num_input_tokens_seen": 54882600, + "step": 94525 + }, + { + "epoch": 14.079535299374442, + "grad_norm": 0.1943359375, + "learning_rate": 0.007320737002061501, + "loss": 0.874, + "num_input_tokens_seen": 54885352, + "step": 94530 + }, + { + "epoch": 14.0802800119154, + "grad_norm": 0.05615234375, + "learning_rate": 0.007319062286392079, + "loss": 0.7968, + "num_input_tokens_seen": 54888264, + "step": 94535 + }, + { + "epoch": 14.081024724456359, + "grad_norm": 0.043212890625, + "learning_rate": 0.0073173877004843, + "loss": 0.7994, + "num_input_tokens_seen": 54891336, + "step": 94540 + }, + { + "epoch": 14.08176943699732, + "grad_norm": 0.03125, + "learning_rate": 0.0073157132443664435, + "loss": 0.7622, + "num_input_tokens_seen": 54894280, + "step": 94545 + }, + { + "epoch": 14.082514149538278, + "grad_norm": 0.03125, + "learning_rate": 0.007314038918066804, + "loss": 0.7924, + "num_input_tokens_seen": 54897608, + "step": 94550 + }, + { + "epoch": 14.083258862079237, + "grad_norm": 0.045166015625, + "learning_rate": 0.007312364721613669, + "loss": 0.794, + "num_input_tokens_seen": 54900296, + "step": 94555 + }, + { + "epoch": 14.084003574620196, + "grad_norm": 0.038818359375, + "learning_rate": 0.007310690655035317, + "loss": 0.8042, + "num_input_tokens_seen": 54903112, + "step": 94560 + }, + { + "epoch": 14.084748287161156, + "grad_norm": 0.032470703125, + "learning_rate": 0.00730901671836003, + "loss": 0.767, + "num_input_tokens_seen": 54906184, + "step": 94565 + }, + { + "epoch": 14.085492999702115, + "grad_norm": 0.0439453125, + "learning_rate": 0.007307342911616085, + "loss": 0.8189, + "num_input_tokens_seen": 54908904, + "step": 94570 + }, + { + "epoch": 14.086237712243074, + "grad_norm": 0.044677734375, + "learning_rate": 0.007305669234831768, + "loss": 0.7889, + "num_input_tokens_seen": 54911816, + "step": 94575 + }, + { + "epoch": 14.086982424784033, + "grad_norm": 0.048095703125, + "learning_rate": 0.007303995688035345, + "loss": 0.7916, + "num_input_tokens_seen": 54914600, + "step": 94580 + }, + { + "epoch": 14.087727137324993, + "grad_norm": 0.04248046875, + "learning_rate": 0.007302322271255097, + "loss": 0.7946, + "num_input_tokens_seen": 54917640, + "step": 94585 + }, + { + "epoch": 14.088471849865952, + "grad_norm": 0.04736328125, + "learning_rate": 0.00730064898451929, + "loss": 0.8088, + "num_input_tokens_seen": 54920392, + "step": 94590 + }, + { + "epoch": 14.08921656240691, + "grad_norm": 0.072265625, + "learning_rate": 0.007298975827856189, + "loss": 0.7822, + "num_input_tokens_seen": 54923080, + "step": 94595 + }, + { + "epoch": 14.08996127494787, + "grad_norm": 0.05908203125, + "learning_rate": 0.007297302801294069, + "loss": 0.7799, + "num_input_tokens_seen": 54926024, + "step": 94600 + }, + { + "epoch": 14.09070598748883, + "grad_norm": 0.06396484375, + "learning_rate": 0.007295629904861184, + "loss": 0.7864, + "num_input_tokens_seen": 54929064, + "step": 94605 + }, + { + "epoch": 14.091450700029789, + "grad_norm": 0.05859375, + "learning_rate": 0.007293957138585808, + "loss": 0.826, + "num_input_tokens_seen": 54932104, + "step": 94610 + }, + { + "epoch": 14.092195412570748, + "grad_norm": 0.0478515625, + "learning_rate": 0.0072922845024961875, + "loss": 0.8036, + "num_input_tokens_seen": 54934824, + "step": 94615 + }, + { + "epoch": 14.092940125111706, + "grad_norm": 0.0478515625, + "learning_rate": 0.007290611996620592, + "loss": 0.7901, + "num_input_tokens_seen": 54937768, + "step": 94620 + }, + { + "epoch": 14.093684837652667, + "grad_norm": 0.0498046875, + "learning_rate": 0.0072889396209872715, + "loss": 0.7831, + "num_input_tokens_seen": 54940712, + "step": 94625 + }, + { + "epoch": 14.094429550193626, + "grad_norm": 0.034423828125, + "learning_rate": 0.00728726737562448, + "loss": 0.7901, + "num_input_tokens_seen": 54943464, + "step": 94630 + }, + { + "epoch": 14.095174262734584, + "grad_norm": 0.044677734375, + "learning_rate": 0.007285595260560466, + "loss": 0.8002, + "num_input_tokens_seen": 54946184, + "step": 94635 + }, + { + "epoch": 14.095918975275543, + "grad_norm": 0.057861328125, + "learning_rate": 0.007283923275823475, + "loss": 0.8153, + "num_input_tokens_seen": 54948968, + "step": 94640 + }, + { + "epoch": 14.096663687816504, + "grad_norm": 0.04150390625, + "learning_rate": 0.0072822514214417634, + "loss": 0.7882, + "num_input_tokens_seen": 54952040, + "step": 94645 + }, + { + "epoch": 14.097408400357462, + "grad_norm": 0.033935546875, + "learning_rate": 0.0072805796974435635, + "loss": 0.7939, + "num_input_tokens_seen": 54954984, + "step": 94650 + }, + { + "epoch": 14.098153112898421, + "grad_norm": 0.03662109375, + "learning_rate": 0.00727890810385713, + "loss": 0.7821, + "num_input_tokens_seen": 54958184, + "step": 94655 + }, + { + "epoch": 14.09889782543938, + "grad_norm": 0.05078125, + "learning_rate": 0.007277236640710691, + "loss": 0.7871, + "num_input_tokens_seen": 54961384, + "step": 94660 + }, + { + "epoch": 14.099642537980339, + "grad_norm": 0.046142578125, + "learning_rate": 0.0072755653080324965, + "loss": 0.7908, + "num_input_tokens_seen": 54964040, + "step": 94665 + }, + { + "epoch": 14.1003872505213, + "grad_norm": 0.057373046875, + "learning_rate": 0.007273894105850774, + "loss": 0.8013, + "num_input_tokens_seen": 54966824, + "step": 94670 + }, + { + "epoch": 14.101131963062258, + "grad_norm": 0.045654296875, + "learning_rate": 0.007272223034193753, + "loss": 0.7728, + "num_input_tokens_seen": 54969704, + "step": 94675 + }, + { + "epoch": 14.101876675603217, + "grad_norm": 0.047119140625, + "learning_rate": 0.007270552093089675, + "loss": 0.7905, + "num_input_tokens_seen": 54972296, + "step": 94680 + }, + { + "epoch": 14.102621388144176, + "grad_norm": 0.05126953125, + "learning_rate": 0.0072688812825667595, + "loss": 0.7991, + "num_input_tokens_seen": 54975208, + "step": 94685 + }, + { + "epoch": 14.103366100685136, + "grad_norm": 0.0478515625, + "learning_rate": 0.007267210602653242, + "loss": 0.7761, + "num_input_tokens_seen": 54977864, + "step": 94690 + }, + { + "epoch": 14.104110813226095, + "grad_norm": 0.05126953125, + "learning_rate": 0.007265540053377341, + "loss": 0.7876, + "num_input_tokens_seen": 54980712, + "step": 94695 + }, + { + "epoch": 14.104855525767054, + "grad_norm": 0.0517578125, + "learning_rate": 0.0072638696347672805, + "loss": 0.7932, + "num_input_tokens_seen": 54983400, + "step": 94700 + }, + { + "epoch": 14.105600238308012, + "grad_norm": 0.032470703125, + "learning_rate": 0.007262199346851274, + "loss": 0.781, + "num_input_tokens_seen": 54986248, + "step": 94705 + }, + { + "epoch": 14.106344950848973, + "grad_norm": 0.042236328125, + "learning_rate": 0.007260529189657552, + "loss": 0.8112, + "num_input_tokens_seen": 54989704, + "step": 94710 + }, + { + "epoch": 14.107089663389932, + "grad_norm": 0.0517578125, + "learning_rate": 0.007258859163214323, + "loss": 0.8105, + "num_input_tokens_seen": 54992776, + "step": 94715 + }, + { + "epoch": 14.10783437593089, + "grad_norm": 0.0458984375, + "learning_rate": 0.007257189267549793, + "loss": 0.8012, + "num_input_tokens_seen": 54995880, + "step": 94720 + }, + { + "epoch": 14.10857908847185, + "grad_norm": 0.03466796875, + "learning_rate": 0.0072555195026921876, + "loss": 0.8444, + "num_input_tokens_seen": 54998472, + "step": 94725 + }, + { + "epoch": 14.10932380101281, + "grad_norm": 0.06494140625, + "learning_rate": 0.0072538498686697045, + "loss": 0.8081, + "num_input_tokens_seen": 55001384, + "step": 94730 + }, + { + "epoch": 14.110068513553768, + "grad_norm": 0.037841796875, + "learning_rate": 0.007252180365510561, + "loss": 0.812, + "num_input_tokens_seen": 55004264, + "step": 94735 + }, + { + "epoch": 14.110813226094727, + "grad_norm": 0.05224609375, + "learning_rate": 0.007250510993242948, + "loss": 0.795, + "num_input_tokens_seen": 55007368, + "step": 94740 + }, + { + "epoch": 14.111557938635686, + "grad_norm": 0.056640625, + "learning_rate": 0.0072488417518950824, + "loss": 0.8052, + "num_input_tokens_seen": 55010024, + "step": 94745 + }, + { + "epoch": 14.112302651176647, + "grad_norm": 0.06494140625, + "learning_rate": 0.007247172641495157, + "loss": 0.7769, + "num_input_tokens_seen": 55012872, + "step": 94750 + }, + { + "epoch": 14.113047363717605, + "grad_norm": 0.07763671875, + "learning_rate": 0.007245503662071366, + "loss": 0.8415, + "num_input_tokens_seen": 55015784, + "step": 94755 + }, + { + "epoch": 14.113792076258564, + "grad_norm": 0.04150390625, + "learning_rate": 0.0072438348136519125, + "loss": 0.7826, + "num_input_tokens_seen": 55018600, + "step": 94760 + }, + { + "epoch": 14.114536788799523, + "grad_norm": 0.03466796875, + "learning_rate": 0.007242166096264989, + "loss": 0.8252, + "num_input_tokens_seen": 55021576, + "step": 94765 + }, + { + "epoch": 14.115281501340483, + "grad_norm": 0.056884765625, + "learning_rate": 0.007240497509938782, + "loss": 0.7661, + "num_input_tokens_seen": 55024136, + "step": 94770 + }, + { + "epoch": 14.116026213881442, + "grad_norm": 0.0537109375, + "learning_rate": 0.00723882905470148, + "loss": 0.7958, + "num_input_tokens_seen": 55027112, + "step": 94775 + }, + { + "epoch": 14.1167709264224, + "grad_norm": 0.0615234375, + "learning_rate": 0.007237160730581278, + "loss": 0.7964, + "num_input_tokens_seen": 55030088, + "step": 94780 + }, + { + "epoch": 14.11751563896336, + "grad_norm": 0.050537109375, + "learning_rate": 0.0072354925376063505, + "loss": 0.8104, + "num_input_tokens_seen": 55032872, + "step": 94785 + }, + { + "epoch": 14.11826035150432, + "grad_norm": 0.08837890625, + "learning_rate": 0.007233824475804891, + "loss": 0.7962, + "num_input_tokens_seen": 55036168, + "step": 94790 + }, + { + "epoch": 14.119005064045279, + "grad_norm": 0.0595703125, + "learning_rate": 0.007232156545205071, + "loss": 0.8049, + "num_input_tokens_seen": 55039112, + "step": 94795 + }, + { + "epoch": 14.119749776586238, + "grad_norm": 0.06103515625, + "learning_rate": 0.0072304887458350675, + "loss": 0.7946, + "num_input_tokens_seen": 55042024, + "step": 94800 + }, + { + "epoch": 14.120494489127196, + "grad_norm": 0.06640625, + "learning_rate": 0.0072288210777230666, + "loss": 0.8102, + "num_input_tokens_seen": 55044968, + "step": 94805 + }, + { + "epoch": 14.121239201668157, + "grad_norm": 0.052490234375, + "learning_rate": 0.00722715354089723, + "loss": 0.7956, + "num_input_tokens_seen": 55047912, + "step": 94810 + }, + { + "epoch": 14.121983914209116, + "grad_norm": 0.05029296875, + "learning_rate": 0.007225486135385738, + "loss": 0.7807, + "num_input_tokens_seen": 55050984, + "step": 94815 + }, + { + "epoch": 14.122728626750074, + "grad_norm": 0.031005859375, + "learning_rate": 0.0072238188612167575, + "loss": 0.7922, + "num_input_tokens_seen": 55053832, + "step": 94820 + }, + { + "epoch": 14.123473339291033, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0072221517184184476, + "loss": 0.7897, + "num_input_tokens_seen": 55056712, + "step": 94825 + }, + { + "epoch": 14.124218051831992, + "grad_norm": 0.0400390625, + "learning_rate": 0.007220484707018984, + "loss": 0.784, + "num_input_tokens_seen": 55059432, + "step": 94830 + }, + { + "epoch": 14.124962764372953, + "grad_norm": 0.045654296875, + "learning_rate": 0.0072188178270465265, + "loss": 0.8103, + "num_input_tokens_seen": 55062600, + "step": 94835 + }, + { + "epoch": 14.125707476913911, + "grad_norm": 0.036865234375, + "learning_rate": 0.007217151078529233, + "loss": 0.8016, + "num_input_tokens_seen": 55065416, + "step": 94840 + }, + { + "epoch": 14.12645218945487, + "grad_norm": 0.037841796875, + "learning_rate": 0.007215484461495255, + "loss": 0.8048, + "num_input_tokens_seen": 55068104, + "step": 94845 + }, + { + "epoch": 14.127196901995829, + "grad_norm": 0.04541015625, + "learning_rate": 0.007213817975972762, + "loss": 0.8115, + "num_input_tokens_seen": 55070920, + "step": 94850 + }, + { + "epoch": 14.12794161453679, + "grad_norm": 0.05029296875, + "learning_rate": 0.0072121516219898945, + "loss": 0.7757, + "num_input_tokens_seen": 55073896, + "step": 94855 + }, + { + "epoch": 14.128686327077748, + "grad_norm": 0.0439453125, + "learning_rate": 0.007210485399574816, + "loss": 0.7884, + "num_input_tokens_seen": 55076776, + "step": 94860 + }, + { + "epoch": 14.129431039618707, + "grad_norm": 0.029296875, + "learning_rate": 0.007208819308755665, + "loss": 0.7769, + "num_input_tokens_seen": 55079592, + "step": 94865 + }, + { + "epoch": 14.130175752159666, + "grad_norm": 0.03564453125, + "learning_rate": 0.007207153349560597, + "loss": 0.7896, + "num_input_tokens_seen": 55082376, + "step": 94870 + }, + { + "epoch": 14.130920464700626, + "grad_norm": 0.0859375, + "learning_rate": 0.0072054875220177545, + "loss": 0.7855, + "num_input_tokens_seen": 55084936, + "step": 94875 + }, + { + "epoch": 14.131665177241585, + "grad_norm": 0.057373046875, + "learning_rate": 0.007203821826155273, + "loss": 0.8034, + "num_input_tokens_seen": 55087688, + "step": 94880 + }, + { + "epoch": 14.132409889782544, + "grad_norm": 0.05419921875, + "learning_rate": 0.007202156262001301, + "loss": 0.7961, + "num_input_tokens_seen": 55090632, + "step": 94885 + }, + { + "epoch": 14.133154602323502, + "grad_norm": 0.032958984375, + "learning_rate": 0.0072004908295839765, + "loss": 0.7792, + "num_input_tokens_seen": 55093928, + "step": 94890 + }, + { + "epoch": 14.133899314864463, + "grad_norm": 0.046875, + "learning_rate": 0.007198825528931431, + "loss": 0.8033, + "num_input_tokens_seen": 55096712, + "step": 94895 + }, + { + "epoch": 14.134644027405422, + "grad_norm": 0.0267333984375, + "learning_rate": 0.007197160360071795, + "loss": 0.81, + "num_input_tokens_seen": 55099624, + "step": 94900 + }, + { + "epoch": 14.13538873994638, + "grad_norm": 0.05712890625, + "learning_rate": 0.0071954953230332075, + "loss": 0.804, + "num_input_tokens_seen": 55102760, + "step": 94905 + }, + { + "epoch": 14.13613345248734, + "grad_norm": 0.05712890625, + "learning_rate": 0.007193830417843791, + "loss": 0.7928, + "num_input_tokens_seen": 55105640, + "step": 94910 + }, + { + "epoch": 14.1368781650283, + "grad_norm": 0.041015625, + "learning_rate": 0.00719216564453168, + "loss": 0.7875, + "num_input_tokens_seen": 55108200, + "step": 94915 + }, + { + "epoch": 14.137622877569259, + "grad_norm": 0.057861328125, + "learning_rate": 0.007190501003124995, + "loss": 0.8007, + "num_input_tokens_seen": 55111176, + "step": 94920 + }, + { + "epoch": 14.138367590110217, + "grad_norm": 0.05224609375, + "learning_rate": 0.007188836493651853, + "loss": 0.8048, + "num_input_tokens_seen": 55114088, + "step": 94925 + }, + { + "epoch": 14.139112302651176, + "grad_norm": 0.08935546875, + "learning_rate": 0.007187172116140386, + "loss": 0.785, + "num_input_tokens_seen": 55117128, + "step": 94930 + }, + { + "epoch": 14.139857015192137, + "grad_norm": 0.0439453125, + "learning_rate": 0.007185507870618699, + "loss": 0.781, + "num_input_tokens_seen": 55120360, + "step": 94935 + }, + { + "epoch": 14.140601727733095, + "grad_norm": 0.044677734375, + "learning_rate": 0.00718384375711492, + "loss": 0.8006, + "num_input_tokens_seen": 55123240, + "step": 94940 + }, + { + "epoch": 14.141346440274054, + "grad_norm": 0.049560546875, + "learning_rate": 0.007182179775657151, + "loss": 0.7873, + "num_input_tokens_seen": 55126120, + "step": 94945 + }, + { + "epoch": 14.142091152815013, + "grad_norm": 0.047119140625, + "learning_rate": 0.007180515926273515, + "loss": 0.7806, + "num_input_tokens_seen": 55129128, + "step": 94950 + }, + { + "epoch": 14.142835865355973, + "grad_norm": 0.0267333984375, + "learning_rate": 0.007178852208992115, + "loss": 0.8014, + "num_input_tokens_seen": 55132232, + "step": 94955 + }, + { + "epoch": 14.143580577896932, + "grad_norm": 0.044677734375, + "learning_rate": 0.007177188623841057, + "loss": 0.782, + "num_input_tokens_seen": 55134920, + "step": 94960 + }, + { + "epoch": 14.14432529043789, + "grad_norm": 0.033203125, + "learning_rate": 0.007175525170848448, + "loss": 0.8032, + "num_input_tokens_seen": 55137832, + "step": 94965 + }, + { + "epoch": 14.14507000297885, + "grad_norm": 0.048095703125, + "learning_rate": 0.007173861850042382, + "loss": 0.7833, + "num_input_tokens_seen": 55140616, + "step": 94970 + }, + { + "epoch": 14.14581471551981, + "grad_norm": 0.048828125, + "learning_rate": 0.007172198661450972, + "loss": 0.8194, + "num_input_tokens_seen": 55143528, + "step": 94975 + }, + { + "epoch": 14.146559428060769, + "grad_norm": 0.056396484375, + "learning_rate": 0.007170535605102304, + "loss": 0.7976, + "num_input_tokens_seen": 55146568, + "step": 94980 + }, + { + "epoch": 14.147304140601728, + "grad_norm": 0.072265625, + "learning_rate": 0.007168872681024486, + "loss": 0.7716, + "num_input_tokens_seen": 55149256, + "step": 94985 + }, + { + "epoch": 14.148048853142686, + "grad_norm": 0.048095703125, + "learning_rate": 0.007167209889245597, + "loss": 0.7999, + "num_input_tokens_seen": 55152392, + "step": 94990 + }, + { + "epoch": 14.148793565683647, + "grad_norm": 0.154296875, + "learning_rate": 0.007165547229793743, + "loss": 0.8409, + "num_input_tokens_seen": 55154952, + "step": 94995 + }, + { + "epoch": 14.149538278224606, + "grad_norm": 0.049560546875, + "learning_rate": 0.007163884702697007, + "loss": 0.803, + "num_input_tokens_seen": 55157736, + "step": 95000 + }, + { + "epoch": 14.150282990765565, + "grad_norm": 0.07666015625, + "learning_rate": 0.007162222307983467, + "loss": 0.8022, + "num_input_tokens_seen": 55160808, + "step": 95005 + }, + { + "epoch": 14.151027703306523, + "grad_norm": 0.047119140625, + "learning_rate": 0.0071605600456812225, + "loss": 0.8123, + "num_input_tokens_seen": 55163368, + "step": 95010 + }, + { + "epoch": 14.151772415847482, + "grad_norm": 0.0361328125, + "learning_rate": 0.007158897915818341, + "loss": 0.7981, + "num_input_tokens_seen": 55166088, + "step": 95015 + }, + { + "epoch": 14.152517128388443, + "grad_norm": 0.056640625, + "learning_rate": 0.007157235918422916, + "loss": 0.8082, + "num_input_tokens_seen": 55168776, + "step": 95020 + }, + { + "epoch": 14.153261840929401, + "grad_norm": 0.04736328125, + "learning_rate": 0.007155574053523022, + "loss": 0.7785, + "num_input_tokens_seen": 55171560, + "step": 95025 + }, + { + "epoch": 14.15400655347036, + "grad_norm": 0.0458984375, + "learning_rate": 0.007153912321146728, + "loss": 0.7971, + "num_input_tokens_seen": 55174536, + "step": 95030 + }, + { + "epoch": 14.154751266011319, + "grad_norm": 0.031494140625, + "learning_rate": 0.007152250721322112, + "loss": 0.7965, + "num_input_tokens_seen": 55177608, + "step": 95035 + }, + { + "epoch": 14.15549597855228, + "grad_norm": 0.036865234375, + "learning_rate": 0.007150589254077239, + "loss": 0.7893, + "num_input_tokens_seen": 55180264, + "step": 95040 + }, + { + "epoch": 14.156240691093238, + "grad_norm": 0.038330078125, + "learning_rate": 0.007148927919440187, + "loss": 0.7916, + "num_input_tokens_seen": 55183240, + "step": 95045 + }, + { + "epoch": 14.156985403634197, + "grad_norm": 0.046630859375, + "learning_rate": 0.007147266717439015, + "loss": 0.804, + "num_input_tokens_seen": 55186056, + "step": 95050 + }, + { + "epoch": 14.157730116175156, + "grad_norm": 0.0556640625, + "learning_rate": 0.007145605648101794, + "loss": 0.7881, + "num_input_tokens_seen": 55189064, + "step": 95055 + }, + { + "epoch": 14.158474828716116, + "grad_norm": 0.05029296875, + "learning_rate": 0.007143944711456579, + "loss": 0.7857, + "num_input_tokens_seen": 55191624, + "step": 95060 + }, + { + "epoch": 14.159219541257075, + "grad_norm": 0.0498046875, + "learning_rate": 0.007142283907531436, + "loss": 0.8047, + "num_input_tokens_seen": 55194696, + "step": 95065 + }, + { + "epoch": 14.159964253798034, + "grad_norm": 0.042724609375, + "learning_rate": 0.007140623236354419, + "loss": 0.7953, + "num_input_tokens_seen": 55197800, + "step": 95070 + }, + { + "epoch": 14.160708966338992, + "grad_norm": 0.043701171875, + "learning_rate": 0.007138962697953588, + "loss": 0.7874, + "num_input_tokens_seen": 55200616, + "step": 95075 + }, + { + "epoch": 14.161453678879953, + "grad_norm": 0.053466796875, + "learning_rate": 0.007137302292356991, + "loss": 0.7854, + "num_input_tokens_seen": 55203688, + "step": 95080 + }, + { + "epoch": 14.162198391420912, + "grad_norm": 0.055908203125, + "learning_rate": 0.0071356420195926775, + "loss": 0.8018, + "num_input_tokens_seen": 55206504, + "step": 95085 + }, + { + "epoch": 14.16294310396187, + "grad_norm": 0.042724609375, + "learning_rate": 0.007133981879688704, + "loss": 0.7962, + "num_input_tokens_seen": 55209384, + "step": 95090 + }, + { + "epoch": 14.16368781650283, + "grad_norm": 0.054931640625, + "learning_rate": 0.00713232187267311, + "loss": 0.795, + "num_input_tokens_seen": 55212168, + "step": 95095 + }, + { + "epoch": 14.16443252904379, + "grad_norm": 0.027587890625, + "learning_rate": 0.007130661998573944, + "loss": 0.7927, + "num_input_tokens_seen": 55215048, + "step": 95100 + }, + { + "epoch": 14.165177241584749, + "grad_norm": 0.0537109375, + "learning_rate": 0.0071290022574192384, + "loss": 0.8023, + "num_input_tokens_seen": 55217896, + "step": 95105 + }, + { + "epoch": 14.165921954125707, + "grad_norm": 0.037841796875, + "learning_rate": 0.007127342649237046, + "loss": 0.81, + "num_input_tokens_seen": 55220904, + "step": 95110 + }, + { + "epoch": 14.166666666666666, + "grad_norm": 0.034912109375, + "learning_rate": 0.007125683174055399, + "loss": 0.8029, + "num_input_tokens_seen": 55223784, + "step": 95115 + }, + { + "epoch": 14.167411379207627, + "grad_norm": 0.03955078125, + "learning_rate": 0.007124023831902326, + "loss": 0.7874, + "num_input_tokens_seen": 55226600, + "step": 95120 + }, + { + "epoch": 14.168156091748585, + "grad_norm": 0.037109375, + "learning_rate": 0.00712236462280587, + "loss": 0.7893, + "num_input_tokens_seen": 55229640, + "step": 95125 + }, + { + "epoch": 14.168900804289544, + "grad_norm": 0.080078125, + "learning_rate": 0.007120705546794052, + "loss": 0.7899, + "num_input_tokens_seen": 55232680, + "step": 95130 + }, + { + "epoch": 14.169645516830503, + "grad_norm": 0.05859375, + "learning_rate": 0.007119046603894912, + "loss": 0.814, + "num_input_tokens_seen": 55235624, + "step": 95135 + }, + { + "epoch": 14.170390229371463, + "grad_norm": 0.06494140625, + "learning_rate": 0.007117387794136463, + "loss": 0.8025, + "num_input_tokens_seen": 55238536, + "step": 95140 + }, + { + "epoch": 14.171134941912422, + "grad_norm": 0.051513671875, + "learning_rate": 0.007115729117546742, + "loss": 0.7967, + "num_input_tokens_seen": 55242088, + "step": 95145 + }, + { + "epoch": 14.171879654453381, + "grad_norm": 0.0849609375, + "learning_rate": 0.007114070574153765, + "loss": 0.7978, + "num_input_tokens_seen": 55245128, + "step": 95150 + }, + { + "epoch": 14.17262436699434, + "grad_norm": 0.024658203125, + "learning_rate": 0.007112412163985544, + "loss": 0.7955, + "num_input_tokens_seen": 55247880, + "step": 95155 + }, + { + "epoch": 14.1733690795353, + "grad_norm": 0.041259765625, + "learning_rate": 0.007110753887070108, + "loss": 0.81, + "num_input_tokens_seen": 55250952, + "step": 95160 + }, + { + "epoch": 14.174113792076259, + "grad_norm": 0.038818359375, + "learning_rate": 0.007109095743435468, + "loss": 0.7855, + "num_input_tokens_seen": 55253800, + "step": 95165 + }, + { + "epoch": 14.174858504617218, + "grad_norm": 0.06982421875, + "learning_rate": 0.007107437733109636, + "loss": 0.7925, + "num_input_tokens_seen": 55256712, + "step": 95170 + }, + { + "epoch": 14.175603217158177, + "grad_norm": 0.06298828125, + "learning_rate": 0.007105779856120616, + "loss": 0.8189, + "num_input_tokens_seen": 55259528, + "step": 95175 + }, + { + "epoch": 14.176347929699135, + "grad_norm": 0.03466796875, + "learning_rate": 0.007104122112496426, + "loss": 0.7927, + "num_input_tokens_seen": 55262440, + "step": 95180 + }, + { + "epoch": 14.177092642240096, + "grad_norm": 0.046142578125, + "learning_rate": 0.007102464502265064, + "loss": 0.7963, + "num_input_tokens_seen": 55265128, + "step": 95185 + }, + { + "epoch": 14.177837354781055, + "grad_norm": 0.0537109375, + "learning_rate": 0.007100807025454542, + "loss": 0.7857, + "num_input_tokens_seen": 55268040, + "step": 95190 + }, + { + "epoch": 14.178582067322013, + "grad_norm": 0.046142578125, + "learning_rate": 0.007099149682092858, + "loss": 0.7998, + "num_input_tokens_seen": 55270888, + "step": 95195 + }, + { + "epoch": 14.179326779862972, + "grad_norm": 0.034423828125, + "learning_rate": 0.007097492472208002, + "loss": 0.8053, + "num_input_tokens_seen": 55273608, + "step": 95200 + }, + { + "epoch": 14.180071492403933, + "grad_norm": 0.0390625, + "learning_rate": 0.007095835395827988, + "loss": 0.8099, + "num_input_tokens_seen": 55276840, + "step": 95205 + }, + { + "epoch": 14.180816204944891, + "grad_norm": 0.05419921875, + "learning_rate": 0.007094178452980794, + "loss": 0.7915, + "num_input_tokens_seen": 55279688, + "step": 95210 + }, + { + "epoch": 14.18156091748585, + "grad_norm": 0.04296875, + "learning_rate": 0.007092521643694426, + "loss": 0.7887, + "num_input_tokens_seen": 55282664, + "step": 95215 + }, + { + "epoch": 14.182305630026809, + "grad_norm": 0.0517578125, + "learning_rate": 0.007090864967996867, + "loss": 0.7961, + "num_input_tokens_seen": 55285640, + "step": 95220 + }, + { + "epoch": 14.18305034256777, + "grad_norm": 0.025634765625, + "learning_rate": 0.0070892084259161076, + "loss": 0.7979, + "num_input_tokens_seen": 55288648, + "step": 95225 + }, + { + "epoch": 14.183795055108728, + "grad_norm": 0.038330078125, + "learning_rate": 0.007087552017480125, + "loss": 0.7964, + "num_input_tokens_seen": 55291880, + "step": 95230 + }, + { + "epoch": 14.184539767649687, + "grad_norm": 0.0673828125, + "learning_rate": 0.007085895742716915, + "loss": 0.7789, + "num_input_tokens_seen": 55294984, + "step": 95235 + }, + { + "epoch": 14.185284480190646, + "grad_norm": 0.1630859375, + "learning_rate": 0.007084239601654453, + "loss": 0.8312, + "num_input_tokens_seen": 55297992, + "step": 95240 + }, + { + "epoch": 14.186029192731606, + "grad_norm": 0.051025390625, + "learning_rate": 0.0070825835943207115, + "loss": 0.7823, + "num_input_tokens_seen": 55301096, + "step": 95245 + }, + { + "epoch": 14.186773905272565, + "grad_norm": 0.032470703125, + "learning_rate": 0.007080927720743679, + "loss": 0.7939, + "num_input_tokens_seen": 55304072, + "step": 95250 + }, + { + "epoch": 14.187518617813524, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00707927198095132, + "loss": 0.7875, + "num_input_tokens_seen": 55307272, + "step": 95255 + }, + { + "epoch": 14.188263330354483, + "grad_norm": 0.0478515625, + "learning_rate": 0.007077616374971614, + "loss": 0.7986, + "num_input_tokens_seen": 55310216, + "step": 95260 + }, + { + "epoch": 14.189008042895443, + "grad_norm": 0.057373046875, + "learning_rate": 0.007075960902832523, + "loss": 0.7911, + "num_input_tokens_seen": 55313000, + "step": 95265 + }, + { + "epoch": 14.189752755436402, + "grad_norm": 0.053466796875, + "learning_rate": 0.007074305564562025, + "loss": 0.8468, + "num_input_tokens_seen": 55316232, + "step": 95270 + }, + { + "epoch": 14.19049746797736, + "grad_norm": 0.031494140625, + "learning_rate": 0.007072650360188079, + "loss": 0.8125, + "num_input_tokens_seen": 55319432, + "step": 95275 + }, + { + "epoch": 14.19124218051832, + "grad_norm": 0.051513671875, + "learning_rate": 0.007070995289738641, + "loss": 0.7887, + "num_input_tokens_seen": 55322248, + "step": 95280 + }, + { + "epoch": 14.19198689305928, + "grad_norm": 0.028564453125, + "learning_rate": 0.007069340353241685, + "loss": 0.7941, + "num_input_tokens_seen": 55325224, + "step": 95285 + }, + { + "epoch": 14.192731605600239, + "grad_norm": 0.05322265625, + "learning_rate": 0.007067685550725166, + "loss": 0.7999, + "num_input_tokens_seen": 55328072, + "step": 95290 + }, + { + "epoch": 14.193476318141197, + "grad_norm": 0.0419921875, + "learning_rate": 0.007066030882217035, + "loss": 0.7677, + "num_input_tokens_seen": 55330952, + "step": 95295 + }, + { + "epoch": 14.194221030682156, + "grad_norm": 0.02880859375, + "learning_rate": 0.007064376347745244, + "loss": 0.8003, + "num_input_tokens_seen": 55333768, + "step": 95300 + }, + { + "epoch": 14.194965743223117, + "grad_norm": 0.04345703125, + "learning_rate": 0.0070627219473377535, + "loss": 0.8118, + "num_input_tokens_seen": 55336680, + "step": 95305 + }, + { + "epoch": 14.195710455764075, + "grad_norm": 0.048583984375, + "learning_rate": 0.007061067681022505, + "loss": 0.7736, + "num_input_tokens_seen": 55339496, + "step": 95310 + }, + { + "epoch": 14.196455168305034, + "grad_norm": 0.03173828125, + "learning_rate": 0.007059413548827453, + "loss": 0.7879, + "num_input_tokens_seen": 55342440, + "step": 95315 + }, + { + "epoch": 14.197199880845993, + "grad_norm": 0.03125, + "learning_rate": 0.0070577595507805395, + "loss": 0.8096, + "num_input_tokens_seen": 55345384, + "step": 95320 + }, + { + "epoch": 14.197944593386953, + "grad_norm": 0.05859375, + "learning_rate": 0.007056105686909701, + "loss": 0.8349, + "num_input_tokens_seen": 55348776, + "step": 95325 + }, + { + "epoch": 14.198689305927912, + "grad_norm": 0.0595703125, + "learning_rate": 0.007054451957242888, + "loss": 0.7751, + "num_input_tokens_seen": 55351624, + "step": 95330 + }, + { + "epoch": 14.199434018468871, + "grad_norm": 0.045654296875, + "learning_rate": 0.007052798361808027, + "loss": 0.8001, + "num_input_tokens_seen": 55354536, + "step": 95335 + }, + { + "epoch": 14.20017873100983, + "grad_norm": 0.048583984375, + "learning_rate": 0.007051144900633068, + "loss": 0.8121, + "num_input_tokens_seen": 55357320, + "step": 95340 + }, + { + "epoch": 14.200923443550789, + "grad_norm": 0.048828125, + "learning_rate": 0.00704949157374593, + "loss": 0.7786, + "num_input_tokens_seen": 55360136, + "step": 95345 + }, + { + "epoch": 14.201668156091749, + "grad_norm": 0.044677734375, + "learning_rate": 0.007047838381174559, + "loss": 0.7972, + "num_input_tokens_seen": 55363240, + "step": 95350 + }, + { + "epoch": 14.202412868632708, + "grad_norm": 0.044921875, + "learning_rate": 0.007046185322946875, + "loss": 0.7815, + "num_input_tokens_seen": 55366312, + "step": 95355 + }, + { + "epoch": 14.203157581173667, + "grad_norm": 0.046875, + "learning_rate": 0.007044532399090805, + "loss": 0.8105, + "num_input_tokens_seen": 55369480, + "step": 95360 + }, + { + "epoch": 14.203902293714625, + "grad_norm": 0.048828125, + "learning_rate": 0.007042879609634277, + "loss": 0.7885, + "num_input_tokens_seen": 55372392, + "step": 95365 + }, + { + "epoch": 14.204647006255586, + "grad_norm": 0.0478515625, + "learning_rate": 0.007041226954605202, + "loss": 0.7967, + "num_input_tokens_seen": 55375240, + "step": 95370 + }, + { + "epoch": 14.205391718796545, + "grad_norm": 0.037353515625, + "learning_rate": 0.007039574434031517, + "loss": 0.81, + "num_input_tokens_seen": 55377928, + "step": 95375 + }, + { + "epoch": 14.206136431337503, + "grad_norm": 0.050048828125, + "learning_rate": 0.0070379220479411244, + "loss": 0.8631, + "num_input_tokens_seen": 55380840, + "step": 95380 + }, + { + "epoch": 14.206881143878462, + "grad_norm": 0.0771484375, + "learning_rate": 0.007036269796361954, + "loss": 0.7937, + "num_input_tokens_seen": 55383560, + "step": 95385 + }, + { + "epoch": 14.207625856419423, + "grad_norm": 0.042724609375, + "learning_rate": 0.007034617679321903, + "loss": 0.7942, + "num_input_tokens_seen": 55386216, + "step": 95390 + }, + { + "epoch": 14.208370568960381, + "grad_norm": 0.04931640625, + "learning_rate": 0.007032965696848898, + "loss": 0.8013, + "num_input_tokens_seen": 55389000, + "step": 95395 + }, + { + "epoch": 14.20911528150134, + "grad_norm": 0.0306396484375, + "learning_rate": 0.00703131384897084, + "loss": 0.8049, + "num_input_tokens_seen": 55392136, + "step": 95400 + }, + { + "epoch": 14.209859994042299, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00702966213571563, + "loss": 0.781, + "num_input_tokens_seen": 55395208, + "step": 95405 + }, + { + "epoch": 14.21060470658326, + "grad_norm": 0.04931640625, + "learning_rate": 0.007028010557111182, + "loss": 0.7992, + "num_input_tokens_seen": 55398056, + "step": 95410 + }, + { + "epoch": 14.211349419124218, + "grad_norm": 0.04052734375, + "learning_rate": 0.007026359113185389, + "loss": 0.8018, + "num_input_tokens_seen": 55401160, + "step": 95415 + }, + { + "epoch": 14.212094131665177, + "grad_norm": 0.0625, + "learning_rate": 0.00702470780396616, + "loss": 0.7644, + "num_input_tokens_seen": 55403880, + "step": 95420 + }, + { + "epoch": 14.212838844206136, + "grad_norm": 0.04736328125, + "learning_rate": 0.007023056629481387, + "loss": 0.8387, + "num_input_tokens_seen": 55407112, + "step": 95425 + }, + { + "epoch": 14.213583556747096, + "grad_norm": 0.049072265625, + "learning_rate": 0.007021405589758964, + "loss": 0.8277, + "num_input_tokens_seen": 55410024, + "step": 95430 + }, + { + "epoch": 14.214328269288055, + "grad_norm": 0.0380859375, + "learning_rate": 0.007019754684826779, + "loss": 0.7752, + "num_input_tokens_seen": 55412744, + "step": 95435 + }, + { + "epoch": 14.215072981829014, + "grad_norm": 0.033447265625, + "learning_rate": 0.007018103914712734, + "loss": 0.7922, + "num_input_tokens_seen": 55415976, + "step": 95440 + }, + { + "epoch": 14.215817694369973, + "grad_norm": 0.04736328125, + "learning_rate": 0.007016453279444712, + "loss": 0.7939, + "num_input_tokens_seen": 55419144, + "step": 95445 + }, + { + "epoch": 14.216562406910933, + "grad_norm": 0.060791015625, + "learning_rate": 0.007014802779050592, + "loss": 0.7988, + "num_input_tokens_seen": 55422248, + "step": 95450 + }, + { + "epoch": 14.217307119451892, + "grad_norm": 0.032958984375, + "learning_rate": 0.007013152413558266, + "loss": 0.7939, + "num_input_tokens_seen": 55425064, + "step": 95455 + }, + { + "epoch": 14.21805183199285, + "grad_norm": 0.0233154296875, + "learning_rate": 0.007011502182995609, + "loss": 0.7939, + "num_input_tokens_seen": 55427848, + "step": 95460 + }, + { + "epoch": 14.21879654453381, + "grad_norm": 0.023681640625, + "learning_rate": 0.00700985208739051, + "loss": 0.7938, + "num_input_tokens_seen": 55430536, + "step": 95465 + }, + { + "epoch": 14.21954125707477, + "grad_norm": 0.04931640625, + "learning_rate": 0.0070082021267708315, + "loss": 0.7995, + "num_input_tokens_seen": 55433416, + "step": 95470 + }, + { + "epoch": 14.220285969615729, + "grad_norm": 0.046142578125, + "learning_rate": 0.0070065523011644615, + "loss": 0.7837, + "num_input_tokens_seen": 55436264, + "step": 95475 + }, + { + "epoch": 14.221030682156687, + "grad_norm": 0.04833984375, + "learning_rate": 0.007004902610599267, + "loss": 0.8026, + "num_input_tokens_seen": 55439144, + "step": 95480 + }, + { + "epoch": 14.221775394697646, + "grad_norm": 0.04345703125, + "learning_rate": 0.007003253055103111, + "loss": 0.8092, + "num_input_tokens_seen": 55442248, + "step": 95485 + }, + { + "epoch": 14.222520107238607, + "grad_norm": 0.053955078125, + "learning_rate": 0.007001603634703873, + "loss": 0.7744, + "num_input_tokens_seen": 55445000, + "step": 95490 + }, + { + "epoch": 14.223264819779565, + "grad_norm": 0.06298828125, + "learning_rate": 0.0069999543494294126, + "loss": 0.7752, + "num_input_tokens_seen": 55447944, + "step": 95495 + }, + { + "epoch": 14.224009532320524, + "grad_norm": 0.03515625, + "learning_rate": 0.006998305199307593, + "loss": 0.8142, + "num_input_tokens_seen": 55450664, + "step": 95500 + }, + { + "epoch": 14.224754244861483, + "grad_norm": 0.0400390625, + "learning_rate": 0.006996656184366267, + "loss": 0.8158, + "num_input_tokens_seen": 55453544, + "step": 95505 + }, + { + "epoch": 14.225498957402444, + "grad_norm": 0.037841796875, + "learning_rate": 0.006995007304633308, + "loss": 0.7955, + "num_input_tokens_seen": 55456360, + "step": 95510 + }, + { + "epoch": 14.226243669943402, + "grad_norm": 0.039306640625, + "learning_rate": 0.006993358560136558, + "loss": 0.7964, + "num_input_tokens_seen": 55459272, + "step": 95515 + }, + { + "epoch": 14.226988382484361, + "grad_norm": 0.03466796875, + "learning_rate": 0.006991709950903883, + "loss": 0.7962, + "num_input_tokens_seen": 55462024, + "step": 95520 + }, + { + "epoch": 14.22773309502532, + "grad_norm": 0.043212890625, + "learning_rate": 0.006990061476963131, + "loss": 0.7969, + "num_input_tokens_seen": 55464936, + "step": 95525 + }, + { + "epoch": 14.228477807566279, + "grad_norm": 0.03857421875, + "learning_rate": 0.006988413138342141, + "loss": 0.8192, + "num_input_tokens_seen": 55468200, + "step": 95530 + }, + { + "epoch": 14.229222520107239, + "grad_norm": 0.055419921875, + "learning_rate": 0.006986764935068776, + "loss": 0.802, + "num_input_tokens_seen": 55470952, + "step": 95535 + }, + { + "epoch": 14.229967232648198, + "grad_norm": 0.027587890625, + "learning_rate": 0.006985116867170867, + "loss": 0.8101, + "num_input_tokens_seen": 55473896, + "step": 95540 + }, + { + "epoch": 14.230711945189157, + "grad_norm": 0.048095703125, + "learning_rate": 0.006983468934676268, + "loss": 0.8158, + "num_input_tokens_seen": 55476744, + "step": 95545 + }, + { + "epoch": 14.231456657730115, + "grad_norm": 0.029052734375, + "learning_rate": 0.0069818211376128135, + "loss": 0.7898, + "num_input_tokens_seen": 55479752, + "step": 95550 + }, + { + "epoch": 14.232201370271076, + "grad_norm": 0.03662109375, + "learning_rate": 0.006980173476008342, + "loss": 0.7694, + "num_input_tokens_seen": 55482536, + "step": 95555 + }, + { + "epoch": 14.232946082812035, + "grad_norm": 0.07861328125, + "learning_rate": 0.006978525949890684, + "loss": 0.789, + "num_input_tokens_seen": 55485512, + "step": 95560 + }, + { + "epoch": 14.233690795352993, + "grad_norm": 0.044921875, + "learning_rate": 0.006976878559287681, + "loss": 0.7844, + "num_input_tokens_seen": 55488072, + "step": 95565 + }, + { + "epoch": 14.234435507893952, + "grad_norm": 0.045166015625, + "learning_rate": 0.006975231304227163, + "loss": 0.7913, + "num_input_tokens_seen": 55491016, + "step": 95570 + }, + { + "epoch": 14.235180220434913, + "grad_norm": 0.05322265625, + "learning_rate": 0.006973584184736948, + "loss": 0.7805, + "num_input_tokens_seen": 55494248, + "step": 95575 + }, + { + "epoch": 14.235924932975871, + "grad_norm": 0.054931640625, + "learning_rate": 0.006971937200844877, + "loss": 0.7955, + "num_input_tokens_seen": 55497032, + "step": 95580 + }, + { + "epoch": 14.23666964551683, + "grad_norm": 0.060546875, + "learning_rate": 0.006970290352578762, + "loss": 0.79, + "num_input_tokens_seen": 55499752, + "step": 95585 + }, + { + "epoch": 14.237414358057789, + "grad_norm": 0.03173828125, + "learning_rate": 0.006968643639966436, + "loss": 0.8007, + "num_input_tokens_seen": 55502760, + "step": 95590 + }, + { + "epoch": 14.23815907059875, + "grad_norm": 0.03564453125, + "learning_rate": 0.0069669970630357085, + "loss": 0.7943, + "num_input_tokens_seen": 55505672, + "step": 95595 + }, + { + "epoch": 14.238903783139708, + "grad_norm": 0.10986328125, + "learning_rate": 0.006965350621814404, + "loss": 0.7946, + "num_input_tokens_seen": 55508520, + "step": 95600 + }, + { + "epoch": 14.239648495680667, + "grad_norm": 0.0240478515625, + "learning_rate": 0.006963704316330336, + "loss": 0.8116, + "num_input_tokens_seen": 55511112, + "step": 95605 + }, + { + "epoch": 14.240393208221626, + "grad_norm": 0.0380859375, + "learning_rate": 0.006962058146611312, + "loss": 0.8096, + "num_input_tokens_seen": 55513896, + "step": 95610 + }, + { + "epoch": 14.241137920762586, + "grad_norm": 0.024658203125, + "learning_rate": 0.00696041211268515, + "loss": 0.8089, + "num_input_tokens_seen": 55516744, + "step": 95615 + }, + { + "epoch": 14.241882633303545, + "grad_norm": 0.0302734375, + "learning_rate": 0.006958766214579655, + "loss": 0.7932, + "num_input_tokens_seen": 55519624, + "step": 95620 + }, + { + "epoch": 14.242627345844504, + "grad_norm": 0.0296630859375, + "learning_rate": 0.006957120452322632, + "loss": 0.7769, + "num_input_tokens_seen": 55522600, + "step": 95625 + }, + { + "epoch": 14.243372058385463, + "grad_norm": 0.03955078125, + "learning_rate": 0.00695547482594188, + "loss": 0.8088, + "num_input_tokens_seen": 55525544, + "step": 95630 + }, + { + "epoch": 14.244116770926423, + "grad_norm": 0.03515625, + "learning_rate": 0.006953829335465209, + "loss": 0.7905, + "num_input_tokens_seen": 55528520, + "step": 95635 + }, + { + "epoch": 14.244861483467382, + "grad_norm": 0.044677734375, + "learning_rate": 0.006952183980920413, + "loss": 0.8161, + "num_input_tokens_seen": 55531624, + "step": 95640 + }, + { + "epoch": 14.24560619600834, + "grad_norm": 0.055419921875, + "learning_rate": 0.006950538762335285, + "loss": 0.7727, + "num_input_tokens_seen": 55534472, + "step": 95645 + }, + { + "epoch": 14.2463509085493, + "grad_norm": 0.040771484375, + "learning_rate": 0.006948893679737628, + "loss": 0.8144, + "num_input_tokens_seen": 55537608, + "step": 95650 + }, + { + "epoch": 14.24709562109026, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0069472487331552245, + "loss": 0.7834, + "num_input_tokens_seen": 55540520, + "step": 95655 + }, + { + "epoch": 14.247840333631219, + "grad_norm": 0.036865234375, + "learning_rate": 0.006945603922615875, + "loss": 0.8076, + "num_input_tokens_seen": 55543240, + "step": 95660 + }, + { + "epoch": 14.248585046172177, + "grad_norm": 0.046630859375, + "learning_rate": 0.0069439592481473555, + "loss": 0.7872, + "num_input_tokens_seen": 55546120, + "step": 95665 + }, + { + "epoch": 14.249329758713136, + "grad_norm": 0.037109375, + "learning_rate": 0.006942314709777462, + "loss": 0.8069, + "num_input_tokens_seen": 55549000, + "step": 95670 + }, + { + "epoch": 14.250074471254097, + "grad_norm": 0.037841796875, + "learning_rate": 0.0069406703075339665, + "loss": 0.8011, + "num_input_tokens_seen": 55551624, + "step": 95675 + }, + { + "epoch": 14.250819183795056, + "grad_norm": 0.033203125, + "learning_rate": 0.00693902604144466, + "loss": 0.808, + "num_input_tokens_seen": 55554280, + "step": 95680 + }, + { + "epoch": 14.251563896336014, + "grad_norm": 0.04736328125, + "learning_rate": 0.006937381911537316, + "loss": 0.8218, + "num_input_tokens_seen": 55557288, + "step": 95685 + }, + { + "epoch": 14.252308608876973, + "grad_norm": 0.054443359375, + "learning_rate": 0.006935737917839711, + "loss": 0.7974, + "num_input_tokens_seen": 55559944, + "step": 95690 + }, + { + "epoch": 14.253053321417934, + "grad_norm": 0.07958984375, + "learning_rate": 0.006934094060379617, + "loss": 0.8149, + "num_input_tokens_seen": 55563208, + "step": 95695 + }, + { + "epoch": 14.253798033958892, + "grad_norm": 0.03271484375, + "learning_rate": 0.006932450339184801, + "loss": 0.8381, + "num_input_tokens_seen": 55565896, + "step": 95700 + }, + { + "epoch": 14.254542746499851, + "grad_norm": 0.04052734375, + "learning_rate": 0.006930806754283043, + "loss": 0.7997, + "num_input_tokens_seen": 55568776, + "step": 95705 + }, + { + "epoch": 14.25528745904081, + "grad_norm": 0.0361328125, + "learning_rate": 0.006929163305702097, + "loss": 0.793, + "num_input_tokens_seen": 55571688, + "step": 95710 + }, + { + "epoch": 14.256032171581769, + "grad_norm": 0.03759765625, + "learning_rate": 0.00692751999346974, + "loss": 0.7746, + "num_input_tokens_seen": 55574568, + "step": 95715 + }, + { + "epoch": 14.25677688412273, + "grad_norm": 0.049072265625, + "learning_rate": 0.0069258768176137285, + "loss": 0.8098, + "num_input_tokens_seen": 55577448, + "step": 95720 + }, + { + "epoch": 14.257521596663688, + "grad_norm": 0.033447265625, + "learning_rate": 0.0069242337781618155, + "loss": 0.7977, + "num_input_tokens_seen": 55580136, + "step": 95725 + }, + { + "epoch": 14.258266309204647, + "grad_norm": 0.04541015625, + "learning_rate": 0.006922590875141772, + "loss": 0.7741, + "num_input_tokens_seen": 55583048, + "step": 95730 + }, + { + "epoch": 14.259011021745605, + "grad_norm": 0.05615234375, + "learning_rate": 0.006920948108581338, + "loss": 0.8049, + "num_input_tokens_seen": 55585928, + "step": 95735 + }, + { + "epoch": 14.259755734286566, + "grad_norm": 0.0390625, + "learning_rate": 0.006919305478508283, + "loss": 0.8122, + "num_input_tokens_seen": 55588872, + "step": 95740 + }, + { + "epoch": 14.260500446827525, + "grad_norm": 0.047607421875, + "learning_rate": 0.0069176629849503415, + "loss": 0.8027, + "num_input_tokens_seen": 55592104, + "step": 95745 + }, + { + "epoch": 14.261245159368483, + "grad_norm": 0.053466796875, + "learning_rate": 0.006916020627935276, + "loss": 0.8173, + "num_input_tokens_seen": 55594600, + "step": 95750 + }, + { + "epoch": 14.261989871909442, + "grad_norm": 0.03955078125, + "learning_rate": 0.006914378407490826, + "loss": 0.8201, + "num_input_tokens_seen": 55597704, + "step": 95755 + }, + { + "epoch": 14.262734584450403, + "grad_norm": 0.050048828125, + "learning_rate": 0.006912736323644734, + "loss": 0.7939, + "num_input_tokens_seen": 55600424, + "step": 95760 + }, + { + "epoch": 14.263479296991362, + "grad_norm": 0.038818359375, + "learning_rate": 0.006911094376424743, + "loss": 0.7886, + "num_input_tokens_seen": 55603336, + "step": 95765 + }, + { + "epoch": 14.26422400953232, + "grad_norm": 0.031982421875, + "learning_rate": 0.0069094525658585865, + "loss": 0.7922, + "num_input_tokens_seen": 55606376, + "step": 95770 + }, + { + "epoch": 14.264968722073279, + "grad_norm": 0.0306396484375, + "learning_rate": 0.00690781089197401, + "loss": 0.7899, + "num_input_tokens_seen": 55609256, + "step": 95775 + }, + { + "epoch": 14.26571343461424, + "grad_norm": 0.060791015625, + "learning_rate": 0.00690616935479874, + "loss": 0.7923, + "num_input_tokens_seen": 55612104, + "step": 95780 + }, + { + "epoch": 14.266458147155198, + "grad_norm": 0.130859375, + "learning_rate": 0.0069045279543605165, + "loss": 0.8575, + "num_input_tokens_seen": 55614984, + "step": 95785 + }, + { + "epoch": 14.267202859696157, + "grad_norm": 0.062255859375, + "learning_rate": 0.00690288669068706, + "loss": 0.7979, + "num_input_tokens_seen": 55618376, + "step": 95790 + }, + { + "epoch": 14.267947572237116, + "grad_norm": 0.035400390625, + "learning_rate": 0.006901245563806111, + "loss": 0.7773, + "num_input_tokens_seen": 55621256, + "step": 95795 + }, + { + "epoch": 14.268692284778076, + "grad_norm": 0.05224609375, + "learning_rate": 0.006899604573745384, + "loss": 0.7785, + "num_input_tokens_seen": 55624136, + "step": 95800 + }, + { + "epoch": 14.269436997319035, + "grad_norm": 0.0289306640625, + "learning_rate": 0.006897963720532601, + "loss": 0.8678, + "num_input_tokens_seen": 55626664, + "step": 95805 + }, + { + "epoch": 14.270181709859994, + "grad_norm": 0.036865234375, + "learning_rate": 0.006896323004195491, + "loss": 0.7922, + "num_input_tokens_seen": 55629576, + "step": 95810 + }, + { + "epoch": 14.270926422400953, + "grad_norm": 0.044677734375, + "learning_rate": 0.006894682424761763, + "loss": 0.8065, + "num_input_tokens_seen": 55632264, + "step": 95815 + }, + { + "epoch": 14.271671134941913, + "grad_norm": 0.055419921875, + "learning_rate": 0.006893041982259142, + "loss": 0.8139, + "num_input_tokens_seen": 55634952, + "step": 95820 + }, + { + "epoch": 14.272415847482872, + "grad_norm": 0.041748046875, + "learning_rate": 0.006891401676715339, + "loss": 0.8005, + "num_input_tokens_seen": 55637736, + "step": 95825 + }, + { + "epoch": 14.27316056002383, + "grad_norm": 0.032470703125, + "learning_rate": 0.006889761508158062, + "loss": 0.8521, + "num_input_tokens_seen": 55640488, + "step": 95830 + }, + { + "epoch": 14.27390527256479, + "grad_norm": 0.04248046875, + "learning_rate": 0.006888121476615018, + "loss": 0.8114, + "num_input_tokens_seen": 55643176, + "step": 95835 + }, + { + "epoch": 14.27464998510575, + "grad_norm": 0.02490234375, + "learning_rate": 0.006886481582113919, + "loss": 0.8034, + "num_input_tokens_seen": 55645960, + "step": 95840 + }, + { + "epoch": 14.275394697646709, + "grad_norm": 0.052978515625, + "learning_rate": 0.00688484182468247, + "loss": 0.8058, + "num_input_tokens_seen": 55648712, + "step": 95845 + }, + { + "epoch": 14.276139410187668, + "grad_norm": 0.042724609375, + "learning_rate": 0.006883202204348364, + "loss": 0.7976, + "num_input_tokens_seen": 55651464, + "step": 95850 + }, + { + "epoch": 14.276884122728626, + "grad_norm": 0.031982421875, + "learning_rate": 0.006881562721139313, + "loss": 0.7884, + "num_input_tokens_seen": 55654344, + "step": 95855 + }, + { + "epoch": 14.277628835269585, + "grad_norm": 0.045166015625, + "learning_rate": 0.006879923375083003, + "loss": 0.781, + "num_input_tokens_seen": 55657064, + "step": 95860 + }, + { + "epoch": 14.278373547810546, + "grad_norm": 0.049560546875, + "learning_rate": 0.006878284166207142, + "loss": 0.7963, + "num_input_tokens_seen": 55660296, + "step": 95865 + }, + { + "epoch": 14.279118260351504, + "grad_norm": 0.058837890625, + "learning_rate": 0.006876645094539408, + "loss": 0.8024, + "num_input_tokens_seen": 55663464, + "step": 95870 + }, + { + "epoch": 14.279862972892463, + "grad_norm": 0.03662109375, + "learning_rate": 0.006875006160107504, + "loss": 0.7848, + "num_input_tokens_seen": 55666600, + "step": 95875 + }, + { + "epoch": 14.280607685433422, + "grad_norm": 0.06396484375, + "learning_rate": 0.0068733673629391145, + "loss": 0.8047, + "num_input_tokens_seen": 55669640, + "step": 95880 + }, + { + "epoch": 14.281352397974382, + "grad_norm": 0.04638671875, + "learning_rate": 0.006871728703061923, + "loss": 0.7961, + "num_input_tokens_seen": 55672328, + "step": 95885 + }, + { + "epoch": 14.282097110515341, + "grad_norm": 0.035888671875, + "learning_rate": 0.006870090180503609, + "loss": 0.784, + "num_input_tokens_seen": 55675112, + "step": 95890 + }, + { + "epoch": 14.2828418230563, + "grad_norm": 0.0517578125, + "learning_rate": 0.006868451795291864, + "loss": 0.8059, + "num_input_tokens_seen": 55677992, + "step": 95895 + }, + { + "epoch": 14.283586535597259, + "grad_norm": 0.0283203125, + "learning_rate": 0.00686681354745436, + "loss": 0.8102, + "num_input_tokens_seen": 55680904, + "step": 95900 + }, + { + "epoch": 14.28433124813822, + "grad_norm": 0.058349609375, + "learning_rate": 0.00686517543701877, + "loss": 0.7941, + "num_input_tokens_seen": 55683784, + "step": 95905 + }, + { + "epoch": 14.285075960679178, + "grad_norm": 0.03466796875, + "learning_rate": 0.00686353746401278, + "loss": 0.814, + "num_input_tokens_seen": 55686792, + "step": 95910 + }, + { + "epoch": 14.285820673220137, + "grad_norm": 0.045166015625, + "learning_rate": 0.006861899628464049, + "loss": 0.7874, + "num_input_tokens_seen": 55689800, + "step": 95915 + }, + { + "epoch": 14.286565385761095, + "grad_norm": 0.06005859375, + "learning_rate": 0.006860261930400256, + "loss": 0.809, + "num_input_tokens_seen": 55692616, + "step": 95920 + }, + { + "epoch": 14.287310098302056, + "grad_norm": 0.040283203125, + "learning_rate": 0.006858624369849068, + "loss": 0.79, + "num_input_tokens_seen": 55695816, + "step": 95925 + }, + { + "epoch": 14.288054810843015, + "grad_norm": 0.04443359375, + "learning_rate": 0.006856986946838139, + "loss": 0.7992, + "num_input_tokens_seen": 55699080, + "step": 95930 + }, + { + "epoch": 14.288799523383974, + "grad_norm": 0.0712890625, + "learning_rate": 0.006855349661395145, + "loss": 0.7869, + "num_input_tokens_seen": 55701992, + "step": 95935 + }, + { + "epoch": 14.289544235924932, + "grad_norm": 0.06591796875, + "learning_rate": 0.006853712513547737, + "loss": 0.7886, + "num_input_tokens_seen": 55705032, + "step": 95940 + }, + { + "epoch": 14.290288948465893, + "grad_norm": 0.064453125, + "learning_rate": 0.00685207550332358, + "loss": 0.7985, + "num_input_tokens_seen": 55708104, + "step": 95945 + }, + { + "epoch": 14.291033661006852, + "grad_norm": 0.0458984375, + "learning_rate": 0.006850438630750327, + "loss": 0.7989, + "num_input_tokens_seen": 55711496, + "step": 95950 + }, + { + "epoch": 14.29177837354781, + "grad_norm": 0.0390625, + "learning_rate": 0.006848801895855631, + "loss": 0.8041, + "num_input_tokens_seen": 55714280, + "step": 95955 + }, + { + "epoch": 14.292523086088769, + "grad_norm": 0.04248046875, + "learning_rate": 0.006847165298667136, + "loss": 0.7888, + "num_input_tokens_seen": 55717160, + "step": 95960 + }, + { + "epoch": 14.29326779862973, + "grad_norm": 0.0390625, + "learning_rate": 0.006845528839212503, + "loss": 0.8099, + "num_input_tokens_seen": 55719816, + "step": 95965 + }, + { + "epoch": 14.294012511170688, + "grad_norm": 0.06689453125, + "learning_rate": 0.006843892517519373, + "loss": 0.8085, + "num_input_tokens_seen": 55722760, + "step": 95970 + }, + { + "epoch": 14.294757223711647, + "grad_norm": 0.037841796875, + "learning_rate": 0.006842256333615382, + "loss": 0.7964, + "num_input_tokens_seen": 55725608, + "step": 95975 + }, + { + "epoch": 14.295501936252606, + "grad_norm": 0.044921875, + "learning_rate": 0.006840620287528187, + "loss": 0.802, + "num_input_tokens_seen": 55728584, + "step": 95980 + }, + { + "epoch": 14.296246648793566, + "grad_norm": 0.037353515625, + "learning_rate": 0.006838984379285412, + "loss": 0.7634, + "num_input_tokens_seen": 55731432, + "step": 95985 + }, + { + "epoch": 14.296991361334525, + "grad_norm": 0.027587890625, + "learning_rate": 0.006837348608914708, + "loss": 0.805, + "num_input_tokens_seen": 55734440, + "step": 95990 + }, + { + "epoch": 14.297736073875484, + "grad_norm": 0.05517578125, + "learning_rate": 0.0068357129764436975, + "loss": 0.7791, + "num_input_tokens_seen": 55737128, + "step": 95995 + }, + { + "epoch": 14.298480786416443, + "grad_norm": 0.08447265625, + "learning_rate": 0.006834077481900024, + "loss": 0.8004, + "num_input_tokens_seen": 55739752, + "step": 96000 + }, + { + "epoch": 14.299225498957403, + "grad_norm": 0.062255859375, + "learning_rate": 0.006832442125311312, + "loss": 0.7985, + "num_input_tokens_seen": 55742792, + "step": 96005 + }, + { + "epoch": 14.299970211498362, + "grad_norm": 0.043212890625, + "learning_rate": 0.006830806906705183, + "loss": 0.796, + "num_input_tokens_seen": 55745864, + "step": 96010 + }, + { + "epoch": 14.30071492403932, + "grad_norm": 0.04443359375, + "learning_rate": 0.006829171826109275, + "loss": 0.7783, + "num_input_tokens_seen": 55748648, + "step": 96015 + }, + { + "epoch": 14.30145963658028, + "grad_norm": 0.036376953125, + "learning_rate": 0.006827536883551204, + "loss": 0.8111, + "num_input_tokens_seen": 55751656, + "step": 96020 + }, + { + "epoch": 14.30220434912124, + "grad_norm": 0.0703125, + "learning_rate": 0.00682590207905859, + "loss": 0.8049, + "num_input_tokens_seen": 55754664, + "step": 96025 + }, + { + "epoch": 14.302949061662199, + "grad_norm": 0.031982421875, + "learning_rate": 0.006824267412659049, + "loss": 0.7878, + "num_input_tokens_seen": 55757640, + "step": 96030 + }, + { + "epoch": 14.303693774203158, + "grad_norm": 0.034912109375, + "learning_rate": 0.006822632884380206, + "loss": 0.8049, + "num_input_tokens_seen": 55760712, + "step": 96035 + }, + { + "epoch": 14.304438486744116, + "grad_norm": 0.048583984375, + "learning_rate": 0.006820998494249664, + "loss": 0.7962, + "num_input_tokens_seen": 55763848, + "step": 96040 + }, + { + "epoch": 14.305183199285075, + "grad_norm": 0.0419921875, + "learning_rate": 0.006819364242295044, + "loss": 0.7981, + "num_input_tokens_seen": 55766856, + "step": 96045 + }, + { + "epoch": 14.305927911826036, + "grad_norm": 0.064453125, + "learning_rate": 0.006817730128543951, + "loss": 0.7969, + "num_input_tokens_seen": 55769544, + "step": 96050 + }, + { + "epoch": 14.306672624366994, + "grad_norm": 0.034912109375, + "learning_rate": 0.006816096153023985, + "loss": 0.8445, + "num_input_tokens_seen": 55772392, + "step": 96055 + }, + { + "epoch": 14.307417336907953, + "grad_norm": 0.0390625, + "learning_rate": 0.006814462315762764, + "loss": 0.792, + "num_input_tokens_seen": 55775144, + "step": 96060 + }, + { + "epoch": 14.308162049448912, + "grad_norm": 0.02685546875, + "learning_rate": 0.006812828616787877, + "loss": 0.7864, + "num_input_tokens_seen": 55777992, + "step": 96065 + }, + { + "epoch": 14.308906761989872, + "grad_norm": 0.043212890625, + "learning_rate": 0.006811195056126935, + "loss": 0.7888, + "num_input_tokens_seen": 55780552, + "step": 96070 + }, + { + "epoch": 14.309651474530831, + "grad_norm": 0.049072265625, + "learning_rate": 0.006809561633807522, + "loss": 0.7885, + "num_input_tokens_seen": 55783656, + "step": 96075 + }, + { + "epoch": 14.31039618707179, + "grad_norm": 0.0380859375, + "learning_rate": 0.006807928349857249, + "loss": 0.7927, + "num_input_tokens_seen": 55786344, + "step": 96080 + }, + { + "epoch": 14.311140899612749, + "grad_norm": 0.025390625, + "learning_rate": 0.0068062952043037, + "loss": 0.8073, + "num_input_tokens_seen": 55789288, + "step": 96085 + }, + { + "epoch": 14.31188561215371, + "grad_norm": 0.04345703125, + "learning_rate": 0.006804662197174465, + "loss": 0.8214, + "num_input_tokens_seen": 55792168, + "step": 96090 + }, + { + "epoch": 14.312630324694668, + "grad_norm": 0.044189453125, + "learning_rate": 0.006803029328497133, + "loss": 0.7934, + "num_input_tokens_seen": 55795304, + "step": 96095 + }, + { + "epoch": 14.313375037235627, + "grad_norm": 0.060791015625, + "learning_rate": 0.006801396598299284, + "loss": 0.7926, + "num_input_tokens_seen": 55797960, + "step": 96100 + }, + { + "epoch": 14.314119749776586, + "grad_norm": 0.049560546875, + "learning_rate": 0.006799764006608513, + "loss": 0.7886, + "num_input_tokens_seen": 55801032, + "step": 96105 + }, + { + "epoch": 14.314864462317546, + "grad_norm": 0.138671875, + "learning_rate": 0.006798131553452389, + "loss": 0.8409, + "num_input_tokens_seen": 55803816, + "step": 96110 + }, + { + "epoch": 14.315609174858505, + "grad_norm": 0.047119140625, + "learning_rate": 0.006796499238858501, + "loss": 0.7885, + "num_input_tokens_seen": 55806952, + "step": 96115 + }, + { + "epoch": 14.316353887399464, + "grad_norm": 0.0419921875, + "learning_rate": 0.006794867062854416, + "loss": 0.7932, + "num_input_tokens_seen": 55809800, + "step": 96120 + }, + { + "epoch": 14.317098599940422, + "grad_norm": 0.0498046875, + "learning_rate": 0.006793235025467717, + "loss": 0.7859, + "num_input_tokens_seen": 55812552, + "step": 96125 + }, + { + "epoch": 14.317843312481383, + "grad_norm": 0.04296875, + "learning_rate": 0.006791603126725974, + "loss": 0.7654, + "num_input_tokens_seen": 55815464, + "step": 96130 + }, + { + "epoch": 14.318588025022342, + "grad_norm": 0.047119140625, + "learning_rate": 0.006789971366656745, + "loss": 0.7864, + "num_input_tokens_seen": 55818312, + "step": 96135 + }, + { + "epoch": 14.3193327375633, + "grad_norm": 0.037841796875, + "learning_rate": 0.006788339745287613, + "loss": 0.7885, + "num_input_tokens_seen": 55821064, + "step": 96140 + }, + { + "epoch": 14.32007745010426, + "grad_norm": 0.049072265625, + "learning_rate": 0.006786708262646129, + "loss": 0.796, + "num_input_tokens_seen": 55824104, + "step": 96145 + }, + { + "epoch": 14.32082216264522, + "grad_norm": 0.035888671875, + "learning_rate": 0.006785076918759865, + "loss": 0.8056, + "num_input_tokens_seen": 55826728, + "step": 96150 + }, + { + "epoch": 14.321566875186178, + "grad_norm": 0.040283203125, + "learning_rate": 0.006783445713656379, + "loss": 0.7929, + "num_input_tokens_seen": 55829544, + "step": 96155 + }, + { + "epoch": 14.322311587727137, + "grad_norm": 0.0361328125, + "learning_rate": 0.006781814647363227, + "loss": 0.7784, + "num_input_tokens_seen": 55832488, + "step": 96160 + }, + { + "epoch": 14.323056300268096, + "grad_norm": 0.04736328125, + "learning_rate": 0.006780183719907963, + "loss": 0.7883, + "num_input_tokens_seen": 55835624, + "step": 96165 + }, + { + "epoch": 14.323801012809056, + "grad_norm": 0.031494140625, + "learning_rate": 0.0067785529313181365, + "loss": 0.8106, + "num_input_tokens_seen": 55838408, + "step": 96170 + }, + { + "epoch": 14.324545725350015, + "grad_norm": 0.04052734375, + "learning_rate": 0.0067769222816213065, + "loss": 0.8019, + "num_input_tokens_seen": 55841448, + "step": 96175 + }, + { + "epoch": 14.325290437890974, + "grad_norm": 0.07080078125, + "learning_rate": 0.006775291770845013, + "loss": 0.7912, + "num_input_tokens_seen": 55844904, + "step": 96180 + }, + { + "epoch": 14.326035150431933, + "grad_norm": 0.05322265625, + "learning_rate": 0.0067736613990168094, + "loss": 0.812, + "num_input_tokens_seen": 55848072, + "step": 96185 + }, + { + "epoch": 14.326779862972893, + "grad_norm": 0.064453125, + "learning_rate": 0.006772031166164231, + "loss": 0.7852, + "num_input_tokens_seen": 55850856, + "step": 96190 + }, + { + "epoch": 14.327524575513852, + "grad_norm": 0.03857421875, + "learning_rate": 0.006770401072314828, + "loss": 0.8218, + "num_input_tokens_seen": 55854120, + "step": 96195 + }, + { + "epoch": 14.32826928805481, + "grad_norm": 0.0303955078125, + "learning_rate": 0.006768771117496131, + "loss": 0.7973, + "num_input_tokens_seen": 55857064, + "step": 96200 + }, + { + "epoch": 14.32901400059577, + "grad_norm": 0.0244140625, + "learning_rate": 0.006767141301735684, + "loss": 0.786, + "num_input_tokens_seen": 55860296, + "step": 96205 + }, + { + "epoch": 14.32975871313673, + "grad_norm": 0.052001953125, + "learning_rate": 0.0067655116250610195, + "loss": 0.8286, + "num_input_tokens_seen": 55863016, + "step": 96210 + }, + { + "epoch": 14.330503425677689, + "grad_norm": 0.044921875, + "learning_rate": 0.0067638820874996635, + "loss": 0.7779, + "num_input_tokens_seen": 55866088, + "step": 96215 + }, + { + "epoch": 14.331248138218648, + "grad_norm": 0.036376953125, + "learning_rate": 0.006762252689079147, + "loss": 0.8087, + "num_input_tokens_seen": 55868904, + "step": 96220 + }, + { + "epoch": 14.331992850759606, + "grad_norm": 0.055419921875, + "learning_rate": 0.0067606234298270024, + "loss": 0.7888, + "num_input_tokens_seen": 55872008, + "step": 96225 + }, + { + "epoch": 14.332737563300565, + "grad_norm": 0.0498046875, + "learning_rate": 0.006758994309770751, + "loss": 0.7836, + "num_input_tokens_seen": 55874888, + "step": 96230 + }, + { + "epoch": 14.333482275841526, + "grad_norm": 0.062255859375, + "learning_rate": 0.0067573653289379094, + "loss": 0.8019, + "num_input_tokens_seen": 55877736, + "step": 96235 + }, + { + "epoch": 14.334226988382484, + "grad_norm": 0.038330078125, + "learning_rate": 0.006755736487356008, + "loss": 0.7973, + "num_input_tokens_seen": 55880648, + "step": 96240 + }, + { + "epoch": 14.334971700923443, + "grad_norm": 0.0458984375, + "learning_rate": 0.00675410778505256, + "loss": 0.7942, + "num_input_tokens_seen": 55883400, + "step": 96245 + }, + { + "epoch": 14.335716413464402, + "grad_norm": 0.08984375, + "learning_rate": 0.006752479222055076, + "loss": 0.8216, + "num_input_tokens_seen": 55886504, + "step": 96250 + }, + { + "epoch": 14.336461126005362, + "grad_norm": 0.033203125, + "learning_rate": 0.006750850798391078, + "loss": 0.8077, + "num_input_tokens_seen": 55889224, + "step": 96255 + }, + { + "epoch": 14.337205838546321, + "grad_norm": 0.03857421875, + "learning_rate": 0.006749222514088066, + "loss": 0.8263, + "num_input_tokens_seen": 55892072, + "step": 96260 + }, + { + "epoch": 14.33795055108728, + "grad_norm": 0.040771484375, + "learning_rate": 0.006747594369173558, + "loss": 0.7928, + "num_input_tokens_seen": 55895016, + "step": 96265 + }, + { + "epoch": 14.338695263628239, + "grad_norm": 0.0908203125, + "learning_rate": 0.006745966363675053, + "loss": 0.8073, + "num_input_tokens_seen": 55898024, + "step": 96270 + }, + { + "epoch": 14.3394399761692, + "grad_norm": 0.036376953125, + "learning_rate": 0.0067443384976200605, + "loss": 0.8209, + "num_input_tokens_seen": 55900968, + "step": 96275 + }, + { + "epoch": 14.340184688710158, + "grad_norm": 0.0341796875, + "learning_rate": 0.0067427107710360795, + "loss": 0.8159, + "num_input_tokens_seen": 55903912, + "step": 96280 + }, + { + "epoch": 14.340929401251117, + "grad_norm": 0.036865234375, + "learning_rate": 0.0067410831839506075, + "loss": 0.8179, + "num_input_tokens_seen": 55906952, + "step": 96285 + }, + { + "epoch": 14.341674113792076, + "grad_norm": 0.047119140625, + "learning_rate": 0.00673945573639114, + "loss": 0.782, + "num_input_tokens_seen": 55909768, + "step": 96290 + }, + { + "epoch": 14.342418826333036, + "grad_norm": 0.035888671875, + "learning_rate": 0.006737828428385168, + "loss": 0.7899, + "num_input_tokens_seen": 55912744, + "step": 96295 + }, + { + "epoch": 14.343163538873995, + "grad_norm": 0.05322265625, + "learning_rate": 0.006736201259960193, + "loss": 0.7862, + "num_input_tokens_seen": 55915656, + "step": 96300 + }, + { + "epoch": 14.343908251414954, + "grad_norm": 0.0732421875, + "learning_rate": 0.006734574231143693, + "loss": 0.7822, + "num_input_tokens_seen": 55918440, + "step": 96305 + }, + { + "epoch": 14.344652963955912, + "grad_norm": 0.06689453125, + "learning_rate": 0.006732947341963165, + "loss": 0.7916, + "num_input_tokens_seen": 55921192, + "step": 96310 + }, + { + "epoch": 14.345397676496873, + "grad_norm": 0.037841796875, + "learning_rate": 0.006731320592446087, + "loss": 0.79, + "num_input_tokens_seen": 55923848, + "step": 96315 + }, + { + "epoch": 14.346142389037832, + "grad_norm": 0.03515625, + "learning_rate": 0.006729693982619945, + "loss": 0.8448, + "num_input_tokens_seen": 55926728, + "step": 96320 + }, + { + "epoch": 14.34688710157879, + "grad_norm": 0.060546875, + "learning_rate": 0.006728067512512221, + "loss": 0.8061, + "num_input_tokens_seen": 55929832, + "step": 96325 + }, + { + "epoch": 14.34763181411975, + "grad_norm": 0.046630859375, + "learning_rate": 0.006726441182150382, + "loss": 0.7947, + "num_input_tokens_seen": 55933128, + "step": 96330 + }, + { + "epoch": 14.34837652666071, + "grad_norm": 0.028564453125, + "learning_rate": 0.006724814991561916, + "loss": 0.8094, + "num_input_tokens_seen": 55935816, + "step": 96335 + }, + { + "epoch": 14.349121239201668, + "grad_norm": 0.036376953125, + "learning_rate": 0.006723188940774285, + "loss": 0.8173, + "num_input_tokens_seen": 55938696, + "step": 96340 + }, + { + "epoch": 14.349865951742627, + "grad_norm": 0.0390625, + "learning_rate": 0.0067215630298149715, + "loss": 0.8026, + "num_input_tokens_seen": 55941640, + "step": 96345 + }, + { + "epoch": 14.350610664283586, + "grad_norm": 0.061767578125, + "learning_rate": 0.006719937258711435, + "loss": 0.8142, + "num_input_tokens_seen": 55944808, + "step": 96350 + }, + { + "epoch": 14.351355376824547, + "grad_norm": 0.03369140625, + "learning_rate": 0.006718311627491143, + "loss": 0.7869, + "num_input_tokens_seen": 55947560, + "step": 96355 + }, + { + "epoch": 14.352100089365505, + "grad_norm": 0.034423828125, + "learning_rate": 0.006716686136181554, + "loss": 0.8069, + "num_input_tokens_seen": 55950536, + "step": 96360 + }, + { + "epoch": 14.352844801906464, + "grad_norm": 0.056884765625, + "learning_rate": 0.00671506078481014, + "loss": 0.7786, + "num_input_tokens_seen": 55953576, + "step": 96365 + }, + { + "epoch": 14.353589514447423, + "grad_norm": 0.03955078125, + "learning_rate": 0.006713435573404353, + "loss": 0.7794, + "num_input_tokens_seen": 55956456, + "step": 96370 + }, + { + "epoch": 14.354334226988382, + "grad_norm": 0.043212890625, + "learning_rate": 0.006711810501991643, + "loss": 0.7973, + "num_input_tokens_seen": 55958984, + "step": 96375 + }, + { + "epoch": 14.355078939529342, + "grad_norm": 0.0233154296875, + "learning_rate": 0.006710185570599479, + "loss": 0.7915, + "num_input_tokens_seen": 55962248, + "step": 96380 + }, + { + "epoch": 14.3558236520703, + "grad_norm": 0.0341796875, + "learning_rate": 0.006708560779255297, + "loss": 0.8184, + "num_input_tokens_seen": 55965128, + "step": 96385 + }, + { + "epoch": 14.35656836461126, + "grad_norm": 0.034423828125, + "learning_rate": 0.006706936127986559, + "loss": 0.8124, + "num_input_tokens_seen": 55968136, + "step": 96390 + }, + { + "epoch": 14.357313077152218, + "grad_norm": 0.0478515625, + "learning_rate": 0.0067053116168207015, + "loss": 0.8055, + "num_input_tokens_seen": 55971176, + "step": 96395 + }, + { + "epoch": 14.358057789693179, + "grad_norm": 0.04931640625, + "learning_rate": 0.006703687245785179, + "loss": 0.7831, + "num_input_tokens_seen": 55973992, + "step": 96400 + }, + { + "epoch": 14.358802502234138, + "grad_norm": 0.0400390625, + "learning_rate": 0.00670206301490743, + "loss": 0.7857, + "num_input_tokens_seen": 55976968, + "step": 96405 + }, + { + "epoch": 14.359547214775096, + "grad_norm": 0.03564453125, + "learning_rate": 0.0067004389242148864, + "loss": 0.7992, + "num_input_tokens_seen": 55979912, + "step": 96410 + }, + { + "epoch": 14.360291927316055, + "grad_norm": 0.033935546875, + "learning_rate": 0.006698814973734997, + "loss": 0.7934, + "num_input_tokens_seen": 55982664, + "step": 96415 + }, + { + "epoch": 14.361036639857016, + "grad_norm": 0.04150390625, + "learning_rate": 0.006697191163495192, + "loss": 0.805, + "num_input_tokens_seen": 55985704, + "step": 96420 + }, + { + "epoch": 14.361781352397974, + "grad_norm": 0.0361328125, + "learning_rate": 0.006695567493522904, + "loss": 0.7831, + "num_input_tokens_seen": 55988648, + "step": 96425 + }, + { + "epoch": 14.362526064938933, + "grad_norm": 0.0458984375, + "learning_rate": 0.006693943963845557, + "loss": 0.7938, + "num_input_tokens_seen": 55991240, + "step": 96430 + }, + { + "epoch": 14.363270777479892, + "grad_norm": 0.047607421875, + "learning_rate": 0.00669232057449059, + "loss": 0.7839, + "num_input_tokens_seen": 55993992, + "step": 96435 + }, + { + "epoch": 14.364015490020853, + "grad_norm": 0.037353515625, + "learning_rate": 0.006690697325485417, + "loss": 0.8184, + "num_input_tokens_seen": 55996904, + "step": 96440 + }, + { + "epoch": 14.364760202561811, + "grad_norm": 0.047607421875, + "learning_rate": 0.006689074216857475, + "loss": 0.7903, + "num_input_tokens_seen": 56000104, + "step": 96445 + }, + { + "epoch": 14.36550491510277, + "grad_norm": 0.04931640625, + "learning_rate": 0.006687451248634177, + "loss": 0.8207, + "num_input_tokens_seen": 56003272, + "step": 96450 + }, + { + "epoch": 14.366249627643729, + "grad_norm": 0.052001953125, + "learning_rate": 0.006685828420842936, + "loss": 0.7771, + "num_input_tokens_seen": 56005928, + "step": 96455 + }, + { + "epoch": 14.36699434018469, + "grad_norm": 0.05810546875, + "learning_rate": 0.006684205733511178, + "loss": 0.7992, + "num_input_tokens_seen": 56008648, + "step": 96460 + }, + { + "epoch": 14.367739052725648, + "grad_norm": 0.033203125, + "learning_rate": 0.006682583186666307, + "loss": 0.7942, + "num_input_tokens_seen": 56011816, + "step": 96465 + }, + { + "epoch": 14.368483765266607, + "grad_norm": 0.04296875, + "learning_rate": 0.006680960780335745, + "loss": 0.8026, + "num_input_tokens_seen": 56014856, + "step": 96470 + }, + { + "epoch": 14.369228477807566, + "grad_norm": 0.047607421875, + "learning_rate": 0.006679338514546892, + "loss": 0.8224, + "num_input_tokens_seen": 56017448, + "step": 96475 + }, + { + "epoch": 14.369973190348526, + "grad_norm": 0.046630859375, + "learning_rate": 0.006677716389327162, + "loss": 0.8034, + "num_input_tokens_seen": 56020680, + "step": 96480 + }, + { + "epoch": 14.370717902889485, + "grad_norm": 0.048095703125, + "learning_rate": 0.006676094404703957, + "loss": 0.7981, + "num_input_tokens_seen": 56023528, + "step": 96485 + }, + { + "epoch": 14.371462615430444, + "grad_norm": 0.058349609375, + "learning_rate": 0.006674472560704675, + "loss": 0.8135, + "num_input_tokens_seen": 56026888, + "step": 96490 + }, + { + "epoch": 14.372207327971402, + "grad_norm": 0.038330078125, + "learning_rate": 0.006672850857356717, + "loss": 0.7966, + "num_input_tokens_seen": 56029672, + "step": 96495 + }, + { + "epoch": 14.372952040512363, + "grad_norm": 0.050048828125, + "learning_rate": 0.0066712292946874766, + "loss": 0.781, + "num_input_tokens_seen": 56032744, + "step": 96500 + }, + { + "epoch": 14.373696753053322, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0066696078727243565, + "loss": 0.7897, + "num_input_tokens_seen": 56035432, + "step": 96505 + }, + { + "epoch": 14.37444146559428, + "grad_norm": 0.033203125, + "learning_rate": 0.0066679865914947385, + "loss": 0.7984, + "num_input_tokens_seen": 56038056, + "step": 96510 + }, + { + "epoch": 14.37518617813524, + "grad_norm": 0.031005859375, + "learning_rate": 0.006666365451026026, + "loss": 0.804, + "num_input_tokens_seen": 56041128, + "step": 96515 + }, + { + "epoch": 14.3759308906762, + "grad_norm": 0.0458984375, + "learning_rate": 0.006664744451345594, + "loss": 0.7953, + "num_input_tokens_seen": 56044040, + "step": 96520 + }, + { + "epoch": 14.376675603217159, + "grad_norm": 0.044921875, + "learning_rate": 0.006663123592480839, + "loss": 0.8024, + "num_input_tokens_seen": 56046952, + "step": 96525 + }, + { + "epoch": 14.377420315758117, + "grad_norm": 0.031982421875, + "learning_rate": 0.006661502874459134, + "loss": 0.8159, + "num_input_tokens_seen": 56049768, + "step": 96530 + }, + { + "epoch": 14.378165028299076, + "grad_norm": 0.0546875, + "learning_rate": 0.006659882297307861, + "loss": 0.8163, + "num_input_tokens_seen": 56052712, + "step": 96535 + }, + { + "epoch": 14.378909740840037, + "grad_norm": 0.06201171875, + "learning_rate": 0.006658261861054404, + "loss": 0.7898, + "num_input_tokens_seen": 56055752, + "step": 96540 + }, + { + "epoch": 14.379654453380995, + "grad_norm": 0.049560546875, + "learning_rate": 0.006656641565726136, + "loss": 0.7949, + "num_input_tokens_seen": 56058600, + "step": 96545 + }, + { + "epoch": 14.380399165921954, + "grad_norm": 0.052490234375, + "learning_rate": 0.006655021411350422, + "loss": 0.7805, + "num_input_tokens_seen": 56061512, + "step": 96550 + }, + { + "epoch": 14.381143878462913, + "grad_norm": 0.03466796875, + "learning_rate": 0.006653401397954646, + "loss": 0.7947, + "num_input_tokens_seen": 56064360, + "step": 96555 + }, + { + "epoch": 14.381888591003872, + "grad_norm": 0.039794921875, + "learning_rate": 0.006651781525566171, + "loss": 0.7925, + "num_input_tokens_seen": 56067528, + "step": 96560 + }, + { + "epoch": 14.382633303544832, + "grad_norm": 0.0303955078125, + "learning_rate": 0.006650161794212356, + "loss": 0.8013, + "num_input_tokens_seen": 56070536, + "step": 96565 + }, + { + "epoch": 14.383378016085791, + "grad_norm": 0.02587890625, + "learning_rate": 0.006648542203920577, + "loss": 0.7834, + "num_input_tokens_seen": 56073448, + "step": 96570 + }, + { + "epoch": 14.38412272862675, + "grad_norm": 0.05224609375, + "learning_rate": 0.006646922754718188, + "loss": 0.7863, + "num_input_tokens_seen": 56075976, + "step": 96575 + }, + { + "epoch": 14.384867441167708, + "grad_norm": 0.0419921875, + "learning_rate": 0.006645303446632545, + "loss": 0.7896, + "num_input_tokens_seen": 56078568, + "step": 96580 + }, + { + "epoch": 14.385612153708669, + "grad_norm": 0.04443359375, + "learning_rate": 0.006643684279691015, + "loss": 0.7918, + "num_input_tokens_seen": 56081832, + "step": 96585 + }, + { + "epoch": 14.386356866249628, + "grad_norm": 0.034423828125, + "learning_rate": 0.00664206525392094, + "loss": 0.7881, + "num_input_tokens_seen": 56084488, + "step": 96590 + }, + { + "epoch": 14.387101578790586, + "grad_norm": 0.04248046875, + "learning_rate": 0.006640446369349684, + "loss": 0.7918, + "num_input_tokens_seen": 56087560, + "step": 96595 + }, + { + "epoch": 14.387846291331545, + "grad_norm": 0.0400390625, + "learning_rate": 0.006638827626004584, + "loss": 0.7975, + "num_input_tokens_seen": 56090472, + "step": 96600 + }, + { + "epoch": 14.388591003872506, + "grad_norm": 0.062255859375, + "learning_rate": 0.006637209023912998, + "loss": 0.8212, + "num_input_tokens_seen": 56093416, + "step": 96605 + }, + { + "epoch": 14.389335716413465, + "grad_norm": 0.044921875, + "learning_rate": 0.006635590563102267, + "loss": 0.8172, + "num_input_tokens_seen": 56096296, + "step": 96610 + }, + { + "epoch": 14.390080428954423, + "grad_norm": 0.0361328125, + "learning_rate": 0.006633972243599733, + "loss": 0.8215, + "num_input_tokens_seen": 56099176, + "step": 96615 + }, + { + "epoch": 14.390825141495382, + "grad_norm": 0.03466796875, + "learning_rate": 0.006632354065432735, + "loss": 0.8029, + "num_input_tokens_seen": 56101768, + "step": 96620 + }, + { + "epoch": 14.391569854036343, + "grad_norm": 0.05224609375, + "learning_rate": 0.006630736028628604, + "loss": 0.7841, + "num_input_tokens_seen": 56104648, + "step": 96625 + }, + { + "epoch": 14.392314566577301, + "grad_norm": 0.046142578125, + "learning_rate": 0.006629118133214688, + "loss": 0.8183, + "num_input_tokens_seen": 56107656, + "step": 96630 + }, + { + "epoch": 14.39305927911826, + "grad_norm": 0.044921875, + "learning_rate": 0.0066275003792183075, + "loss": 0.7726, + "num_input_tokens_seen": 56110376, + "step": 96635 + }, + { + "epoch": 14.393803991659219, + "grad_norm": 0.052490234375, + "learning_rate": 0.006625882766666803, + "loss": 0.7846, + "num_input_tokens_seen": 56113352, + "step": 96640 + }, + { + "epoch": 14.39454870420018, + "grad_norm": 0.048828125, + "learning_rate": 0.006624265295587494, + "loss": 0.7934, + "num_input_tokens_seen": 56117672, + "step": 96645 + }, + { + "epoch": 14.395293416741138, + "grad_norm": 0.052490234375, + "learning_rate": 0.006622647966007716, + "loss": 0.8107, + "num_input_tokens_seen": 56120776, + "step": 96650 + }, + { + "epoch": 14.396038129282097, + "grad_norm": 0.041259765625, + "learning_rate": 0.006621030777954785, + "loss": 0.8083, + "num_input_tokens_seen": 56123464, + "step": 96655 + }, + { + "epoch": 14.396782841823056, + "grad_norm": 0.0546875, + "learning_rate": 0.006619413731456018, + "loss": 0.7874, + "num_input_tokens_seen": 56126376, + "step": 96660 + }, + { + "epoch": 14.397527554364016, + "grad_norm": 0.03515625, + "learning_rate": 0.006617796826538745, + "loss": 0.8135, + "num_input_tokens_seen": 56129320, + "step": 96665 + }, + { + "epoch": 14.398272266904975, + "grad_norm": 0.04833984375, + "learning_rate": 0.006616180063230268, + "loss": 0.7907, + "num_input_tokens_seen": 56132072, + "step": 96670 + }, + { + "epoch": 14.399016979445934, + "grad_norm": 0.04150390625, + "learning_rate": 0.006614563441557915, + "loss": 0.7821, + "num_input_tokens_seen": 56134952, + "step": 96675 + }, + { + "epoch": 14.399761691986892, + "grad_norm": 0.044189453125, + "learning_rate": 0.006612946961548988, + "loss": 0.8131, + "num_input_tokens_seen": 56138152, + "step": 96680 + }, + { + "epoch": 14.400506404527853, + "grad_norm": 0.06298828125, + "learning_rate": 0.006611330623230799, + "loss": 0.8616, + "num_input_tokens_seen": 56141352, + "step": 96685 + }, + { + "epoch": 14.401251117068812, + "grad_norm": 0.03369140625, + "learning_rate": 0.0066097144266306545, + "loss": 0.7965, + "num_input_tokens_seen": 56143976, + "step": 96690 + }, + { + "epoch": 14.40199582960977, + "grad_norm": 0.042236328125, + "learning_rate": 0.00660809837177585, + "loss": 0.7865, + "num_input_tokens_seen": 56146792, + "step": 96695 + }, + { + "epoch": 14.40274054215073, + "grad_norm": 0.035888671875, + "learning_rate": 0.0066064824586937005, + "loss": 0.7944, + "num_input_tokens_seen": 56149928, + "step": 96700 + }, + { + "epoch": 14.40348525469169, + "grad_norm": 0.032958984375, + "learning_rate": 0.006604866687411493, + "loss": 0.7866, + "num_input_tokens_seen": 56152904, + "step": 96705 + }, + { + "epoch": 14.404229967232649, + "grad_norm": 0.05712890625, + "learning_rate": 0.006603251057956537, + "loss": 0.814, + "num_input_tokens_seen": 56155912, + "step": 96710 + }, + { + "epoch": 14.404974679773607, + "grad_norm": 0.0263671875, + "learning_rate": 0.006601635570356115, + "loss": 0.8159, + "num_input_tokens_seen": 56158856, + "step": 96715 + }, + { + "epoch": 14.405719392314566, + "grad_norm": 0.0380859375, + "learning_rate": 0.006600020224637529, + "loss": 0.7817, + "num_input_tokens_seen": 56161672, + "step": 96720 + }, + { + "epoch": 14.406464104855527, + "grad_norm": 0.052978515625, + "learning_rate": 0.006598405020828059, + "loss": 0.8181, + "num_input_tokens_seen": 56164648, + "step": 96725 + }, + { + "epoch": 14.407208817396485, + "grad_norm": 0.0576171875, + "learning_rate": 0.006596789958955001, + "loss": 0.783, + "num_input_tokens_seen": 56167880, + "step": 96730 + }, + { + "epoch": 14.407953529937444, + "grad_norm": 0.041015625, + "learning_rate": 0.006595175039045637, + "loss": 0.7991, + "num_input_tokens_seen": 56170728, + "step": 96735 + }, + { + "epoch": 14.408698242478403, + "grad_norm": 0.050537109375, + "learning_rate": 0.006593560261127245, + "loss": 0.8276, + "num_input_tokens_seen": 56173736, + "step": 96740 + }, + { + "epoch": 14.409442955019362, + "grad_norm": 0.07421875, + "learning_rate": 0.006591945625227111, + "loss": 0.7966, + "num_input_tokens_seen": 56176776, + "step": 96745 + }, + { + "epoch": 14.410187667560322, + "grad_norm": 0.046630859375, + "learning_rate": 0.006590331131372512, + "loss": 0.7931, + "num_input_tokens_seen": 56179624, + "step": 96750 + }, + { + "epoch": 14.410932380101281, + "grad_norm": 0.07177734375, + "learning_rate": 0.00658871677959072, + "loss": 0.8131, + "num_input_tokens_seen": 56182280, + "step": 96755 + }, + { + "epoch": 14.41167709264224, + "grad_norm": 0.048095703125, + "learning_rate": 0.006587102569909004, + "loss": 0.7807, + "num_input_tokens_seen": 56184936, + "step": 96760 + }, + { + "epoch": 14.412421805183198, + "grad_norm": 0.0625, + "learning_rate": 0.0065854885023546444, + "loss": 0.777, + "num_input_tokens_seen": 56187784, + "step": 96765 + }, + { + "epoch": 14.413166517724159, + "grad_norm": 0.04248046875, + "learning_rate": 0.006583874576954906, + "loss": 0.7844, + "num_input_tokens_seen": 56190472, + "step": 96770 + }, + { + "epoch": 14.413911230265118, + "grad_norm": 0.0260009765625, + "learning_rate": 0.006582260793737047, + "loss": 0.8069, + "num_input_tokens_seen": 56193576, + "step": 96775 + }, + { + "epoch": 14.414655942806077, + "grad_norm": 0.052978515625, + "learning_rate": 0.006580647152728342, + "loss": 0.7929, + "num_input_tokens_seen": 56196584, + "step": 96780 + }, + { + "epoch": 14.415400655347035, + "grad_norm": 0.042724609375, + "learning_rate": 0.00657903365395604, + "loss": 0.7645, + "num_input_tokens_seen": 56199880, + "step": 96785 + }, + { + "epoch": 14.416145367887996, + "grad_norm": 0.043701171875, + "learning_rate": 0.006577420297447412, + "loss": 0.8167, + "num_input_tokens_seen": 56203144, + "step": 96790 + }, + { + "epoch": 14.416890080428955, + "grad_norm": 0.043701171875, + "learning_rate": 0.006575807083229701, + "loss": 0.7905, + "num_input_tokens_seen": 56206088, + "step": 96795 + }, + { + "epoch": 14.417634792969913, + "grad_norm": 0.11962890625, + "learning_rate": 0.006574194011330175, + "loss": 0.8207, + "num_input_tokens_seen": 56208968, + "step": 96800 + }, + { + "epoch": 14.418379505510872, + "grad_norm": 0.04931640625, + "learning_rate": 0.00657258108177607, + "loss": 0.7975, + "num_input_tokens_seen": 56211624, + "step": 96805 + }, + { + "epoch": 14.419124218051833, + "grad_norm": 0.034912109375, + "learning_rate": 0.006570968294594649, + "loss": 0.7855, + "num_input_tokens_seen": 56214344, + "step": 96810 + }, + { + "epoch": 14.419868930592791, + "grad_norm": 0.038330078125, + "learning_rate": 0.006569355649813152, + "loss": 0.7894, + "num_input_tokens_seen": 56217064, + "step": 96815 + }, + { + "epoch": 14.42061364313375, + "grad_norm": 0.036865234375, + "learning_rate": 0.006567743147458821, + "loss": 0.802, + "num_input_tokens_seen": 56220168, + "step": 96820 + }, + { + "epoch": 14.421358355674709, + "grad_norm": 0.036376953125, + "learning_rate": 0.0065661307875589025, + "loss": 0.7897, + "num_input_tokens_seen": 56222792, + "step": 96825 + }, + { + "epoch": 14.42210306821567, + "grad_norm": 0.046875, + "learning_rate": 0.006564518570140625, + "loss": 0.7881, + "num_input_tokens_seen": 56225480, + "step": 96830 + }, + { + "epoch": 14.422847780756628, + "grad_norm": 0.045166015625, + "learning_rate": 0.006562906495231238, + "loss": 0.7982, + "num_input_tokens_seen": 56228296, + "step": 96835 + }, + { + "epoch": 14.423592493297587, + "grad_norm": 0.05419921875, + "learning_rate": 0.006561294562857966, + "loss": 0.8431, + "num_input_tokens_seen": 56231336, + "step": 96840 + }, + { + "epoch": 14.424337205838546, + "grad_norm": 0.0380859375, + "learning_rate": 0.00655968277304805, + "loss": 0.7833, + "num_input_tokens_seen": 56234248, + "step": 96845 + }, + { + "epoch": 14.425081918379506, + "grad_norm": 0.0286865234375, + "learning_rate": 0.006558071125828717, + "loss": 0.7835, + "num_input_tokens_seen": 56237224, + "step": 96850 + }, + { + "epoch": 14.425826630920465, + "grad_norm": 0.029541015625, + "learning_rate": 0.006556459621227185, + "loss": 0.8138, + "num_input_tokens_seen": 56240552, + "step": 96855 + }, + { + "epoch": 14.426571343461424, + "grad_norm": 0.052001953125, + "learning_rate": 0.006554848259270692, + "loss": 0.793, + "num_input_tokens_seen": 56243528, + "step": 96860 + }, + { + "epoch": 14.427316056002383, + "grad_norm": 0.037841796875, + "learning_rate": 0.00655323703998645, + "loss": 0.7604, + "num_input_tokens_seen": 56246184, + "step": 96865 + }, + { + "epoch": 14.428060768543343, + "grad_norm": 0.032470703125, + "learning_rate": 0.0065516259634016884, + "loss": 0.8092, + "num_input_tokens_seen": 56248968, + "step": 96870 + }, + { + "epoch": 14.428805481084302, + "grad_norm": 0.0390625, + "learning_rate": 0.0065500150295436195, + "loss": 0.7899, + "num_input_tokens_seen": 56252296, + "step": 96875 + }, + { + "epoch": 14.42955019362526, + "grad_norm": 0.033447265625, + "learning_rate": 0.006548404238439452, + "loss": 0.8135, + "num_input_tokens_seen": 56255048, + "step": 96880 + }, + { + "epoch": 14.43029490616622, + "grad_norm": 0.052001953125, + "learning_rate": 0.006546793590116411, + "loss": 0.7949, + "num_input_tokens_seen": 56257960, + "step": 96885 + }, + { + "epoch": 14.43103961870718, + "grad_norm": 0.052490234375, + "learning_rate": 0.006545183084601703, + "loss": 0.7883, + "num_input_tokens_seen": 56260840, + "step": 96890 + }, + { + "epoch": 14.431784331248139, + "grad_norm": 0.03662109375, + "learning_rate": 0.0065435727219225305, + "loss": 0.7984, + "num_input_tokens_seen": 56263592, + "step": 96895 + }, + { + "epoch": 14.432529043789097, + "grad_norm": 0.036376953125, + "learning_rate": 0.006541962502106099, + "loss": 0.8074, + "num_input_tokens_seen": 56266440, + "step": 96900 + }, + { + "epoch": 14.433273756330056, + "grad_norm": 0.050537109375, + "learning_rate": 0.006540352425179618, + "loss": 0.7917, + "num_input_tokens_seen": 56269448, + "step": 96905 + }, + { + "epoch": 14.434018468871017, + "grad_norm": 0.052490234375, + "learning_rate": 0.006538742491170281, + "loss": 0.7979, + "num_input_tokens_seen": 56272392, + "step": 96910 + }, + { + "epoch": 14.434763181411975, + "grad_norm": 0.056640625, + "learning_rate": 0.006537132700105295, + "loss": 0.7699, + "num_input_tokens_seen": 56274920, + "step": 96915 + }, + { + "epoch": 14.435507893952934, + "grad_norm": 0.03271484375, + "learning_rate": 0.006535523052011845, + "loss": 0.784, + "num_input_tokens_seen": 56277928, + "step": 96920 + }, + { + "epoch": 14.436252606493893, + "grad_norm": 0.0439453125, + "learning_rate": 0.006533913546917137, + "loss": 0.8384, + "num_input_tokens_seen": 56280552, + "step": 96925 + }, + { + "epoch": 14.436997319034852, + "grad_norm": 0.05810546875, + "learning_rate": 0.006532304184848353, + "loss": 0.7833, + "num_input_tokens_seen": 56283592, + "step": 96930 + }, + { + "epoch": 14.437742031575812, + "grad_norm": 0.046875, + "learning_rate": 0.0065306949658326795, + "loss": 0.7937, + "num_input_tokens_seen": 56286600, + "step": 96935 + }, + { + "epoch": 14.438486744116771, + "grad_norm": 0.047607421875, + "learning_rate": 0.006529085889897311, + "loss": 0.8101, + "num_input_tokens_seen": 56289096, + "step": 96940 + }, + { + "epoch": 14.43923145665773, + "grad_norm": 0.0322265625, + "learning_rate": 0.006527476957069428, + "loss": 0.8068, + "num_input_tokens_seen": 56292072, + "step": 96945 + }, + { + "epoch": 14.439976169198689, + "grad_norm": 0.045166015625, + "learning_rate": 0.006525868167376212, + "loss": 0.7924, + "num_input_tokens_seen": 56294920, + "step": 96950 + }, + { + "epoch": 14.440720881739649, + "grad_norm": 0.04541015625, + "learning_rate": 0.0065242595208448335, + "loss": 0.8197, + "num_input_tokens_seen": 56298184, + "step": 96955 + }, + { + "epoch": 14.441465594280608, + "grad_norm": 0.0537109375, + "learning_rate": 0.00652265101750248, + "loss": 0.7857, + "num_input_tokens_seen": 56301256, + "step": 96960 + }, + { + "epoch": 14.442210306821567, + "grad_norm": 0.046875, + "learning_rate": 0.006521042657376319, + "loss": 0.779, + "num_input_tokens_seen": 56303944, + "step": 96965 + }, + { + "epoch": 14.442955019362525, + "grad_norm": 0.032958984375, + "learning_rate": 0.006519434440493527, + "loss": 0.786, + "num_input_tokens_seen": 56306920, + "step": 96970 + }, + { + "epoch": 14.443699731903486, + "grad_norm": 0.0771484375, + "learning_rate": 0.006517826366881274, + "loss": 0.8238, + "num_input_tokens_seen": 56309736, + "step": 96975 + }, + { + "epoch": 14.444444444444445, + "grad_norm": 0.049072265625, + "learning_rate": 0.006516218436566716, + "loss": 0.7729, + "num_input_tokens_seen": 56312616, + "step": 96980 + }, + { + "epoch": 14.445189156985403, + "grad_norm": 0.0751953125, + "learning_rate": 0.0065146106495770325, + "loss": 0.7837, + "num_input_tokens_seen": 56315752, + "step": 96985 + }, + { + "epoch": 14.445933869526362, + "grad_norm": 0.044677734375, + "learning_rate": 0.006513003005939371, + "loss": 0.7921, + "num_input_tokens_seen": 56318536, + "step": 96990 + }, + { + "epoch": 14.446678582067323, + "grad_norm": 0.056884765625, + "learning_rate": 0.006511395505680903, + "loss": 0.7858, + "num_input_tokens_seen": 56321160, + "step": 96995 + }, + { + "epoch": 14.447423294608281, + "grad_norm": 0.07470703125, + "learning_rate": 0.006509788148828776, + "loss": 0.8089, + "num_input_tokens_seen": 56324392, + "step": 97000 + }, + { + "epoch": 14.44816800714924, + "grad_norm": 0.059814453125, + "learning_rate": 0.0065081809354101535, + "loss": 0.7993, + "num_input_tokens_seen": 56327272, + "step": 97005 + }, + { + "epoch": 14.448912719690199, + "grad_norm": 0.052001953125, + "learning_rate": 0.0065065738654521855, + "loss": 0.7995, + "num_input_tokens_seen": 56330216, + "step": 97010 + }, + { + "epoch": 14.44965743223116, + "grad_norm": 0.04150390625, + "learning_rate": 0.006504966938982018, + "loss": 0.8077, + "num_input_tokens_seen": 56333000, + "step": 97015 + }, + { + "epoch": 14.450402144772118, + "grad_norm": 0.025390625, + "learning_rate": 0.0065033601560268, + "loss": 0.7886, + "num_input_tokens_seen": 56335944, + "step": 97020 + }, + { + "epoch": 14.451146857313077, + "grad_norm": 0.06494140625, + "learning_rate": 0.006501753516613672, + "loss": 0.7959, + "num_input_tokens_seen": 56339048, + "step": 97025 + }, + { + "epoch": 14.451891569854036, + "grad_norm": 0.058349609375, + "learning_rate": 0.0065001470207697865, + "loss": 0.819, + "num_input_tokens_seen": 56341800, + "step": 97030 + }, + { + "epoch": 14.452636282394996, + "grad_norm": 0.03515625, + "learning_rate": 0.006498540668522272, + "loss": 0.805, + "num_input_tokens_seen": 56344520, + "step": 97035 + }, + { + "epoch": 14.453380994935955, + "grad_norm": 0.05322265625, + "learning_rate": 0.006496934459898278, + "loss": 0.789, + "num_input_tokens_seen": 56347816, + "step": 97040 + }, + { + "epoch": 14.454125707476914, + "grad_norm": 0.056640625, + "learning_rate": 0.0064953283949249286, + "loss": 0.7795, + "num_input_tokens_seen": 56350888, + "step": 97045 + }, + { + "epoch": 14.454870420017873, + "grad_norm": 0.056884765625, + "learning_rate": 0.00649372247362937, + "loss": 0.8019, + "num_input_tokens_seen": 56353800, + "step": 97050 + }, + { + "epoch": 14.455615132558833, + "grad_norm": 0.031494140625, + "learning_rate": 0.0064921166960387215, + "loss": 0.8022, + "num_input_tokens_seen": 56356680, + "step": 97055 + }, + { + "epoch": 14.456359845099792, + "grad_norm": 0.046875, + "learning_rate": 0.006490511062180111, + "loss": 0.7791, + "num_input_tokens_seen": 56359624, + "step": 97060 + }, + { + "epoch": 14.45710455764075, + "grad_norm": 0.037353515625, + "learning_rate": 0.0064889055720806724, + "loss": 0.7863, + "num_input_tokens_seen": 56362632, + "step": 97065 + }, + { + "epoch": 14.45784927018171, + "grad_norm": 0.053466796875, + "learning_rate": 0.006487300225767518, + "loss": 0.7832, + "num_input_tokens_seen": 56365352, + "step": 97070 + }, + { + "epoch": 14.458593982722668, + "grad_norm": 0.054443359375, + "learning_rate": 0.00648569502326778, + "loss": 0.7999, + "num_input_tokens_seen": 56368200, + "step": 97075 + }, + { + "epoch": 14.459338695263629, + "grad_norm": 0.03955078125, + "learning_rate": 0.006484089964608571, + "loss": 0.8169, + "num_input_tokens_seen": 56370888, + "step": 97080 + }, + { + "epoch": 14.460083407804587, + "grad_norm": 0.054931640625, + "learning_rate": 0.006482485049817007, + "loss": 0.8083, + "num_input_tokens_seen": 56373928, + "step": 97085 + }, + { + "epoch": 14.460828120345546, + "grad_norm": 0.043212890625, + "learning_rate": 0.006480880278920201, + "loss": 0.7858, + "num_input_tokens_seen": 56376552, + "step": 97090 + }, + { + "epoch": 14.461572832886505, + "grad_norm": 0.036865234375, + "learning_rate": 0.006479275651945259, + "loss": 0.7973, + "num_input_tokens_seen": 56379400, + "step": 97095 + }, + { + "epoch": 14.462317545427466, + "grad_norm": 0.045654296875, + "learning_rate": 0.006477671168919301, + "loss": 0.7952, + "num_input_tokens_seen": 56382408, + "step": 97100 + }, + { + "epoch": 14.463062257968424, + "grad_norm": 0.04150390625, + "learning_rate": 0.00647606682986942, + "loss": 0.7914, + "num_input_tokens_seen": 56385000, + "step": 97105 + }, + { + "epoch": 14.463806970509383, + "grad_norm": 0.050537109375, + "learning_rate": 0.006474462634822732, + "loss": 0.7894, + "num_input_tokens_seen": 56388200, + "step": 97110 + }, + { + "epoch": 14.464551683050342, + "grad_norm": 0.050048828125, + "learning_rate": 0.006472858583806329, + "loss": 0.8083, + "num_input_tokens_seen": 56390984, + "step": 97115 + }, + { + "epoch": 14.465296395591302, + "grad_norm": 0.044189453125, + "learning_rate": 0.006471254676847318, + "loss": 0.7984, + "num_input_tokens_seen": 56393704, + "step": 97120 + }, + { + "epoch": 14.466041108132261, + "grad_norm": 0.047119140625, + "learning_rate": 0.006469650913972785, + "loss": 0.7753, + "num_input_tokens_seen": 56396616, + "step": 97125 + }, + { + "epoch": 14.46678582067322, + "grad_norm": 0.0556640625, + "learning_rate": 0.006468047295209837, + "loss": 0.8006, + "num_input_tokens_seen": 56399528, + "step": 97130 + }, + { + "epoch": 14.467530533214179, + "grad_norm": 0.042724609375, + "learning_rate": 0.006466443820585557, + "loss": 0.7911, + "num_input_tokens_seen": 56402312, + "step": 97135 + }, + { + "epoch": 14.46827524575514, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0064648404901270305, + "loss": 0.8099, + "num_input_tokens_seen": 56405032, + "step": 97140 + }, + { + "epoch": 14.469019958296098, + "grad_norm": 0.048583984375, + "learning_rate": 0.0064632373038613555, + "loss": 0.7889, + "num_input_tokens_seen": 56407624, + "step": 97145 + }, + { + "epoch": 14.469764670837057, + "grad_norm": 0.052978515625, + "learning_rate": 0.006461634261815608, + "loss": 0.794, + "num_input_tokens_seen": 56410568, + "step": 97150 + }, + { + "epoch": 14.470509383378015, + "grad_norm": 0.052734375, + "learning_rate": 0.006460031364016873, + "loss": 0.799, + "num_input_tokens_seen": 56413736, + "step": 97155 + }, + { + "epoch": 14.471254095918976, + "grad_norm": 0.044189453125, + "learning_rate": 0.006458428610492222, + "loss": 0.8087, + "num_input_tokens_seen": 56416936, + "step": 97160 + }, + { + "epoch": 14.471998808459935, + "grad_norm": 0.04736328125, + "learning_rate": 0.006456826001268746, + "loss": 0.789, + "num_input_tokens_seen": 56420008, + "step": 97165 + }, + { + "epoch": 14.472743521000893, + "grad_norm": 0.06884765625, + "learning_rate": 0.006455223536373505, + "loss": 0.8062, + "num_input_tokens_seen": 56422760, + "step": 97170 + }, + { + "epoch": 14.473488233541852, + "grad_norm": 0.0380859375, + "learning_rate": 0.0064536212158335824, + "loss": 0.8089, + "num_input_tokens_seen": 56425832, + "step": 97175 + }, + { + "epoch": 14.474232946082813, + "grad_norm": 0.052001953125, + "learning_rate": 0.006452019039676044, + "loss": 0.7956, + "num_input_tokens_seen": 56428552, + "step": 97180 + }, + { + "epoch": 14.474977658623772, + "grad_norm": 0.041015625, + "learning_rate": 0.006450417007927951, + "loss": 0.8083, + "num_input_tokens_seen": 56431272, + "step": 97185 + }, + { + "epoch": 14.47572237116473, + "grad_norm": 0.026611328125, + "learning_rate": 0.006448815120616378, + "loss": 0.837, + "num_input_tokens_seen": 56434056, + "step": 97190 + }, + { + "epoch": 14.476467083705689, + "grad_norm": 0.0400390625, + "learning_rate": 0.0064472133777683785, + "loss": 0.8136, + "num_input_tokens_seen": 56437064, + "step": 97195 + }, + { + "epoch": 14.47721179624665, + "grad_norm": 0.0361328125, + "learning_rate": 0.00644561177941102, + "loss": 0.7893, + "num_input_tokens_seen": 56439944, + "step": 97200 + }, + { + "epoch": 14.477956508787608, + "grad_norm": 0.055419921875, + "learning_rate": 0.006444010325571357, + "loss": 0.7929, + "num_input_tokens_seen": 56442856, + "step": 97205 + }, + { + "epoch": 14.478701221328567, + "grad_norm": 0.043212890625, + "learning_rate": 0.006442409016276439, + "loss": 0.7899, + "num_input_tokens_seen": 56445864, + "step": 97210 + }, + { + "epoch": 14.479445933869526, + "grad_norm": 0.0220947265625, + "learning_rate": 0.006440807851553328, + "loss": 0.7962, + "num_input_tokens_seen": 56448648, + "step": 97215 + }, + { + "epoch": 14.480190646410486, + "grad_norm": 0.045654296875, + "learning_rate": 0.006439206831429069, + "loss": 0.7974, + "num_input_tokens_seen": 56451304, + "step": 97220 + }, + { + "epoch": 14.480935358951445, + "grad_norm": 0.03515625, + "learning_rate": 0.0064376059559307095, + "loss": 0.8145, + "num_input_tokens_seen": 56454312, + "step": 97225 + }, + { + "epoch": 14.481680071492404, + "grad_norm": 0.0390625, + "learning_rate": 0.00643600522508529, + "loss": 0.7899, + "num_input_tokens_seen": 56457608, + "step": 97230 + }, + { + "epoch": 14.482424784033363, + "grad_norm": 0.03857421875, + "learning_rate": 0.0064344046389198635, + "loss": 0.7871, + "num_input_tokens_seen": 56460584, + "step": 97235 + }, + { + "epoch": 14.483169496574323, + "grad_norm": 0.08349609375, + "learning_rate": 0.006432804197461458, + "loss": 0.797, + "num_input_tokens_seen": 56463624, + "step": 97240 + }, + { + "epoch": 14.483914209115282, + "grad_norm": 0.05419921875, + "learning_rate": 0.006431203900737125, + "loss": 0.7851, + "num_input_tokens_seen": 56466920, + "step": 97245 + }, + { + "epoch": 14.48465892165624, + "grad_norm": 0.03857421875, + "learning_rate": 0.006429603748773888, + "loss": 0.7912, + "num_input_tokens_seen": 56470056, + "step": 97250 + }, + { + "epoch": 14.4854036341972, + "grad_norm": 0.049560546875, + "learning_rate": 0.00642800374159879, + "loss": 0.8119, + "num_input_tokens_seen": 56473224, + "step": 97255 + }, + { + "epoch": 14.486148346738158, + "grad_norm": 0.0274658203125, + "learning_rate": 0.006426403879238858, + "loss": 0.788, + "num_input_tokens_seen": 56476104, + "step": 97260 + }, + { + "epoch": 14.486893059279119, + "grad_norm": 0.04443359375, + "learning_rate": 0.0064248041617211114, + "loss": 0.7945, + "num_input_tokens_seen": 56479272, + "step": 97265 + }, + { + "epoch": 14.487637771820078, + "grad_norm": 0.0478515625, + "learning_rate": 0.0064232045890725885, + "loss": 0.8142, + "num_input_tokens_seen": 56482280, + "step": 97270 + }, + { + "epoch": 14.488382484361036, + "grad_norm": 0.042236328125, + "learning_rate": 0.006421605161320307, + "loss": 0.787, + "num_input_tokens_seen": 56485032, + "step": 97275 + }, + { + "epoch": 14.489127196901995, + "grad_norm": 0.033935546875, + "learning_rate": 0.00642000587849129, + "loss": 0.7781, + "num_input_tokens_seen": 56488072, + "step": 97280 + }, + { + "epoch": 14.489871909442956, + "grad_norm": 0.03466796875, + "learning_rate": 0.006418406740612545, + "loss": 0.8123, + "num_input_tokens_seen": 56490920, + "step": 97285 + }, + { + "epoch": 14.490616621983914, + "grad_norm": 0.034423828125, + "learning_rate": 0.006416807747711104, + "loss": 0.7913, + "num_input_tokens_seen": 56493672, + "step": 97290 + }, + { + "epoch": 14.491361334524873, + "grad_norm": 0.034423828125, + "learning_rate": 0.006415208899813969, + "loss": 0.8077, + "num_input_tokens_seen": 56496424, + "step": 97295 + }, + { + "epoch": 14.492106047065832, + "grad_norm": 0.036376953125, + "learning_rate": 0.006413610196948152, + "loss": 0.7862, + "num_input_tokens_seen": 56499144, + "step": 97300 + }, + { + "epoch": 14.492850759606792, + "grad_norm": 0.03515625, + "learning_rate": 0.00641201163914067, + "loss": 0.8003, + "num_input_tokens_seen": 56502056, + "step": 97305 + }, + { + "epoch": 14.493595472147751, + "grad_norm": 0.052001953125, + "learning_rate": 0.006410413226418516, + "loss": 0.8022, + "num_input_tokens_seen": 56504648, + "step": 97310 + }, + { + "epoch": 14.49434018468871, + "grad_norm": 0.044921875, + "learning_rate": 0.006408814958808706, + "loss": 0.7873, + "num_input_tokens_seen": 56507496, + "step": 97315 + }, + { + "epoch": 14.495084897229669, + "grad_norm": 0.03271484375, + "learning_rate": 0.00640721683633823, + "loss": 0.8312, + "num_input_tokens_seen": 56510312, + "step": 97320 + }, + { + "epoch": 14.49582960977063, + "grad_norm": 0.037109375, + "learning_rate": 0.006405618859034097, + "loss": 0.8081, + "num_input_tokens_seen": 56513288, + "step": 97325 + }, + { + "epoch": 14.496574322311588, + "grad_norm": 0.04345703125, + "learning_rate": 0.0064040210269232945, + "loss": 0.7928, + "num_input_tokens_seen": 56515944, + "step": 97330 + }, + { + "epoch": 14.497319034852547, + "grad_norm": 0.045166015625, + "learning_rate": 0.006402423340032825, + "loss": 0.8154, + "num_input_tokens_seen": 56518888, + "step": 97335 + }, + { + "epoch": 14.498063747393505, + "grad_norm": 0.047607421875, + "learning_rate": 0.006400825798389676, + "loss": 0.7953, + "num_input_tokens_seen": 56521736, + "step": 97340 + }, + { + "epoch": 14.498808459934466, + "grad_norm": 0.033203125, + "learning_rate": 0.006399228402020834, + "loss": 0.7897, + "num_input_tokens_seen": 56524328, + "step": 97345 + }, + { + "epoch": 14.499553172475425, + "grad_norm": 0.04248046875, + "learning_rate": 0.006397631150953287, + "loss": 0.8039, + "num_input_tokens_seen": 56527336, + "step": 97350 + }, + { + "epoch": 14.500297885016384, + "grad_norm": 0.0419921875, + "learning_rate": 0.006396034045214012, + "loss": 0.7947, + "num_input_tokens_seen": 56530088, + "step": 97355 + }, + { + "epoch": 14.501042597557342, + "grad_norm": 0.04541015625, + "learning_rate": 0.006394437084830004, + "loss": 0.7887, + "num_input_tokens_seen": 56532904, + "step": 97360 + }, + { + "epoch": 14.501787310098303, + "grad_norm": 0.06494140625, + "learning_rate": 0.00639284026982823, + "loss": 0.8015, + "num_input_tokens_seen": 56536040, + "step": 97365 + }, + { + "epoch": 14.502532022639262, + "grad_norm": 0.041259765625, + "learning_rate": 0.006391243600235677, + "loss": 0.793, + "num_input_tokens_seen": 56538952, + "step": 97370 + }, + { + "epoch": 14.50327673518022, + "grad_norm": 0.04052734375, + "learning_rate": 0.006389647076079314, + "loss": 0.827, + "num_input_tokens_seen": 56541928, + "step": 97375 + }, + { + "epoch": 14.504021447721179, + "grad_norm": 0.052001953125, + "learning_rate": 0.0063880506973861065, + "loss": 0.8026, + "num_input_tokens_seen": 56544808, + "step": 97380 + }, + { + "epoch": 14.50476616026214, + "grad_norm": 0.048583984375, + "learning_rate": 0.0063864544641830355, + "loss": 0.8028, + "num_input_tokens_seen": 56547400, + "step": 97385 + }, + { + "epoch": 14.505510872803098, + "grad_norm": 0.03564453125, + "learning_rate": 0.006384858376497056, + "loss": 0.8125, + "num_input_tokens_seen": 56550056, + "step": 97390 + }, + { + "epoch": 14.506255585344057, + "grad_norm": 0.039306640625, + "learning_rate": 0.006383262434355144, + "loss": 0.8048, + "num_input_tokens_seen": 56552872, + "step": 97395 + }, + { + "epoch": 14.507000297885016, + "grad_norm": 0.038330078125, + "learning_rate": 0.006381666637784249, + "loss": 0.7795, + "num_input_tokens_seen": 56555848, + "step": 97400 + }, + { + "epoch": 14.507745010425975, + "grad_norm": 0.037841796875, + "learning_rate": 0.0063800709868113425, + "loss": 0.7948, + "num_input_tokens_seen": 56558952, + "step": 97405 + }, + { + "epoch": 14.508489722966935, + "grad_norm": 0.05322265625, + "learning_rate": 0.006378475481463378, + "loss": 0.7753, + "num_input_tokens_seen": 56562088, + "step": 97410 + }, + { + "epoch": 14.509234435507894, + "grad_norm": 0.048583984375, + "learning_rate": 0.006376880121767304, + "loss": 0.7942, + "num_input_tokens_seen": 56564872, + "step": 97415 + }, + { + "epoch": 14.509979148048853, + "grad_norm": 0.0361328125, + "learning_rate": 0.006375284907750079, + "loss": 0.7907, + "num_input_tokens_seen": 56567848, + "step": 97420 + }, + { + "epoch": 14.510723860589813, + "grad_norm": 0.0361328125, + "learning_rate": 0.0063736898394386446, + "loss": 0.7905, + "num_input_tokens_seen": 56570696, + "step": 97425 + }, + { + "epoch": 14.511468573130772, + "grad_norm": 0.0269775390625, + "learning_rate": 0.006372094916859958, + "loss": 0.8035, + "num_input_tokens_seen": 56573448, + "step": 97430 + }, + { + "epoch": 14.51221328567173, + "grad_norm": 0.0439453125, + "learning_rate": 0.006370500140040951, + "loss": 0.7953, + "num_input_tokens_seen": 56576424, + "step": 97435 + }, + { + "epoch": 14.51295799821269, + "grad_norm": 0.03857421875, + "learning_rate": 0.006368905509008581, + "loss": 0.7822, + "num_input_tokens_seen": 56579432, + "step": 97440 + }, + { + "epoch": 14.513702710753648, + "grad_norm": 0.057861328125, + "learning_rate": 0.0063673110237897745, + "loss": 0.7968, + "num_input_tokens_seen": 56582728, + "step": 97445 + }, + { + "epoch": 14.514447423294609, + "grad_norm": 0.034912109375, + "learning_rate": 0.00636571668441148, + "loss": 0.8035, + "num_input_tokens_seen": 56585672, + "step": 97450 + }, + { + "epoch": 14.515192135835568, + "grad_norm": 0.044921875, + "learning_rate": 0.0063641224909006255, + "loss": 0.788, + "num_input_tokens_seen": 56588776, + "step": 97455 + }, + { + "epoch": 14.515936848376526, + "grad_norm": 0.0302734375, + "learning_rate": 0.0063625284432841395, + "loss": 0.8128, + "num_input_tokens_seen": 56591656, + "step": 97460 + }, + { + "epoch": 14.516681560917485, + "grad_norm": 0.052490234375, + "learning_rate": 0.006360934541588964, + "loss": 0.8024, + "num_input_tokens_seen": 56594600, + "step": 97465 + }, + { + "epoch": 14.517426273458446, + "grad_norm": 0.058837890625, + "learning_rate": 0.006359340785842011, + "loss": 0.7856, + "num_input_tokens_seen": 56597448, + "step": 97470 + }, + { + "epoch": 14.518170985999404, + "grad_norm": 0.046142578125, + "learning_rate": 0.0063577471760702204, + "loss": 0.8042, + "num_input_tokens_seen": 56600424, + "step": 97475 + }, + { + "epoch": 14.518915698540363, + "grad_norm": 0.0732421875, + "learning_rate": 0.0063561537123005074, + "loss": 0.796, + "num_input_tokens_seen": 56603464, + "step": 97480 + }, + { + "epoch": 14.519660411081322, + "grad_norm": 0.039306640625, + "learning_rate": 0.006354560394559792, + "loss": 0.8002, + "num_input_tokens_seen": 56606344, + "step": 97485 + }, + { + "epoch": 14.520405123622282, + "grad_norm": 0.03466796875, + "learning_rate": 0.006352967222874987, + "loss": 0.7733, + "num_input_tokens_seen": 56609384, + "step": 97490 + }, + { + "epoch": 14.521149836163241, + "grad_norm": 0.039794921875, + "learning_rate": 0.006351374197273017, + "loss": 0.7709, + "num_input_tokens_seen": 56612008, + "step": 97495 + }, + { + "epoch": 14.5218945487042, + "grad_norm": 0.047119140625, + "learning_rate": 0.00634978131778079, + "loss": 0.7807, + "num_input_tokens_seen": 56614824, + "step": 97500 + }, + { + "epoch": 14.522639261245159, + "grad_norm": 0.045166015625, + "learning_rate": 0.006348188584425211, + "loss": 0.8023, + "num_input_tokens_seen": 56617576, + "step": 97505 + }, + { + "epoch": 14.52338397378612, + "grad_norm": 0.041259765625, + "learning_rate": 0.006346595997233198, + "loss": 0.7763, + "num_input_tokens_seen": 56620584, + "step": 97510 + }, + { + "epoch": 14.524128686327078, + "grad_norm": 0.046142578125, + "learning_rate": 0.006345003556231644, + "loss": 0.7973, + "num_input_tokens_seen": 56623272, + "step": 97515 + }, + { + "epoch": 14.524873398868037, + "grad_norm": 0.035400390625, + "learning_rate": 0.0063434112614474646, + "loss": 0.7934, + "num_input_tokens_seen": 56626088, + "step": 97520 + }, + { + "epoch": 14.525618111408996, + "grad_norm": 0.04150390625, + "learning_rate": 0.006341819112907549, + "loss": 0.8047, + "num_input_tokens_seen": 56628904, + "step": 97525 + }, + { + "epoch": 14.526362823949956, + "grad_norm": 0.056640625, + "learning_rate": 0.006340227110638805, + "loss": 0.7911, + "num_input_tokens_seen": 56631624, + "step": 97530 + }, + { + "epoch": 14.527107536490915, + "grad_norm": 0.04345703125, + "learning_rate": 0.006338635254668121, + "loss": 0.8042, + "num_input_tokens_seen": 56634536, + "step": 97535 + }, + { + "epoch": 14.527852249031874, + "grad_norm": 0.038818359375, + "learning_rate": 0.006337043545022387, + "loss": 0.802, + "num_input_tokens_seen": 56637384, + "step": 97540 + }, + { + "epoch": 14.528596961572832, + "grad_norm": 0.051025390625, + "learning_rate": 0.006335451981728504, + "loss": 0.7851, + "num_input_tokens_seen": 56640072, + "step": 97545 + }, + { + "epoch": 14.529341674113793, + "grad_norm": 0.033447265625, + "learning_rate": 0.006333860564813353, + "loss": 0.8102, + "num_input_tokens_seen": 56643112, + "step": 97550 + }, + { + "epoch": 14.530086386654752, + "grad_norm": 0.022216796875, + "learning_rate": 0.00633226929430382, + "loss": 0.8047, + "num_input_tokens_seen": 56646024, + "step": 97555 + }, + { + "epoch": 14.53083109919571, + "grad_norm": 0.09912109375, + "learning_rate": 0.006330678170226781, + "loss": 0.799, + "num_input_tokens_seen": 56649032, + "step": 97560 + }, + { + "epoch": 14.53157581173667, + "grad_norm": 0.042236328125, + "learning_rate": 0.00632908719260913, + "loss": 0.8032, + "num_input_tokens_seen": 56651752, + "step": 97565 + }, + { + "epoch": 14.53232052427763, + "grad_norm": 0.03271484375, + "learning_rate": 0.0063274963614777325, + "loss": 0.8049, + "num_input_tokens_seen": 56654536, + "step": 97570 + }, + { + "epoch": 14.533065236818588, + "grad_norm": 0.031005859375, + "learning_rate": 0.0063259056768594744, + "loss": 0.7839, + "num_input_tokens_seen": 56657256, + "step": 97575 + }, + { + "epoch": 14.533809949359547, + "grad_norm": 0.03857421875, + "learning_rate": 0.006324315138781225, + "loss": 0.7981, + "num_input_tokens_seen": 56659880, + "step": 97580 + }, + { + "epoch": 14.534554661900506, + "grad_norm": 0.0478515625, + "learning_rate": 0.006322724747269848, + "loss": 0.7844, + "num_input_tokens_seen": 56662760, + "step": 97585 + }, + { + "epoch": 14.535299374441465, + "grad_norm": 0.04052734375, + "learning_rate": 0.006321134502352222, + "loss": 0.8, + "num_input_tokens_seen": 56665672, + "step": 97590 + }, + { + "epoch": 14.536044086982425, + "grad_norm": 0.043212890625, + "learning_rate": 0.006319544404055202, + "loss": 0.7965, + "num_input_tokens_seen": 56668712, + "step": 97595 + }, + { + "epoch": 14.536788799523384, + "grad_norm": 0.041015625, + "learning_rate": 0.006317954452405664, + "loss": 0.7961, + "num_input_tokens_seen": 56671496, + "step": 97600 + }, + { + "epoch": 14.537533512064343, + "grad_norm": 0.0498046875, + "learning_rate": 0.00631636464743046, + "loss": 0.8392, + "num_input_tokens_seen": 56674312, + "step": 97605 + }, + { + "epoch": 14.538278224605303, + "grad_norm": 0.03271484375, + "learning_rate": 0.00631477498915645, + "loss": 0.7876, + "num_input_tokens_seen": 56677192, + "step": 97610 + }, + { + "epoch": 14.539022937146262, + "grad_norm": 0.038818359375, + "learning_rate": 0.006313185477610489, + "loss": 0.808, + "num_input_tokens_seen": 56680168, + "step": 97615 + }, + { + "epoch": 14.53976764968722, + "grad_norm": 0.0380859375, + "learning_rate": 0.006311596112819426, + "loss": 0.7979, + "num_input_tokens_seen": 56682728, + "step": 97620 + }, + { + "epoch": 14.54051236222818, + "grad_norm": 0.04833984375, + "learning_rate": 0.00631000689481012, + "loss": 0.8007, + "num_input_tokens_seen": 56685608, + "step": 97625 + }, + { + "epoch": 14.541257074769138, + "grad_norm": 0.032470703125, + "learning_rate": 0.006308417823609412, + "loss": 0.799, + "num_input_tokens_seen": 56688776, + "step": 97630 + }, + { + "epoch": 14.542001787310099, + "grad_norm": 0.051513671875, + "learning_rate": 0.006306828899244156, + "loss": 0.7827, + "num_input_tokens_seen": 56691528, + "step": 97635 + }, + { + "epoch": 14.542746499851058, + "grad_norm": 0.038330078125, + "learning_rate": 0.006305240121741184, + "loss": 0.7998, + "num_input_tokens_seen": 56694568, + "step": 97640 + }, + { + "epoch": 14.543491212392016, + "grad_norm": 0.0458984375, + "learning_rate": 0.006303651491127349, + "loss": 0.791, + "num_input_tokens_seen": 56697768, + "step": 97645 + }, + { + "epoch": 14.544235924932975, + "grad_norm": 0.051513671875, + "learning_rate": 0.006302063007429479, + "loss": 0.8094, + "num_input_tokens_seen": 56700936, + "step": 97650 + }, + { + "epoch": 14.544980637473936, + "grad_norm": 0.05078125, + "learning_rate": 0.006300474670674418, + "loss": 0.7975, + "num_input_tokens_seen": 56703560, + "step": 97655 + }, + { + "epoch": 14.545725350014894, + "grad_norm": 0.036865234375, + "learning_rate": 0.006298886480888997, + "loss": 0.7984, + "num_input_tokens_seen": 56706568, + "step": 97660 + }, + { + "epoch": 14.546470062555853, + "grad_norm": 0.04833984375, + "learning_rate": 0.0062972984381000395, + "loss": 0.8196, + "num_input_tokens_seen": 56709512, + "step": 97665 + }, + { + "epoch": 14.547214775096812, + "grad_norm": 0.035400390625, + "learning_rate": 0.006295710542334386, + "loss": 0.8036, + "num_input_tokens_seen": 56712616, + "step": 97670 + }, + { + "epoch": 14.547959487637772, + "grad_norm": 0.04833984375, + "learning_rate": 0.006294122793618856, + "loss": 0.7698, + "num_input_tokens_seen": 56715272, + "step": 97675 + }, + { + "epoch": 14.548704200178731, + "grad_norm": 0.043701171875, + "learning_rate": 0.006292535191980273, + "loss": 0.8224, + "num_input_tokens_seen": 56718248, + "step": 97680 + }, + { + "epoch": 14.54944891271969, + "grad_norm": 0.0439453125, + "learning_rate": 0.006290947737445453, + "loss": 0.797, + "num_input_tokens_seen": 56720808, + "step": 97685 + }, + { + "epoch": 14.550193625260649, + "grad_norm": 0.037109375, + "learning_rate": 0.006289360430041225, + "loss": 0.7828, + "num_input_tokens_seen": 56723752, + "step": 97690 + }, + { + "epoch": 14.55093833780161, + "grad_norm": 0.04736328125, + "learning_rate": 0.006287773269794398, + "loss": 0.7734, + "num_input_tokens_seen": 56726792, + "step": 97695 + }, + { + "epoch": 14.551683050342568, + "grad_norm": 0.04345703125, + "learning_rate": 0.006286186256731782, + "loss": 0.8083, + "num_input_tokens_seen": 56729576, + "step": 97700 + }, + { + "epoch": 14.552427762883527, + "grad_norm": 0.0419921875, + "learning_rate": 0.006284599390880198, + "loss": 0.808, + "num_input_tokens_seen": 56732424, + "step": 97705 + }, + { + "epoch": 14.553172475424486, + "grad_norm": 0.049072265625, + "learning_rate": 0.006283012672266445, + "loss": 0.8025, + "num_input_tokens_seen": 56735208, + "step": 97710 + }, + { + "epoch": 14.553917187965446, + "grad_norm": 0.031494140625, + "learning_rate": 0.006281426100917339, + "loss": 0.7993, + "num_input_tokens_seen": 56737992, + "step": 97715 + }, + { + "epoch": 14.554661900506405, + "grad_norm": 0.0478515625, + "learning_rate": 0.0062798396768596705, + "loss": 0.825, + "num_input_tokens_seen": 56740968, + "step": 97720 + }, + { + "epoch": 14.555406613047364, + "grad_norm": 0.053466796875, + "learning_rate": 0.006278253400120255, + "loss": 0.846, + "num_input_tokens_seen": 56743880, + "step": 97725 + }, + { + "epoch": 14.556151325588322, + "grad_norm": 0.049072265625, + "learning_rate": 0.006276667270725878, + "loss": 0.7916, + "num_input_tokens_seen": 56746568, + "step": 97730 + }, + { + "epoch": 14.556896038129283, + "grad_norm": 0.07177734375, + "learning_rate": 0.006275081288703347, + "loss": 0.8103, + "num_input_tokens_seen": 56749448, + "step": 97735 + }, + { + "epoch": 14.557640750670242, + "grad_norm": 0.05615234375, + "learning_rate": 0.00627349545407945, + "loss": 0.7863, + "num_input_tokens_seen": 56752392, + "step": 97740 + }, + { + "epoch": 14.5583854632112, + "grad_norm": 0.040771484375, + "learning_rate": 0.006271909766880979, + "loss": 0.7851, + "num_input_tokens_seen": 56755624, + "step": 97745 + }, + { + "epoch": 14.55913017575216, + "grad_norm": 0.03466796875, + "learning_rate": 0.006270324227134721, + "loss": 0.7961, + "num_input_tokens_seen": 56758792, + "step": 97750 + }, + { + "epoch": 14.55987488829312, + "grad_norm": 0.0498046875, + "learning_rate": 0.006268738834867455, + "loss": 0.8254, + "num_input_tokens_seen": 56762184, + "step": 97755 + }, + { + "epoch": 14.560619600834078, + "grad_norm": 0.042236328125, + "learning_rate": 0.006267153590105981, + "loss": 0.7979, + "num_input_tokens_seen": 56764872, + "step": 97760 + }, + { + "epoch": 14.561364313375037, + "grad_norm": 0.0230712890625, + "learning_rate": 0.006265568492877064, + "loss": 0.7832, + "num_input_tokens_seen": 56767848, + "step": 97765 + }, + { + "epoch": 14.562109025915996, + "grad_norm": 0.048095703125, + "learning_rate": 0.006263983543207497, + "loss": 0.7933, + "num_input_tokens_seen": 56770888, + "step": 97770 + }, + { + "epoch": 14.562853738456955, + "grad_norm": 0.033203125, + "learning_rate": 0.006262398741124042, + "loss": 0.8077, + "num_input_tokens_seen": 56773608, + "step": 97775 + }, + { + "epoch": 14.563598450997915, + "grad_norm": 0.041015625, + "learning_rate": 0.006260814086653487, + "loss": 0.7888, + "num_input_tokens_seen": 56776680, + "step": 97780 + }, + { + "epoch": 14.564343163538874, + "grad_norm": 0.04638671875, + "learning_rate": 0.006259229579822594, + "loss": 0.7938, + "num_input_tokens_seen": 56779432, + "step": 97785 + }, + { + "epoch": 14.565087876079833, + "grad_norm": 0.052978515625, + "learning_rate": 0.006257645220658131, + "loss": 0.8002, + "num_input_tokens_seen": 56782120, + "step": 97790 + }, + { + "epoch": 14.565832588620792, + "grad_norm": 0.031005859375, + "learning_rate": 0.006256061009186869, + "loss": 0.7815, + "num_input_tokens_seen": 56785064, + "step": 97795 + }, + { + "epoch": 14.566577301161752, + "grad_norm": 0.06982421875, + "learning_rate": 0.006254476945435566, + "loss": 0.8211, + "num_input_tokens_seen": 56787848, + "step": 97800 + }, + { + "epoch": 14.56732201370271, + "grad_norm": 0.043212890625, + "learning_rate": 0.006252893029430991, + "loss": 0.8041, + "num_input_tokens_seen": 56790536, + "step": 97805 + }, + { + "epoch": 14.56806672624367, + "grad_norm": 0.0341796875, + "learning_rate": 0.006251309261199898, + "loss": 0.7897, + "num_input_tokens_seen": 56793416, + "step": 97810 + }, + { + "epoch": 14.568811438784628, + "grad_norm": 0.05224609375, + "learning_rate": 0.006249725640769044, + "loss": 0.8053, + "num_input_tokens_seen": 56796136, + "step": 97815 + }, + { + "epoch": 14.569556151325589, + "grad_norm": 0.05029296875, + "learning_rate": 0.00624814216816518, + "loss": 0.7899, + "num_input_tokens_seen": 56799240, + "step": 97820 + }, + { + "epoch": 14.570300863866548, + "grad_norm": 0.033935546875, + "learning_rate": 0.006246558843415055, + "loss": 0.8, + "num_input_tokens_seen": 56802184, + "step": 97825 + }, + { + "epoch": 14.571045576407506, + "grad_norm": 0.031982421875, + "learning_rate": 0.006244975666545427, + "loss": 0.7879, + "num_input_tokens_seen": 56804840, + "step": 97830 + }, + { + "epoch": 14.571790288948465, + "grad_norm": 0.0252685546875, + "learning_rate": 0.006243392637583032, + "loss": 0.7958, + "num_input_tokens_seen": 56807624, + "step": 97835 + }, + { + "epoch": 14.572535001489426, + "grad_norm": 0.03662109375, + "learning_rate": 0.006241809756554623, + "loss": 0.8015, + "num_input_tokens_seen": 56810600, + "step": 97840 + }, + { + "epoch": 14.573279714030384, + "grad_norm": 0.057861328125, + "learning_rate": 0.006240227023486931, + "loss": 0.7864, + "num_input_tokens_seen": 56813416, + "step": 97845 + }, + { + "epoch": 14.574024426571343, + "grad_norm": 0.044921875, + "learning_rate": 0.006238644438406707, + "loss": 0.8081, + "num_input_tokens_seen": 56816200, + "step": 97850 + }, + { + "epoch": 14.574769139112302, + "grad_norm": 0.033203125, + "learning_rate": 0.006237062001340676, + "loss": 0.8045, + "num_input_tokens_seen": 56819368, + "step": 97855 + }, + { + "epoch": 14.575513851653263, + "grad_norm": 0.0390625, + "learning_rate": 0.00623547971231558, + "loss": 0.7715, + "num_input_tokens_seen": 56822216, + "step": 97860 + }, + { + "epoch": 14.576258564194221, + "grad_norm": 0.055419921875, + "learning_rate": 0.006233897571358148, + "loss": 0.8057, + "num_input_tokens_seen": 56825064, + "step": 97865 + }, + { + "epoch": 14.57700327673518, + "grad_norm": 0.03759765625, + "learning_rate": 0.0062323155784951, + "loss": 0.8012, + "num_input_tokens_seen": 56828264, + "step": 97870 + }, + { + "epoch": 14.577747989276139, + "grad_norm": 0.03369140625, + "learning_rate": 0.006230733733753175, + "loss": 0.7672, + "num_input_tokens_seen": 56831176, + "step": 97875 + }, + { + "epoch": 14.5784927018171, + "grad_norm": 0.037841796875, + "learning_rate": 0.006229152037159091, + "loss": 0.7782, + "num_input_tokens_seen": 56834216, + "step": 97880 + }, + { + "epoch": 14.579237414358058, + "grad_norm": 0.0498046875, + "learning_rate": 0.006227570488739569, + "loss": 0.8002, + "num_input_tokens_seen": 56837160, + "step": 97885 + }, + { + "epoch": 14.579982126899017, + "grad_norm": 0.03271484375, + "learning_rate": 0.0062259890885213226, + "loss": 0.7962, + "num_input_tokens_seen": 56840200, + "step": 97890 + }, + { + "epoch": 14.580726839439976, + "grad_norm": 0.05078125, + "learning_rate": 0.006224407836531079, + "loss": 0.7992, + "num_input_tokens_seen": 56843016, + "step": 97895 + }, + { + "epoch": 14.581471551980936, + "grad_norm": 0.08447265625, + "learning_rate": 0.006222826732795545, + "loss": 0.8099, + "num_input_tokens_seen": 56845992, + "step": 97900 + }, + { + "epoch": 14.582216264521895, + "grad_norm": 0.04248046875, + "learning_rate": 0.006221245777341426, + "loss": 0.7822, + "num_input_tokens_seen": 56849032, + "step": 97905 + }, + { + "epoch": 14.582960977062854, + "grad_norm": 0.057373046875, + "learning_rate": 0.006219664970195445, + "loss": 0.8065, + "num_input_tokens_seen": 56851912, + "step": 97910 + }, + { + "epoch": 14.583705689603812, + "grad_norm": 0.0277099609375, + "learning_rate": 0.006218084311384294, + "loss": 0.794, + "num_input_tokens_seen": 56854728, + "step": 97915 + }, + { + "epoch": 14.584450402144771, + "grad_norm": 0.035400390625, + "learning_rate": 0.006216503800934688, + "loss": 0.8414, + "num_input_tokens_seen": 56857352, + "step": 97920 + }, + { + "epoch": 14.585195114685732, + "grad_norm": 0.025390625, + "learning_rate": 0.006214923438873318, + "loss": 0.8061, + "num_input_tokens_seen": 56860200, + "step": 97925 + }, + { + "epoch": 14.58593982722669, + "grad_norm": 0.1611328125, + "learning_rate": 0.006213343225226893, + "loss": 0.8378, + "num_input_tokens_seen": 56863144, + "step": 97930 + }, + { + "epoch": 14.58668453976765, + "grad_norm": 0.046630859375, + "learning_rate": 0.006211763160022105, + "loss": 0.7926, + "num_input_tokens_seen": 56866152, + "step": 97935 + }, + { + "epoch": 14.58742925230861, + "grad_norm": 0.053955078125, + "learning_rate": 0.006210183243285645, + "loss": 0.7934, + "num_input_tokens_seen": 56869064, + "step": 97940 + }, + { + "epoch": 14.588173964849569, + "grad_norm": 0.040283203125, + "learning_rate": 0.006208603475044207, + "loss": 0.7856, + "num_input_tokens_seen": 56871656, + "step": 97945 + }, + { + "epoch": 14.588918677390527, + "grad_norm": 0.031982421875, + "learning_rate": 0.006207023855324472, + "loss": 0.818, + "num_input_tokens_seen": 56874536, + "step": 97950 + }, + { + "epoch": 14.589663389931486, + "grad_norm": 0.034423828125, + "learning_rate": 0.0062054443841531364, + "loss": 0.7824, + "num_input_tokens_seen": 56877416, + "step": 97955 + }, + { + "epoch": 14.590408102472445, + "grad_norm": 0.048828125, + "learning_rate": 0.006203865061556875, + "loss": 0.7848, + "num_input_tokens_seen": 56880136, + "step": 97960 + }, + { + "epoch": 14.591152815013405, + "grad_norm": 0.0277099609375, + "learning_rate": 0.00620228588756238, + "loss": 0.7897, + "num_input_tokens_seen": 56883176, + "step": 97965 + }, + { + "epoch": 14.591897527554364, + "grad_norm": 0.0595703125, + "learning_rate": 0.006200706862196317, + "loss": 0.8333, + "num_input_tokens_seen": 56886280, + "step": 97970 + }, + { + "epoch": 14.592642240095323, + "grad_norm": 0.04931640625, + "learning_rate": 0.006199127985485373, + "loss": 0.8059, + "num_input_tokens_seen": 56888744, + "step": 97975 + }, + { + "epoch": 14.593386952636282, + "grad_norm": 0.0634765625, + "learning_rate": 0.006197549257456217, + "loss": 0.7851, + "num_input_tokens_seen": 56891848, + "step": 97980 + }, + { + "epoch": 14.594131665177242, + "grad_norm": 0.035400390625, + "learning_rate": 0.006195970678135515, + "loss": 0.805, + "num_input_tokens_seen": 56894632, + "step": 97985 + }, + { + "epoch": 14.594876377718201, + "grad_norm": 0.034423828125, + "learning_rate": 0.006194392247549946, + "loss": 0.798, + "num_input_tokens_seen": 56897608, + "step": 97990 + }, + { + "epoch": 14.59562109025916, + "grad_norm": 0.099609375, + "learning_rate": 0.006192813965726165, + "loss": 0.8115, + "num_input_tokens_seen": 56900552, + "step": 97995 + }, + { + "epoch": 14.596365802800118, + "grad_norm": 0.028564453125, + "learning_rate": 0.006191235832690847, + "loss": 0.7959, + "num_input_tokens_seen": 56903400, + "step": 98000 + }, + { + "epoch": 14.597110515341079, + "grad_norm": 0.0703125, + "learning_rate": 0.006189657848470646, + "loss": 0.7867, + "num_input_tokens_seen": 56906536, + "step": 98005 + }, + { + "epoch": 14.597855227882038, + "grad_norm": 0.03369140625, + "learning_rate": 0.0061880800130922195, + "loss": 0.7931, + "num_input_tokens_seen": 56909352, + "step": 98010 + }, + { + "epoch": 14.598599940422996, + "grad_norm": 0.039794921875, + "learning_rate": 0.006186502326582221, + "loss": 0.7946, + "num_input_tokens_seen": 56912584, + "step": 98015 + }, + { + "epoch": 14.599344652963955, + "grad_norm": 0.03515625, + "learning_rate": 0.006184924788967315, + "loss": 0.8031, + "num_input_tokens_seen": 56915432, + "step": 98020 + }, + { + "epoch": 14.600089365504916, + "grad_norm": 0.0556640625, + "learning_rate": 0.006183347400274144, + "loss": 0.7829, + "num_input_tokens_seen": 56918280, + "step": 98025 + }, + { + "epoch": 14.600834078045875, + "grad_norm": 0.03173828125, + "learning_rate": 0.0061817701605293525, + "loss": 0.7955, + "num_input_tokens_seen": 56921128, + "step": 98030 + }, + { + "epoch": 14.601578790586833, + "grad_norm": 0.033203125, + "learning_rate": 0.006180193069759597, + "loss": 0.8437, + "num_input_tokens_seen": 56923816, + "step": 98035 + }, + { + "epoch": 14.602323503127792, + "grad_norm": 0.04931640625, + "learning_rate": 0.00617861612799151, + "loss": 0.8122, + "num_input_tokens_seen": 56927176, + "step": 98040 + }, + { + "epoch": 14.603068215668753, + "grad_norm": 0.0311279296875, + "learning_rate": 0.006177039335251743, + "loss": 0.8005, + "num_input_tokens_seen": 56929992, + "step": 98045 + }, + { + "epoch": 14.603812928209711, + "grad_norm": 0.050537109375, + "learning_rate": 0.006175462691566925, + "loss": 0.7985, + "num_input_tokens_seen": 56932840, + "step": 98050 + }, + { + "epoch": 14.60455764075067, + "grad_norm": 0.03759765625, + "learning_rate": 0.006173886196963701, + "loss": 0.8083, + "num_input_tokens_seen": 56935848, + "step": 98055 + }, + { + "epoch": 14.605302353291629, + "grad_norm": 0.04541015625, + "learning_rate": 0.006172309851468699, + "loss": 0.8131, + "num_input_tokens_seen": 56938664, + "step": 98060 + }, + { + "epoch": 14.60604706583259, + "grad_norm": 0.060546875, + "learning_rate": 0.006170733655108544, + "loss": 0.7753, + "num_input_tokens_seen": 56941384, + "step": 98065 + }, + { + "epoch": 14.606791778373548, + "grad_norm": 0.0498046875, + "learning_rate": 0.006169157607909876, + "loss": 0.7934, + "num_input_tokens_seen": 56944072, + "step": 98070 + }, + { + "epoch": 14.607536490914507, + "grad_norm": 0.034912109375, + "learning_rate": 0.006167581709899315, + "loss": 0.7961, + "num_input_tokens_seen": 56947080, + "step": 98075 + }, + { + "epoch": 14.608281203455466, + "grad_norm": 0.0556640625, + "learning_rate": 0.006166005961103485, + "loss": 0.7854, + "num_input_tokens_seen": 56949992, + "step": 98080 + }, + { + "epoch": 14.609025915996426, + "grad_norm": 0.021484375, + "learning_rate": 0.006164430361548999, + "loss": 0.8048, + "num_input_tokens_seen": 56952488, + "step": 98085 + }, + { + "epoch": 14.609770628537385, + "grad_norm": 0.0235595703125, + "learning_rate": 0.006162854911262488, + "loss": 0.7926, + "num_input_tokens_seen": 56955272, + "step": 98090 + }, + { + "epoch": 14.610515341078344, + "grad_norm": 0.0703125, + "learning_rate": 0.006161279610270556, + "loss": 0.7752, + "num_input_tokens_seen": 56957928, + "step": 98095 + }, + { + "epoch": 14.611260053619302, + "grad_norm": 0.0458984375, + "learning_rate": 0.0061597044585998285, + "loss": 0.7891, + "num_input_tokens_seen": 56960904, + "step": 98100 + }, + { + "epoch": 14.612004766160261, + "grad_norm": 0.038330078125, + "learning_rate": 0.006158129456276908, + "loss": 0.7951, + "num_input_tokens_seen": 56963880, + "step": 98105 + }, + { + "epoch": 14.612749478701222, + "grad_norm": 0.08154296875, + "learning_rate": 0.006156554603328398, + "loss": 0.8182, + "num_input_tokens_seen": 56966888, + "step": 98110 + }, + { + "epoch": 14.61349419124218, + "grad_norm": 0.036865234375, + "learning_rate": 0.006154979899780917, + "loss": 0.8008, + "num_input_tokens_seen": 56969544, + "step": 98115 + }, + { + "epoch": 14.61423890378314, + "grad_norm": 0.034423828125, + "learning_rate": 0.006153405345661054, + "loss": 0.7929, + "num_input_tokens_seen": 56972456, + "step": 98120 + }, + { + "epoch": 14.6149836163241, + "grad_norm": 0.03466796875, + "learning_rate": 0.006151830940995423, + "loss": 0.806, + "num_input_tokens_seen": 56975400, + "step": 98125 + }, + { + "epoch": 14.615728328865059, + "grad_norm": 0.03466796875, + "learning_rate": 0.006150256685810609, + "loss": 0.8017, + "num_input_tokens_seen": 56978440, + "step": 98130 + }, + { + "epoch": 14.616473041406017, + "grad_norm": 0.046875, + "learning_rate": 0.006148682580133219, + "loss": 0.7915, + "num_input_tokens_seen": 56981448, + "step": 98135 + }, + { + "epoch": 14.617217753946976, + "grad_norm": 0.032470703125, + "learning_rate": 0.00614710862398984, + "loss": 0.7802, + "num_input_tokens_seen": 56984360, + "step": 98140 + }, + { + "epoch": 14.617962466487935, + "grad_norm": 0.0673828125, + "learning_rate": 0.006145534817407065, + "loss": 0.8065, + "num_input_tokens_seen": 56987208, + "step": 98145 + }, + { + "epoch": 14.618707179028895, + "grad_norm": 0.02783203125, + "learning_rate": 0.006143961160411479, + "loss": 0.8142, + "num_input_tokens_seen": 56990056, + "step": 98150 + }, + { + "epoch": 14.619451891569854, + "grad_norm": 0.05078125, + "learning_rate": 0.006142387653029664, + "loss": 0.8036, + "num_input_tokens_seen": 56992968, + "step": 98155 + }, + { + "epoch": 14.620196604110813, + "grad_norm": 0.0400390625, + "learning_rate": 0.006140814295288211, + "loss": 0.7863, + "num_input_tokens_seen": 56996168, + "step": 98160 + }, + { + "epoch": 14.620941316651772, + "grad_norm": 0.04638671875, + "learning_rate": 0.006139241087213692, + "loss": 0.789, + "num_input_tokens_seen": 56998856, + "step": 98165 + }, + { + "epoch": 14.621686029192732, + "grad_norm": 0.04736328125, + "learning_rate": 0.006137668028832695, + "loss": 0.7955, + "num_input_tokens_seen": 57001608, + "step": 98170 + }, + { + "epoch": 14.622430741733691, + "grad_norm": 0.040283203125, + "learning_rate": 0.0061360951201717825, + "loss": 0.7936, + "num_input_tokens_seen": 57004552, + "step": 98175 + }, + { + "epoch": 14.62317545427465, + "grad_norm": 0.06591796875, + "learning_rate": 0.006134522361257541, + "loss": 0.7706, + "num_input_tokens_seen": 57007496, + "step": 98180 + }, + { + "epoch": 14.623920166815608, + "grad_norm": 0.036376953125, + "learning_rate": 0.006132949752116533, + "loss": 0.8562, + "num_input_tokens_seen": 57010376, + "step": 98185 + }, + { + "epoch": 14.624664879356569, + "grad_norm": 0.042236328125, + "learning_rate": 0.006131377292775321, + "loss": 0.7904, + "num_input_tokens_seen": 57013256, + "step": 98190 + }, + { + "epoch": 14.625409591897528, + "grad_norm": 0.03076171875, + "learning_rate": 0.006129804983260482, + "loss": 0.8089, + "num_input_tokens_seen": 57016104, + "step": 98195 + }, + { + "epoch": 14.626154304438487, + "grad_norm": 0.06689453125, + "learning_rate": 0.006128232823598566, + "loss": 0.7852, + "num_input_tokens_seen": 57018792, + "step": 98200 + }, + { + "epoch": 14.626899016979445, + "grad_norm": 0.057373046875, + "learning_rate": 0.006126660813816146, + "loss": 0.7926, + "num_input_tokens_seen": 57021864, + "step": 98205 + }, + { + "epoch": 14.627643729520406, + "grad_norm": 0.048583984375, + "learning_rate": 0.006125088953939773, + "loss": 0.8013, + "num_input_tokens_seen": 57024488, + "step": 98210 + }, + { + "epoch": 14.628388442061365, + "grad_norm": 0.0478515625, + "learning_rate": 0.006123517243996002, + "loss": 0.8195, + "num_input_tokens_seen": 57027272, + "step": 98215 + }, + { + "epoch": 14.629133154602323, + "grad_norm": 0.033447265625, + "learning_rate": 0.006121945684011387, + "loss": 0.8014, + "num_input_tokens_seen": 57030248, + "step": 98220 + }, + { + "epoch": 14.629877867143282, + "grad_norm": 0.054443359375, + "learning_rate": 0.0061203742740124694, + "loss": 0.7855, + "num_input_tokens_seen": 57033160, + "step": 98225 + }, + { + "epoch": 14.630622579684243, + "grad_norm": 0.03857421875, + "learning_rate": 0.006118803014025808, + "loss": 0.7983, + "num_input_tokens_seen": 57035880, + "step": 98230 + }, + { + "epoch": 14.631367292225201, + "grad_norm": 0.0517578125, + "learning_rate": 0.00611723190407794, + "loss": 0.7801, + "num_input_tokens_seen": 57038632, + "step": 98235 + }, + { + "epoch": 14.63211200476616, + "grad_norm": 0.05126953125, + "learning_rate": 0.006115660944195415, + "loss": 0.797, + "num_input_tokens_seen": 57041384, + "step": 98240 + }, + { + "epoch": 14.632856717307119, + "grad_norm": 0.03515625, + "learning_rate": 0.006114090134404764, + "loss": 0.8034, + "num_input_tokens_seen": 57044360, + "step": 98245 + }, + { + "epoch": 14.63360142984808, + "grad_norm": 0.0311279296875, + "learning_rate": 0.006112519474732534, + "loss": 0.809, + "num_input_tokens_seen": 57047304, + "step": 98250 + }, + { + "epoch": 14.634346142389038, + "grad_norm": 0.04833984375, + "learning_rate": 0.00611094896520525, + "loss": 0.8027, + "num_input_tokens_seen": 57049896, + "step": 98255 + }, + { + "epoch": 14.635090854929997, + "grad_norm": 0.04833984375, + "learning_rate": 0.006109378605849453, + "loss": 0.7916, + "num_input_tokens_seen": 57052648, + "step": 98260 + }, + { + "epoch": 14.635835567470956, + "grad_norm": 0.035888671875, + "learning_rate": 0.006107808396691669, + "loss": 0.797, + "num_input_tokens_seen": 57055432, + "step": 98265 + }, + { + "epoch": 14.636580280011916, + "grad_norm": 0.047607421875, + "learning_rate": 0.0061062383377584245, + "loss": 0.7972, + "num_input_tokens_seen": 57058216, + "step": 98270 + }, + { + "epoch": 14.637324992552875, + "grad_norm": 0.039306640625, + "learning_rate": 0.006104668429076243, + "loss": 0.7914, + "num_input_tokens_seen": 57061384, + "step": 98275 + }, + { + "epoch": 14.638069705093834, + "grad_norm": 0.04296875, + "learning_rate": 0.006103098670671644, + "loss": 0.7864, + "num_input_tokens_seen": 57064296, + "step": 98280 + }, + { + "epoch": 14.638814417634793, + "grad_norm": 0.04638671875, + "learning_rate": 0.006101529062571155, + "loss": 0.7867, + "num_input_tokens_seen": 57066920, + "step": 98285 + }, + { + "epoch": 14.639559130175751, + "grad_norm": 0.044189453125, + "learning_rate": 0.006099959604801283, + "loss": 0.7968, + "num_input_tokens_seen": 57069896, + "step": 98290 + }, + { + "epoch": 14.640303842716712, + "grad_norm": 0.03564453125, + "learning_rate": 0.0060983902973885526, + "loss": 0.7846, + "num_input_tokens_seen": 57072808, + "step": 98295 + }, + { + "epoch": 14.64104855525767, + "grad_norm": 0.06982421875, + "learning_rate": 0.006096821140359473, + "loss": 0.7891, + "num_input_tokens_seen": 57075944, + "step": 98300 + }, + { + "epoch": 14.64179326779863, + "grad_norm": 0.028564453125, + "learning_rate": 0.006095252133740544, + "loss": 0.8046, + "num_input_tokens_seen": 57078952, + "step": 98305 + }, + { + "epoch": 14.642537980339588, + "grad_norm": 0.034912109375, + "learning_rate": 0.006093683277558286, + "loss": 0.7962, + "num_input_tokens_seen": 57082216, + "step": 98310 + }, + { + "epoch": 14.643282692880549, + "grad_norm": 0.05810546875, + "learning_rate": 0.006092114571839192, + "loss": 0.7807, + "num_input_tokens_seen": 57084936, + "step": 98315 + }, + { + "epoch": 14.644027405421507, + "grad_norm": 0.05029296875, + "learning_rate": 0.006090546016609775, + "loss": 0.8021, + "num_input_tokens_seen": 57087848, + "step": 98320 + }, + { + "epoch": 14.644772117962466, + "grad_norm": 0.051513671875, + "learning_rate": 0.006088977611896521, + "loss": 0.8046, + "num_input_tokens_seen": 57090600, + "step": 98325 + }, + { + "epoch": 14.645516830503425, + "grad_norm": 0.03857421875, + "learning_rate": 0.006087409357725939, + "loss": 0.797, + "num_input_tokens_seen": 57093384, + "step": 98330 + }, + { + "epoch": 14.646261543044385, + "grad_norm": 0.047119140625, + "learning_rate": 0.006085841254124517, + "loss": 0.803, + "num_input_tokens_seen": 57096488, + "step": 98335 + }, + { + "epoch": 14.647006255585344, + "grad_norm": 0.04638671875, + "learning_rate": 0.006084273301118748, + "loss": 0.7967, + "num_input_tokens_seen": 57099432, + "step": 98340 + }, + { + "epoch": 14.647750968126303, + "grad_norm": 0.045166015625, + "learning_rate": 0.006082705498735122, + "loss": 0.7987, + "num_input_tokens_seen": 57102376, + "step": 98345 + }, + { + "epoch": 14.648495680667262, + "grad_norm": 0.02490234375, + "learning_rate": 0.0060811378470001145, + "loss": 0.7782, + "num_input_tokens_seen": 57105544, + "step": 98350 + }, + { + "epoch": 14.649240393208222, + "grad_norm": 0.04052734375, + "learning_rate": 0.0060795703459402255, + "loss": 0.7953, + "num_input_tokens_seen": 57108296, + "step": 98355 + }, + { + "epoch": 14.649985105749181, + "grad_norm": 0.025146484375, + "learning_rate": 0.006078002995581924, + "loss": 0.7936, + "num_input_tokens_seen": 57111400, + "step": 98360 + }, + { + "epoch": 14.65072981829014, + "grad_norm": 0.06103515625, + "learning_rate": 0.0060764357959516985, + "loss": 0.8053, + "num_input_tokens_seen": 57113992, + "step": 98365 + }, + { + "epoch": 14.651474530831099, + "grad_norm": 0.06201171875, + "learning_rate": 0.006074868747076015, + "loss": 0.809, + "num_input_tokens_seen": 57116456, + "step": 98370 + }, + { + "epoch": 14.652219243372059, + "grad_norm": 0.059814453125, + "learning_rate": 0.006073301848981359, + "loss": 0.8176, + "num_input_tokens_seen": 57119560, + "step": 98375 + }, + { + "epoch": 14.652963955913018, + "grad_norm": 0.05078125, + "learning_rate": 0.0060717351016941895, + "loss": 0.7971, + "num_input_tokens_seen": 57122760, + "step": 98380 + }, + { + "epoch": 14.653708668453977, + "grad_norm": 0.050048828125, + "learning_rate": 0.006070168505240988, + "loss": 0.7883, + "num_input_tokens_seen": 57125416, + "step": 98385 + }, + { + "epoch": 14.654453380994935, + "grad_norm": 0.027587890625, + "learning_rate": 0.006068602059648214, + "loss": 0.7922, + "num_input_tokens_seen": 57128392, + "step": 98390 + }, + { + "epoch": 14.655198093535896, + "grad_norm": 0.043212890625, + "learning_rate": 0.006067035764942324, + "loss": 0.7951, + "num_input_tokens_seen": 57131496, + "step": 98395 + }, + { + "epoch": 14.655942806076855, + "grad_norm": 0.044189453125, + "learning_rate": 0.006065469621149791, + "loss": 0.7959, + "num_input_tokens_seen": 57134312, + "step": 98400 + }, + { + "epoch": 14.656687518617813, + "grad_norm": 0.055908203125, + "learning_rate": 0.006063903628297069, + "loss": 0.7811, + "num_input_tokens_seen": 57137288, + "step": 98405 + }, + { + "epoch": 14.657432231158772, + "grad_norm": 0.07666015625, + "learning_rate": 0.006062337786410612, + "loss": 0.847, + "num_input_tokens_seen": 57140136, + "step": 98410 + }, + { + "epoch": 14.658176943699733, + "grad_norm": 0.048095703125, + "learning_rate": 0.0060607720955168694, + "loss": 0.8211, + "num_input_tokens_seen": 57142984, + "step": 98415 + }, + { + "epoch": 14.658921656240691, + "grad_norm": 0.032958984375, + "learning_rate": 0.006059206555642302, + "loss": 0.7828, + "num_input_tokens_seen": 57145896, + "step": 98420 + }, + { + "epoch": 14.65966636878165, + "grad_norm": 0.05029296875, + "learning_rate": 0.006057641166813353, + "loss": 0.7988, + "num_input_tokens_seen": 57149000, + "step": 98425 + }, + { + "epoch": 14.660411081322609, + "grad_norm": 0.032470703125, + "learning_rate": 0.006056075929056463, + "loss": 0.8014, + "num_input_tokens_seen": 57151784, + "step": 98430 + }, + { + "epoch": 14.66115579386357, + "grad_norm": 0.046630859375, + "learning_rate": 0.006054510842398085, + "loss": 0.7997, + "num_input_tokens_seen": 57154856, + "step": 98435 + }, + { + "epoch": 14.661900506404528, + "grad_norm": 0.034912109375, + "learning_rate": 0.006052945906864648, + "loss": 0.8013, + "num_input_tokens_seen": 57157928, + "step": 98440 + }, + { + "epoch": 14.662645218945487, + "grad_norm": 0.052001953125, + "learning_rate": 0.006051381122482603, + "loss": 0.7955, + "num_input_tokens_seen": 57160776, + "step": 98445 + }, + { + "epoch": 14.663389931486446, + "grad_norm": 0.052001953125, + "learning_rate": 0.006049816489278374, + "loss": 0.7927, + "num_input_tokens_seen": 57163720, + "step": 98450 + }, + { + "epoch": 14.664134644027406, + "grad_norm": 0.0556640625, + "learning_rate": 0.006048252007278403, + "loss": 0.8014, + "num_input_tokens_seen": 57166952, + "step": 98455 + }, + { + "epoch": 14.664879356568365, + "grad_norm": 0.032470703125, + "learning_rate": 0.006046687676509112, + "loss": 0.7976, + "num_input_tokens_seen": 57169704, + "step": 98460 + }, + { + "epoch": 14.665624069109324, + "grad_norm": 0.0458984375, + "learning_rate": 0.006045123496996939, + "loss": 0.7791, + "num_input_tokens_seen": 57172360, + "step": 98465 + }, + { + "epoch": 14.666368781650283, + "grad_norm": 0.038330078125, + "learning_rate": 0.0060435594687683015, + "loss": 0.8046, + "num_input_tokens_seen": 57175368, + "step": 98470 + }, + { + "epoch": 14.667113494191241, + "grad_norm": 0.04736328125, + "learning_rate": 0.0060419955918496245, + "loss": 0.8008, + "num_input_tokens_seen": 57178504, + "step": 98475 + }, + { + "epoch": 14.667858206732202, + "grad_norm": 0.061279296875, + "learning_rate": 0.006040431866267327, + "loss": 0.777, + "num_input_tokens_seen": 57181448, + "step": 98480 + }, + { + "epoch": 14.66860291927316, + "grad_norm": 0.05712890625, + "learning_rate": 0.006038868292047821, + "loss": 0.8091, + "num_input_tokens_seen": 57184264, + "step": 98485 + }, + { + "epoch": 14.66934763181412, + "grad_norm": 0.037353515625, + "learning_rate": 0.006037304869217536, + "loss": 0.7966, + "num_input_tokens_seen": 57187208, + "step": 98490 + }, + { + "epoch": 14.670092344355078, + "grad_norm": 0.0361328125, + "learning_rate": 0.006035741597802868, + "loss": 0.7797, + "num_input_tokens_seen": 57189896, + "step": 98495 + }, + { + "epoch": 14.670837056896039, + "grad_norm": 0.03759765625, + "learning_rate": 0.00603417847783024, + "loss": 0.808, + "num_input_tokens_seen": 57192936, + "step": 98500 + }, + { + "epoch": 14.671581769436997, + "grad_norm": 0.034423828125, + "learning_rate": 0.006032615509326056, + "loss": 0.7808, + "num_input_tokens_seen": 57195656, + "step": 98505 + }, + { + "epoch": 14.672326481977956, + "grad_norm": 0.036376953125, + "learning_rate": 0.006031052692316711, + "loss": 0.7942, + "num_input_tokens_seen": 57198696, + "step": 98510 + }, + { + "epoch": 14.673071194518915, + "grad_norm": 0.04296875, + "learning_rate": 0.006029490026828623, + "loss": 0.7992, + "num_input_tokens_seen": 57201576, + "step": 98515 + }, + { + "epoch": 14.673815907059875, + "grad_norm": 0.052978515625, + "learning_rate": 0.006027927512888177, + "loss": 0.7879, + "num_input_tokens_seen": 57204680, + "step": 98520 + }, + { + "epoch": 14.674560619600834, + "grad_norm": 0.043212890625, + "learning_rate": 0.006026365150521781, + "loss": 0.8093, + "num_input_tokens_seen": 57207656, + "step": 98525 + }, + { + "epoch": 14.675305332141793, + "grad_norm": 0.050537109375, + "learning_rate": 0.006024802939755823, + "loss": 0.8016, + "num_input_tokens_seen": 57210600, + "step": 98530 + }, + { + "epoch": 14.676050044682752, + "grad_norm": 0.04248046875, + "learning_rate": 0.006023240880616701, + "loss": 0.7913, + "num_input_tokens_seen": 57213448, + "step": 98535 + }, + { + "epoch": 14.676794757223712, + "grad_norm": 0.04931640625, + "learning_rate": 0.0060216789731308016, + "loss": 0.7878, + "num_input_tokens_seen": 57216360, + "step": 98540 + }, + { + "epoch": 14.677539469764671, + "grad_norm": 0.046142578125, + "learning_rate": 0.006020117217324509, + "loss": 0.8041, + "num_input_tokens_seen": 57219368, + "step": 98545 + }, + { + "epoch": 14.67828418230563, + "grad_norm": 0.0458984375, + "learning_rate": 0.006018555613224212, + "loss": 0.794, + "num_input_tokens_seen": 57222184, + "step": 98550 + }, + { + "epoch": 14.679028894846589, + "grad_norm": 0.06396484375, + "learning_rate": 0.006016994160856283, + "loss": 0.7825, + "num_input_tokens_seen": 57224936, + "step": 98555 + }, + { + "epoch": 14.679773607387549, + "grad_norm": 0.048583984375, + "learning_rate": 0.0060154328602471125, + "loss": 0.7873, + "num_input_tokens_seen": 57227784, + "step": 98560 + }, + { + "epoch": 14.680518319928508, + "grad_norm": 0.05322265625, + "learning_rate": 0.006013871711423067, + "loss": 0.777, + "num_input_tokens_seen": 57230856, + "step": 98565 + }, + { + "epoch": 14.681263032469467, + "grad_norm": 0.038818359375, + "learning_rate": 0.0060123107144105315, + "loss": 0.7998, + "num_input_tokens_seen": 57234024, + "step": 98570 + }, + { + "epoch": 14.682007745010425, + "grad_norm": 0.036865234375, + "learning_rate": 0.006010749869235866, + "loss": 0.8051, + "num_input_tokens_seen": 57238088, + "step": 98575 + }, + { + "epoch": 14.682752457551386, + "grad_norm": 0.04931640625, + "learning_rate": 0.006009189175925452, + "loss": 0.7946, + "num_input_tokens_seen": 57240680, + "step": 98580 + }, + { + "epoch": 14.683497170092345, + "grad_norm": 0.040283203125, + "learning_rate": 0.006007628634505649, + "loss": 0.7831, + "num_input_tokens_seen": 57243560, + "step": 98585 + }, + { + "epoch": 14.684241882633303, + "grad_norm": 0.0732421875, + "learning_rate": 0.006006068245002813, + "loss": 0.7956, + "num_input_tokens_seen": 57246312, + "step": 98590 + }, + { + "epoch": 14.684986595174262, + "grad_norm": 0.02587890625, + "learning_rate": 0.00600450800744332, + "loss": 0.7832, + "num_input_tokens_seen": 57249384, + "step": 98595 + }, + { + "epoch": 14.685731307715223, + "grad_norm": 0.03759765625, + "learning_rate": 0.006002947921853521, + "loss": 0.771, + "num_input_tokens_seen": 57252520, + "step": 98600 + }, + { + "epoch": 14.686476020256181, + "grad_norm": 0.06103515625, + "learning_rate": 0.006001387988259774, + "loss": 0.7945, + "num_input_tokens_seen": 57255464, + "step": 98605 + }, + { + "epoch": 14.68722073279714, + "grad_norm": 0.0286865234375, + "learning_rate": 0.005999828206688424, + "loss": 0.7927, + "num_input_tokens_seen": 57258184, + "step": 98610 + }, + { + "epoch": 14.687965445338099, + "grad_norm": 0.042724609375, + "learning_rate": 0.005998268577165836, + "loss": 0.7837, + "num_input_tokens_seen": 57260808, + "step": 98615 + }, + { + "epoch": 14.688710157879058, + "grad_norm": 0.05322265625, + "learning_rate": 0.005996709099718344, + "loss": 0.7907, + "num_input_tokens_seen": 57263656, + "step": 98620 + }, + { + "epoch": 14.689454870420018, + "grad_norm": 0.049072265625, + "learning_rate": 0.005995149774372308, + "loss": 0.8293, + "num_input_tokens_seen": 57266696, + "step": 98625 + }, + { + "epoch": 14.690199582960977, + "grad_norm": 0.0400390625, + "learning_rate": 0.005993590601154065, + "loss": 0.7965, + "num_input_tokens_seen": 57269384, + "step": 98630 + }, + { + "epoch": 14.690944295501936, + "grad_norm": 0.049072265625, + "learning_rate": 0.005992031580089949, + "loss": 0.8188, + "num_input_tokens_seen": 57272136, + "step": 98635 + }, + { + "epoch": 14.691689008042896, + "grad_norm": 0.033935546875, + "learning_rate": 0.00599047271120631, + "loss": 0.8057, + "num_input_tokens_seen": 57275048, + "step": 98640 + }, + { + "epoch": 14.692433720583855, + "grad_norm": 0.034912109375, + "learning_rate": 0.005988913994529471, + "loss": 0.8067, + "num_input_tokens_seen": 57277960, + "step": 98645 + }, + { + "epoch": 14.693178433124814, + "grad_norm": 0.039306640625, + "learning_rate": 0.005987355430085778, + "loss": 0.8042, + "num_input_tokens_seen": 57281064, + "step": 98650 + }, + { + "epoch": 14.693923145665773, + "grad_norm": 0.03564453125, + "learning_rate": 0.00598579701790155, + "loss": 0.8016, + "num_input_tokens_seen": 57283944, + "step": 98655 + }, + { + "epoch": 14.694667858206731, + "grad_norm": 0.04541015625, + "learning_rate": 0.005984238758003125, + "loss": 0.7899, + "num_input_tokens_seen": 57286728, + "step": 98660 + }, + { + "epoch": 14.695412570747692, + "grad_norm": 0.05419921875, + "learning_rate": 0.005982680650416823, + "loss": 0.7979, + "num_input_tokens_seen": 57289992, + "step": 98665 + }, + { + "epoch": 14.69615728328865, + "grad_norm": 0.048095703125, + "learning_rate": 0.005981122695168965, + "loss": 0.8105, + "num_input_tokens_seen": 57293064, + "step": 98670 + }, + { + "epoch": 14.69690199582961, + "grad_norm": 0.030517578125, + "learning_rate": 0.005979564892285874, + "loss": 0.8075, + "num_input_tokens_seen": 57295848, + "step": 98675 + }, + { + "epoch": 14.697646708370568, + "grad_norm": 0.055419921875, + "learning_rate": 0.005978007241793861, + "loss": 0.815, + "num_input_tokens_seen": 57298696, + "step": 98680 + }, + { + "epoch": 14.698391420911529, + "grad_norm": 0.045654296875, + "learning_rate": 0.005976449743719251, + "loss": 0.8, + "num_input_tokens_seen": 57301512, + "step": 98685 + }, + { + "epoch": 14.699136133452487, + "grad_norm": 0.054931640625, + "learning_rate": 0.005974892398088347, + "loss": 0.7995, + "num_input_tokens_seen": 57304232, + "step": 98690 + }, + { + "epoch": 14.699880845993446, + "grad_norm": 0.08251953125, + "learning_rate": 0.005973335204927468, + "loss": 0.8081, + "num_input_tokens_seen": 57307112, + "step": 98695 + }, + { + "epoch": 14.700625558534405, + "grad_norm": 0.049560546875, + "learning_rate": 0.00597177816426291, + "loss": 0.817, + "num_input_tokens_seen": 57309832, + "step": 98700 + }, + { + "epoch": 14.701370271075366, + "grad_norm": 0.052001953125, + "learning_rate": 0.005970221276120991, + "loss": 0.7992, + "num_input_tokens_seen": 57312584, + "step": 98705 + }, + { + "epoch": 14.702114983616324, + "grad_norm": 0.04736328125, + "learning_rate": 0.0059686645405280046, + "loss": 0.7921, + "num_input_tokens_seen": 57315592, + "step": 98710 + }, + { + "epoch": 14.702859696157283, + "grad_norm": 0.042724609375, + "learning_rate": 0.005967107957510248, + "loss": 0.7973, + "num_input_tokens_seen": 57318376, + "step": 98715 + }, + { + "epoch": 14.703604408698242, + "grad_norm": 0.025146484375, + "learning_rate": 0.005965551527094027, + "loss": 0.7895, + "num_input_tokens_seen": 57321576, + "step": 98720 + }, + { + "epoch": 14.704349121239202, + "grad_norm": 0.041015625, + "learning_rate": 0.0059639952493056245, + "loss": 0.7778, + "num_input_tokens_seen": 57324552, + "step": 98725 + }, + { + "epoch": 14.705093833780161, + "grad_norm": 0.0269775390625, + "learning_rate": 0.005962439124171345, + "loss": 0.7771, + "num_input_tokens_seen": 57327624, + "step": 98730 + }, + { + "epoch": 14.70583854632112, + "grad_norm": 0.03466796875, + "learning_rate": 0.0059608831517174695, + "loss": 0.8014, + "num_input_tokens_seen": 57330312, + "step": 98735 + }, + { + "epoch": 14.706583258862079, + "grad_norm": 0.049560546875, + "learning_rate": 0.005959327331970287, + "loss": 0.8039, + "num_input_tokens_seen": 57333192, + "step": 98740 + }, + { + "epoch": 14.70732797140304, + "grad_norm": 0.064453125, + "learning_rate": 0.00595777166495608, + "loss": 0.7933, + "num_input_tokens_seen": 57336328, + "step": 98745 + }, + { + "epoch": 14.708072683943998, + "grad_norm": 0.03466796875, + "learning_rate": 0.005956216150701126, + "loss": 0.8115, + "num_input_tokens_seen": 57339272, + "step": 98750 + }, + { + "epoch": 14.708817396484957, + "grad_norm": 0.042724609375, + "learning_rate": 0.005954660789231714, + "loss": 0.8014, + "num_input_tokens_seen": 57342120, + "step": 98755 + }, + { + "epoch": 14.709562109025915, + "grad_norm": 0.047119140625, + "learning_rate": 0.005953105580574109, + "loss": 0.8061, + "num_input_tokens_seen": 57345224, + "step": 98760 + }, + { + "epoch": 14.710306821566876, + "grad_norm": 0.03857421875, + "learning_rate": 0.005951550524754596, + "loss": 0.7972, + "num_input_tokens_seen": 57348040, + "step": 98765 + }, + { + "epoch": 14.711051534107835, + "grad_norm": 0.076171875, + "learning_rate": 0.005949995621799435, + "loss": 0.7719, + "num_input_tokens_seen": 57350888, + "step": 98770 + }, + { + "epoch": 14.711796246648793, + "grad_norm": 0.0302734375, + "learning_rate": 0.005948440871734905, + "loss": 0.8272, + "num_input_tokens_seen": 57353896, + "step": 98775 + }, + { + "epoch": 14.712540959189752, + "grad_norm": 0.039306640625, + "learning_rate": 0.005946886274587262, + "loss": 0.8097, + "num_input_tokens_seen": 57356680, + "step": 98780 + }, + { + "epoch": 14.713285671730713, + "grad_norm": 0.060791015625, + "learning_rate": 0.00594533183038278, + "loss": 0.8069, + "num_input_tokens_seen": 57359592, + "step": 98785 + }, + { + "epoch": 14.714030384271672, + "grad_norm": 0.03466796875, + "learning_rate": 0.005943777539147712, + "loss": 0.8124, + "num_input_tokens_seen": 57362280, + "step": 98790 + }, + { + "epoch": 14.71477509681263, + "grad_norm": 0.042236328125, + "learning_rate": 0.005942223400908314, + "loss": 0.7956, + "num_input_tokens_seen": 57365032, + "step": 98795 + }, + { + "epoch": 14.715519809353589, + "grad_norm": 0.060546875, + "learning_rate": 0.005940669415690851, + "loss": 0.8071, + "num_input_tokens_seen": 57367752, + "step": 98800 + }, + { + "epoch": 14.716264521894548, + "grad_norm": 0.1455078125, + "learning_rate": 0.0059391155835215705, + "loss": 0.8223, + "num_input_tokens_seen": 57370568, + "step": 98805 + }, + { + "epoch": 14.717009234435508, + "grad_norm": 0.04931640625, + "learning_rate": 0.005937561904426724, + "loss": 0.7896, + "num_input_tokens_seen": 57373320, + "step": 98810 + }, + { + "epoch": 14.717753946976467, + "grad_norm": 0.0380859375, + "learning_rate": 0.005936008378432553, + "loss": 0.797, + "num_input_tokens_seen": 57376264, + "step": 98815 + }, + { + "epoch": 14.718498659517426, + "grad_norm": 0.024658203125, + "learning_rate": 0.005934455005565313, + "loss": 0.797, + "num_input_tokens_seen": 57379272, + "step": 98820 + }, + { + "epoch": 14.719243372058386, + "grad_norm": 0.0419921875, + "learning_rate": 0.005932901785851242, + "loss": 0.7905, + "num_input_tokens_seen": 57381832, + "step": 98825 + }, + { + "epoch": 14.719988084599345, + "grad_norm": 0.052001953125, + "learning_rate": 0.005931348719316574, + "loss": 0.7881, + "num_input_tokens_seen": 57384936, + "step": 98830 + }, + { + "epoch": 14.720732797140304, + "grad_norm": 0.028564453125, + "learning_rate": 0.0059297958059875595, + "loss": 0.7749, + "num_input_tokens_seen": 57387496, + "step": 98835 + }, + { + "epoch": 14.721477509681263, + "grad_norm": 0.0216064453125, + "learning_rate": 0.00592824304589042, + "loss": 0.813, + "num_input_tokens_seen": 57390728, + "step": 98840 + }, + { + "epoch": 14.722222222222221, + "grad_norm": 0.033203125, + "learning_rate": 0.005926690439051399, + "loss": 0.7966, + "num_input_tokens_seen": 57393512, + "step": 98845 + }, + { + "epoch": 14.722966934763182, + "grad_norm": 0.045166015625, + "learning_rate": 0.005925137985496715, + "loss": 0.8027, + "num_input_tokens_seen": 57396584, + "step": 98850 + }, + { + "epoch": 14.72371164730414, + "grad_norm": 0.048583984375, + "learning_rate": 0.0059235856852526084, + "loss": 0.8015, + "num_input_tokens_seen": 57399464, + "step": 98855 + }, + { + "epoch": 14.7244563598451, + "grad_norm": 0.05615234375, + "learning_rate": 0.005922033538345291, + "loss": 0.8063, + "num_input_tokens_seen": 57402888, + "step": 98860 + }, + { + "epoch": 14.725201072386058, + "grad_norm": 0.0546875, + "learning_rate": 0.005920481544800997, + "loss": 0.7689, + "num_input_tokens_seen": 57405576, + "step": 98865 + }, + { + "epoch": 14.725945784927019, + "grad_norm": 0.034912109375, + "learning_rate": 0.005918929704645938, + "loss": 0.7854, + "num_input_tokens_seen": 57408424, + "step": 98870 + }, + { + "epoch": 14.726690497467978, + "grad_norm": 0.059814453125, + "learning_rate": 0.005917378017906334, + "loss": 0.8193, + "num_input_tokens_seen": 57411528, + "step": 98875 + }, + { + "epoch": 14.727435210008936, + "grad_norm": 0.06787109375, + "learning_rate": 0.005915826484608394, + "loss": 0.8096, + "num_input_tokens_seen": 57414344, + "step": 98880 + }, + { + "epoch": 14.728179922549895, + "grad_norm": 0.045166015625, + "learning_rate": 0.005914275104778329, + "loss": 0.8005, + "num_input_tokens_seen": 57417320, + "step": 98885 + }, + { + "epoch": 14.728924635090856, + "grad_norm": 0.037109375, + "learning_rate": 0.005912723878442358, + "loss": 0.7978, + "num_input_tokens_seen": 57419816, + "step": 98890 + }, + { + "epoch": 14.729669347631814, + "grad_norm": 0.0439453125, + "learning_rate": 0.005911172805626675, + "loss": 0.803, + "num_input_tokens_seen": 57422888, + "step": 98895 + }, + { + "epoch": 14.730414060172773, + "grad_norm": 0.0625, + "learning_rate": 0.0059096218863574955, + "loss": 0.7894, + "num_input_tokens_seen": 57425896, + "step": 98900 + }, + { + "epoch": 14.731158772713732, + "grad_norm": 0.0712890625, + "learning_rate": 0.005908071120661014, + "loss": 0.7894, + "num_input_tokens_seen": 57428776, + "step": 98905 + }, + { + "epoch": 14.731903485254692, + "grad_norm": 0.0263671875, + "learning_rate": 0.005906520508563426, + "loss": 0.7967, + "num_input_tokens_seen": 57431624, + "step": 98910 + }, + { + "epoch": 14.732648197795651, + "grad_norm": 0.03955078125, + "learning_rate": 0.005904970050090938, + "loss": 0.7982, + "num_input_tokens_seen": 57434440, + "step": 98915 + }, + { + "epoch": 14.73339291033661, + "grad_norm": 0.06494140625, + "learning_rate": 0.005903419745269731, + "loss": 0.7868, + "num_input_tokens_seen": 57437352, + "step": 98920 + }, + { + "epoch": 14.734137622877569, + "grad_norm": 0.06298828125, + "learning_rate": 0.005901869594126007, + "loss": 0.7924, + "num_input_tokens_seen": 57440296, + "step": 98925 + }, + { + "epoch": 14.73488233541853, + "grad_norm": 0.03564453125, + "learning_rate": 0.00590031959668595, + "loss": 0.7799, + "num_input_tokens_seen": 57442984, + "step": 98930 + }, + { + "epoch": 14.735627047959488, + "grad_norm": 0.05126953125, + "learning_rate": 0.005898769752975745, + "loss": 0.8001, + "num_input_tokens_seen": 57446088, + "step": 98935 + }, + { + "epoch": 14.736371760500447, + "grad_norm": 0.056396484375, + "learning_rate": 0.005897220063021569, + "loss": 0.8127, + "num_input_tokens_seen": 57449064, + "step": 98940 + }, + { + "epoch": 14.737116473041405, + "grad_norm": 0.037353515625, + "learning_rate": 0.005895670526849615, + "loss": 0.79, + "num_input_tokens_seen": 57451912, + "step": 98945 + }, + { + "epoch": 14.737861185582366, + "grad_norm": 0.06201171875, + "learning_rate": 0.005894121144486053, + "loss": 0.82, + "num_input_tokens_seen": 57455048, + "step": 98950 + }, + { + "epoch": 14.738605898123325, + "grad_norm": 0.09619140625, + "learning_rate": 0.005892571915957053, + "loss": 0.7983, + "num_input_tokens_seen": 57458248, + "step": 98955 + }, + { + "epoch": 14.739350610664284, + "grad_norm": 0.03466796875, + "learning_rate": 0.005891022841288801, + "loss": 0.7939, + "num_input_tokens_seen": 57461256, + "step": 98960 + }, + { + "epoch": 14.740095323205242, + "grad_norm": 0.045166015625, + "learning_rate": 0.005889473920507455, + "loss": 0.7936, + "num_input_tokens_seen": 57463784, + "step": 98965 + }, + { + "epoch": 14.740840035746203, + "grad_norm": 0.0576171875, + "learning_rate": 0.005887925153639192, + "loss": 0.8012, + "num_input_tokens_seen": 57466504, + "step": 98970 + }, + { + "epoch": 14.741584748287162, + "grad_norm": 0.04833984375, + "learning_rate": 0.005886376540710168, + "loss": 0.7932, + "num_input_tokens_seen": 57469288, + "step": 98975 + }, + { + "epoch": 14.74232946082812, + "grad_norm": 0.02197265625, + "learning_rate": 0.005884828081746555, + "loss": 0.7871, + "num_input_tokens_seen": 57472264, + "step": 98980 + }, + { + "epoch": 14.743074173369079, + "grad_norm": 0.050048828125, + "learning_rate": 0.0058832797767745006, + "loss": 0.7962, + "num_input_tokens_seen": 57475144, + "step": 98985 + }, + { + "epoch": 14.743818885910038, + "grad_norm": 0.044921875, + "learning_rate": 0.0058817316258201755, + "loss": 0.798, + "num_input_tokens_seen": 57478056, + "step": 98990 + }, + { + "epoch": 14.744563598450998, + "grad_norm": 0.03759765625, + "learning_rate": 0.005880183628909726, + "loss": 0.8023, + "num_input_tokens_seen": 57480904, + "step": 98995 + }, + { + "epoch": 14.745308310991957, + "grad_norm": 0.0244140625, + "learning_rate": 0.005878635786069305, + "loss": 0.8059, + "num_input_tokens_seen": 57483880, + "step": 99000 + }, + { + "epoch": 14.746053023532916, + "grad_norm": 0.044677734375, + "learning_rate": 0.005877088097325064, + "loss": 0.7944, + "num_input_tokens_seen": 57487048, + "step": 99005 + }, + { + "epoch": 14.746797736073875, + "grad_norm": 0.080078125, + "learning_rate": 0.0058755405627031404, + "loss": 0.8062, + "num_input_tokens_seen": 57489800, + "step": 99010 + }, + { + "epoch": 14.747542448614835, + "grad_norm": 0.044677734375, + "learning_rate": 0.005873993182229691, + "loss": 0.7757, + "num_input_tokens_seen": 57492744, + "step": 99015 + }, + { + "epoch": 14.748287161155794, + "grad_norm": 0.033203125, + "learning_rate": 0.005872445955930846, + "loss": 0.7841, + "num_input_tokens_seen": 57495464, + "step": 99020 + }, + { + "epoch": 14.749031873696753, + "grad_norm": 0.0291748046875, + "learning_rate": 0.005870898883832755, + "loss": 0.7916, + "num_input_tokens_seen": 57498600, + "step": 99025 + }, + { + "epoch": 14.749776586237711, + "grad_norm": 0.0576171875, + "learning_rate": 0.005869351965961549, + "loss": 0.8002, + "num_input_tokens_seen": 57501288, + "step": 99030 + }, + { + "epoch": 14.750521298778672, + "grad_norm": 0.0390625, + "learning_rate": 0.005867805202343356, + "loss": 0.8016, + "num_input_tokens_seen": 57504136, + "step": 99035 + }, + { + "epoch": 14.75126601131963, + "grad_norm": 0.052978515625, + "learning_rate": 0.005866258593004318, + "loss": 0.7871, + "num_input_tokens_seen": 57507016, + "step": 99040 + }, + { + "epoch": 14.75201072386059, + "grad_norm": 0.046875, + "learning_rate": 0.005864712137970554, + "loss": 0.79, + "num_input_tokens_seen": 57509992, + "step": 99045 + }, + { + "epoch": 14.752755436401548, + "grad_norm": 0.033935546875, + "learning_rate": 0.005863165837268198, + "loss": 0.8037, + "num_input_tokens_seen": 57513064, + "step": 99050 + }, + { + "epoch": 14.753500148942509, + "grad_norm": 0.031982421875, + "learning_rate": 0.005861619690923363, + "loss": 0.8077, + "num_input_tokens_seen": 57515816, + "step": 99055 + }, + { + "epoch": 14.754244861483468, + "grad_norm": 0.039306640625, + "learning_rate": 0.005860073698962182, + "loss": 0.7837, + "num_input_tokens_seen": 57518696, + "step": 99060 + }, + { + "epoch": 14.754989574024426, + "grad_norm": 0.033447265625, + "learning_rate": 0.005858527861410764, + "loss": 0.792, + "num_input_tokens_seen": 57521544, + "step": 99065 + }, + { + "epoch": 14.755734286565385, + "grad_norm": 0.054931640625, + "learning_rate": 0.005856982178295229, + "loss": 0.8096, + "num_input_tokens_seen": 57524264, + "step": 99070 + }, + { + "epoch": 14.756478999106346, + "grad_norm": 0.052734375, + "learning_rate": 0.005855436649641687, + "loss": 0.8008, + "num_input_tokens_seen": 57527272, + "step": 99075 + }, + { + "epoch": 14.757223711647304, + "grad_norm": 0.050537109375, + "learning_rate": 0.005853891275476244, + "loss": 0.7973, + "num_input_tokens_seen": 57529960, + "step": 99080 + }, + { + "epoch": 14.757968424188263, + "grad_norm": 0.036376953125, + "learning_rate": 0.005852346055825017, + "loss": 0.788, + "num_input_tokens_seen": 57532616, + "step": 99085 + }, + { + "epoch": 14.758713136729222, + "grad_norm": 0.0556640625, + "learning_rate": 0.005850800990714101, + "loss": 0.8155, + "num_input_tokens_seen": 57535368, + "step": 99090 + }, + { + "epoch": 14.759457849270182, + "grad_norm": 0.051025390625, + "learning_rate": 0.005849256080169608, + "loss": 0.7982, + "num_input_tokens_seen": 57538216, + "step": 99095 + }, + { + "epoch": 14.760202561811141, + "grad_norm": 0.04638671875, + "learning_rate": 0.005847711324217629, + "loss": 0.7838, + "num_input_tokens_seen": 57541192, + "step": 99100 + }, + { + "epoch": 14.7609472743521, + "grad_norm": 0.0400390625, + "learning_rate": 0.0058461667228842705, + "loss": 0.8112, + "num_input_tokens_seen": 57544040, + "step": 99105 + }, + { + "epoch": 14.761691986893059, + "grad_norm": 0.0400390625, + "learning_rate": 0.005844622276195621, + "loss": 0.7869, + "num_input_tokens_seen": 57546888, + "step": 99110 + }, + { + "epoch": 14.76243669943402, + "grad_norm": 0.02490234375, + "learning_rate": 0.005843077984177767, + "loss": 0.7978, + "num_input_tokens_seen": 57549640, + "step": 99115 + }, + { + "epoch": 14.763181411974978, + "grad_norm": 0.03564453125, + "learning_rate": 0.00584153384685681, + "loss": 0.794, + "num_input_tokens_seen": 57552552, + "step": 99120 + }, + { + "epoch": 14.763926124515937, + "grad_norm": 0.039306640625, + "learning_rate": 0.0058399898642588245, + "loss": 0.7952, + "num_input_tokens_seen": 57555272, + "step": 99125 + }, + { + "epoch": 14.764670837056896, + "grad_norm": 0.041748046875, + "learning_rate": 0.005838446036409906, + "loss": 0.778, + "num_input_tokens_seen": 57558152, + "step": 99130 + }, + { + "epoch": 14.765415549597854, + "grad_norm": 0.0322265625, + "learning_rate": 0.005836902363336131, + "loss": 0.792, + "num_input_tokens_seen": 57561224, + "step": 99135 + }, + { + "epoch": 14.766160262138815, + "grad_norm": 0.043212890625, + "learning_rate": 0.005835358845063577, + "loss": 0.7905, + "num_input_tokens_seen": 57564040, + "step": 99140 + }, + { + "epoch": 14.766904974679774, + "grad_norm": 0.057373046875, + "learning_rate": 0.005833815481618316, + "loss": 0.7931, + "num_input_tokens_seen": 57566792, + "step": 99145 + }, + { + "epoch": 14.767649687220732, + "grad_norm": 0.0390625, + "learning_rate": 0.005832272273026432, + "loss": 0.7713, + "num_input_tokens_seen": 57569704, + "step": 99150 + }, + { + "epoch": 14.768394399761693, + "grad_norm": 0.037353515625, + "learning_rate": 0.005830729219313989, + "loss": 0.8109, + "num_input_tokens_seen": 57572488, + "step": 99155 + }, + { + "epoch": 14.769139112302652, + "grad_norm": 0.033447265625, + "learning_rate": 0.0058291863205070515, + "loss": 0.7787, + "num_input_tokens_seen": 57575560, + "step": 99160 + }, + { + "epoch": 14.76988382484361, + "grad_norm": 0.0400390625, + "learning_rate": 0.005827643576631696, + "loss": 0.8174, + "num_input_tokens_seen": 57578472, + "step": 99165 + }, + { + "epoch": 14.77062853738457, + "grad_norm": 0.046142578125, + "learning_rate": 0.005826100987713976, + "loss": 0.806, + "num_input_tokens_seen": 57581224, + "step": 99170 + }, + { + "epoch": 14.771373249925528, + "grad_norm": 0.031982421875, + "learning_rate": 0.005824558553779959, + "loss": 0.7826, + "num_input_tokens_seen": 57584008, + "step": 99175 + }, + { + "epoch": 14.772117962466488, + "grad_norm": 0.0286865234375, + "learning_rate": 0.005823016274855696, + "loss": 0.8096, + "num_input_tokens_seen": 57586760, + "step": 99180 + }, + { + "epoch": 14.772862675007447, + "grad_norm": 0.04931640625, + "learning_rate": 0.005821474150967251, + "loss": 0.7928, + "num_input_tokens_seen": 57589736, + "step": 99185 + }, + { + "epoch": 14.773607387548406, + "grad_norm": 0.047119140625, + "learning_rate": 0.00581993218214067, + "loss": 0.7883, + "num_input_tokens_seen": 57592744, + "step": 99190 + }, + { + "epoch": 14.774352100089365, + "grad_norm": 0.0556640625, + "learning_rate": 0.005818390368402, + "loss": 0.7995, + "num_input_tokens_seen": 57595688, + "step": 99195 + }, + { + "epoch": 14.775096812630325, + "grad_norm": 0.0274658203125, + "learning_rate": 0.005816848709777299, + "loss": 0.8047, + "num_input_tokens_seen": 57598632, + "step": 99200 + }, + { + "epoch": 14.775841525171284, + "grad_norm": 0.034912109375, + "learning_rate": 0.005815307206292605, + "loss": 0.797, + "num_input_tokens_seen": 57601672, + "step": 99205 + }, + { + "epoch": 14.776586237712243, + "grad_norm": 0.046875, + "learning_rate": 0.005813765857973961, + "loss": 0.7759, + "num_input_tokens_seen": 57604520, + "step": 99210 + }, + { + "epoch": 14.777330950253202, + "grad_norm": 0.03759765625, + "learning_rate": 0.0058122246648474015, + "loss": 0.7917, + "num_input_tokens_seen": 57607432, + "step": 99215 + }, + { + "epoch": 14.778075662794162, + "grad_norm": 0.0439453125, + "learning_rate": 0.0058106836269389735, + "loss": 0.8079, + "num_input_tokens_seen": 57610472, + "step": 99220 + }, + { + "epoch": 14.77882037533512, + "grad_norm": 0.0380859375, + "learning_rate": 0.0058091427442747, + "loss": 0.781, + "num_input_tokens_seen": 57613064, + "step": 99225 + }, + { + "epoch": 14.77956508787608, + "grad_norm": 0.0478515625, + "learning_rate": 0.005807602016880625, + "loss": 0.7885, + "num_input_tokens_seen": 57615784, + "step": 99230 + }, + { + "epoch": 14.780309800417038, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00580606144478277, + "loss": 0.7861, + "num_input_tokens_seen": 57618504, + "step": 99235 + }, + { + "epoch": 14.781054512957999, + "grad_norm": 0.0556640625, + "learning_rate": 0.005804521028007159, + "loss": 0.8003, + "num_input_tokens_seen": 57621448, + "step": 99240 + }, + { + "epoch": 14.781799225498958, + "grad_norm": 0.06298828125, + "learning_rate": 0.005802980766579824, + "loss": 0.7946, + "num_input_tokens_seen": 57624488, + "step": 99245 + }, + { + "epoch": 14.782543938039916, + "grad_norm": 0.06689453125, + "learning_rate": 0.005801440660526775, + "loss": 0.7847, + "num_input_tokens_seen": 57627560, + "step": 99250 + }, + { + "epoch": 14.783288650580875, + "grad_norm": 0.033203125, + "learning_rate": 0.0057999007098740426, + "loss": 0.789, + "num_input_tokens_seen": 57630984, + "step": 99255 + }, + { + "epoch": 14.784033363121836, + "grad_norm": 0.0306396484375, + "learning_rate": 0.005798360914647639, + "loss": 0.8036, + "num_input_tokens_seen": 57633736, + "step": 99260 + }, + { + "epoch": 14.784778075662794, + "grad_norm": 0.032958984375, + "learning_rate": 0.005796821274873574, + "loss": 0.7988, + "num_input_tokens_seen": 57636424, + "step": 99265 + }, + { + "epoch": 14.785522788203753, + "grad_norm": 0.0286865234375, + "learning_rate": 0.005795281790577854, + "loss": 0.8265, + "num_input_tokens_seen": 57639272, + "step": 99270 + }, + { + "epoch": 14.786267500744712, + "grad_norm": 0.0390625, + "learning_rate": 0.0057937424617865, + "loss": 0.7845, + "num_input_tokens_seen": 57642280, + "step": 99275 + }, + { + "epoch": 14.787012213285673, + "grad_norm": 0.06396484375, + "learning_rate": 0.005792203288525508, + "loss": 0.7623, + "num_input_tokens_seen": 57645352, + "step": 99280 + }, + { + "epoch": 14.787756925826631, + "grad_norm": 0.050048828125, + "learning_rate": 0.005790664270820879, + "loss": 0.7923, + "num_input_tokens_seen": 57648296, + "step": 99285 + }, + { + "epoch": 14.78850163836759, + "grad_norm": 0.047119140625, + "learning_rate": 0.005789125408698624, + "loss": 0.8038, + "num_input_tokens_seen": 57651080, + "step": 99290 + }, + { + "epoch": 14.789246350908549, + "grad_norm": 0.044677734375, + "learning_rate": 0.005787586702184727, + "loss": 0.7945, + "num_input_tokens_seen": 57654312, + "step": 99295 + }, + { + "epoch": 14.78999106344951, + "grad_norm": 0.0712890625, + "learning_rate": 0.005786048151305195, + "loss": 0.7798, + "num_input_tokens_seen": 57657064, + "step": 99300 + }, + { + "epoch": 14.790735775990468, + "grad_norm": 0.05029296875, + "learning_rate": 0.00578450975608601, + "loss": 0.795, + "num_input_tokens_seen": 57660136, + "step": 99305 + }, + { + "epoch": 14.791480488531427, + "grad_norm": 0.02490234375, + "learning_rate": 0.00578297151655317, + "loss": 0.8077, + "num_input_tokens_seen": 57662920, + "step": 99310 + }, + { + "epoch": 14.792225201072386, + "grad_norm": 0.06494140625, + "learning_rate": 0.0057814334327326595, + "loss": 0.7893, + "num_input_tokens_seen": 57665992, + "step": 99315 + }, + { + "epoch": 14.792969913613344, + "grad_norm": 0.05712890625, + "learning_rate": 0.005779895504650458, + "loss": 0.8202, + "num_input_tokens_seen": 57669064, + "step": 99320 + }, + { + "epoch": 14.793714626154305, + "grad_norm": 0.05908203125, + "learning_rate": 0.005778357732332554, + "loss": 0.7958, + "num_input_tokens_seen": 57671880, + "step": 99325 + }, + { + "epoch": 14.794459338695264, + "grad_norm": 0.0400390625, + "learning_rate": 0.005776820115804925, + "loss": 0.7951, + "num_input_tokens_seen": 57674600, + "step": 99330 + }, + { + "epoch": 14.795204051236222, + "grad_norm": 0.057861328125, + "learning_rate": 0.005775282655093546, + "loss": 0.785, + "num_input_tokens_seen": 57677512, + "step": 99335 + }, + { + "epoch": 14.795948763777183, + "grad_norm": 0.048828125, + "learning_rate": 0.005773745350224385, + "loss": 0.7967, + "num_input_tokens_seen": 57680264, + "step": 99340 + }, + { + "epoch": 14.796693476318142, + "grad_norm": 0.059814453125, + "learning_rate": 0.005772208201223425, + "loss": 0.8047, + "num_input_tokens_seen": 57683240, + "step": 99345 + }, + { + "epoch": 14.7974381888591, + "grad_norm": 0.02490234375, + "learning_rate": 0.00577067120811663, + "loss": 0.7988, + "num_input_tokens_seen": 57685832, + "step": 99350 + }, + { + "epoch": 14.79818290140006, + "grad_norm": 0.042724609375, + "learning_rate": 0.0057691343709299564, + "loss": 0.8092, + "num_input_tokens_seen": 57688712, + "step": 99355 + }, + { + "epoch": 14.798927613941018, + "grad_norm": 0.05859375, + "learning_rate": 0.005767597689689384, + "loss": 0.8072, + "num_input_tokens_seen": 57691656, + "step": 99360 + }, + { + "epoch": 14.799672326481979, + "grad_norm": 0.049560546875, + "learning_rate": 0.005766061164420857, + "loss": 0.7673, + "num_input_tokens_seen": 57694440, + "step": 99365 + }, + { + "epoch": 14.800417039022937, + "grad_norm": 0.04443359375, + "learning_rate": 0.005764524795150349, + "loss": 0.7927, + "num_input_tokens_seen": 57697448, + "step": 99370 + }, + { + "epoch": 14.801161751563896, + "grad_norm": 0.0810546875, + "learning_rate": 0.005762988581903803, + "loss": 0.7791, + "num_input_tokens_seen": 57700168, + "step": 99375 + }, + { + "epoch": 14.801906464104855, + "grad_norm": 0.03369140625, + "learning_rate": 0.005761452524707181, + "loss": 0.7777, + "num_input_tokens_seen": 57702984, + "step": 99380 + }, + { + "epoch": 14.802651176645815, + "grad_norm": 0.03662109375, + "learning_rate": 0.005759916623586425, + "loss": 0.792, + "num_input_tokens_seen": 57705960, + "step": 99385 + }, + { + "epoch": 14.803395889186774, + "grad_norm": 0.05908203125, + "learning_rate": 0.00575838087856749, + "loss": 0.7927, + "num_input_tokens_seen": 57708936, + "step": 99390 + }, + { + "epoch": 14.804140601727733, + "grad_norm": 0.046875, + "learning_rate": 0.0057568452896763185, + "loss": 0.7969, + "num_input_tokens_seen": 57711816, + "step": 99395 + }, + { + "epoch": 14.804885314268692, + "grad_norm": 0.0498046875, + "learning_rate": 0.005755309856938851, + "loss": 0.7912, + "num_input_tokens_seen": 57714600, + "step": 99400 + }, + { + "epoch": 14.805630026809652, + "grad_norm": 0.0556640625, + "learning_rate": 0.005753774580381028, + "loss": 0.7718, + "num_input_tokens_seen": 57717704, + "step": 99405 + }, + { + "epoch": 14.80637473935061, + "grad_norm": 0.053466796875, + "learning_rate": 0.00575223946002878, + "loss": 0.7964, + "num_input_tokens_seen": 57720232, + "step": 99410 + }, + { + "epoch": 14.80711945189157, + "grad_norm": 0.05859375, + "learning_rate": 0.005750704495908053, + "loss": 0.7782, + "num_input_tokens_seen": 57723432, + "step": 99415 + }, + { + "epoch": 14.807864164432528, + "grad_norm": 0.038818359375, + "learning_rate": 0.005749169688044768, + "loss": 0.803, + "num_input_tokens_seen": 57726120, + "step": 99420 + }, + { + "epoch": 14.808608876973489, + "grad_norm": 0.07763671875, + "learning_rate": 0.005747635036464865, + "loss": 0.8174, + "num_input_tokens_seen": 57728872, + "step": 99425 + }, + { + "epoch": 14.809353589514448, + "grad_norm": 0.062255859375, + "learning_rate": 0.0057461005411942635, + "loss": 0.8113, + "num_input_tokens_seen": 57732072, + "step": 99430 + }, + { + "epoch": 14.810098302055406, + "grad_norm": 0.053466796875, + "learning_rate": 0.005744566202258883, + "loss": 0.7792, + "num_input_tokens_seen": 57734888, + "step": 99435 + }, + { + "epoch": 14.810843014596365, + "grad_norm": 0.0517578125, + "learning_rate": 0.0057430320196846536, + "loss": 0.7875, + "num_input_tokens_seen": 57737896, + "step": 99440 + }, + { + "epoch": 14.811587727137326, + "grad_norm": 0.04541015625, + "learning_rate": 0.0057414979934974865, + "loss": 0.7769, + "num_input_tokens_seen": 57741224, + "step": 99445 + }, + { + "epoch": 14.812332439678285, + "grad_norm": 0.06884765625, + "learning_rate": 0.005739964123723306, + "loss": 0.7766, + "num_input_tokens_seen": 57744008, + "step": 99450 + }, + { + "epoch": 14.813077152219243, + "grad_norm": 0.047607421875, + "learning_rate": 0.0057384304103880145, + "loss": 0.8034, + "num_input_tokens_seen": 57746888, + "step": 99455 + }, + { + "epoch": 14.813821864760202, + "grad_norm": 0.048095703125, + "learning_rate": 0.005736896853517532, + "loss": 0.8035, + "num_input_tokens_seen": 57749768, + "step": 99460 + }, + { + "epoch": 14.814566577301163, + "grad_norm": 0.05419921875, + "learning_rate": 0.005735363453137764, + "loss": 0.7939, + "num_input_tokens_seen": 57752872, + "step": 99465 + }, + { + "epoch": 14.815311289842121, + "grad_norm": 0.056396484375, + "learning_rate": 0.005733830209274615, + "loss": 0.775, + "num_input_tokens_seen": 57755560, + "step": 99470 + }, + { + "epoch": 14.81605600238308, + "grad_norm": 0.0289306640625, + "learning_rate": 0.005732297121953985, + "loss": 0.8059, + "num_input_tokens_seen": 57758472, + "step": 99475 + }, + { + "epoch": 14.816800714924039, + "grad_norm": 0.0498046875, + "learning_rate": 0.005730764191201772, + "loss": 0.7701, + "num_input_tokens_seen": 57761192, + "step": 99480 + }, + { + "epoch": 14.817545427465, + "grad_norm": 0.04345703125, + "learning_rate": 0.0057292314170438825, + "loss": 0.7949, + "num_input_tokens_seen": 57764136, + "step": 99485 + }, + { + "epoch": 14.818290140005958, + "grad_norm": 0.08251953125, + "learning_rate": 0.005727698799506201, + "loss": 0.7808, + "num_input_tokens_seen": 57767112, + "step": 99490 + }, + { + "epoch": 14.819034852546917, + "grad_norm": 0.04638671875, + "learning_rate": 0.005726166338614627, + "loss": 0.8012, + "num_input_tokens_seen": 57770248, + "step": 99495 + }, + { + "epoch": 14.819779565087876, + "grad_norm": 0.03662109375, + "learning_rate": 0.005724634034395043, + "loss": 0.7812, + "num_input_tokens_seen": 57773128, + "step": 99500 + }, + { + "epoch": 14.820524277628834, + "grad_norm": 0.031982421875, + "learning_rate": 0.005723101886873347, + "loss": 0.8153, + "num_input_tokens_seen": 57776328, + "step": 99505 + }, + { + "epoch": 14.821268990169795, + "grad_norm": 0.04052734375, + "learning_rate": 0.005721569896075415, + "loss": 0.7752, + "num_input_tokens_seen": 57779016, + "step": 99510 + }, + { + "epoch": 14.822013702710754, + "grad_norm": 0.0458984375, + "learning_rate": 0.005720038062027124, + "loss": 0.7856, + "num_input_tokens_seen": 57781832, + "step": 99515 + }, + { + "epoch": 14.822758415251712, + "grad_norm": 0.02685546875, + "learning_rate": 0.005718506384754363, + "loss": 0.7759, + "num_input_tokens_seen": 57784616, + "step": 99520 + }, + { + "epoch": 14.823503127792671, + "grad_norm": 0.052734375, + "learning_rate": 0.005716974864282998, + "loss": 0.7777, + "num_input_tokens_seen": 57787624, + "step": 99525 + }, + { + "epoch": 14.824247840333632, + "grad_norm": 0.055419921875, + "learning_rate": 0.0057154435006389155, + "loss": 0.7733, + "num_input_tokens_seen": 57790440, + "step": 99530 + }, + { + "epoch": 14.82499255287459, + "grad_norm": 0.052978515625, + "learning_rate": 0.005713912293847975, + "loss": 0.7926, + "num_input_tokens_seen": 57793192, + "step": 99535 + }, + { + "epoch": 14.82573726541555, + "grad_norm": 0.042724609375, + "learning_rate": 0.005712381243936051, + "loss": 0.7894, + "num_input_tokens_seen": 57795976, + "step": 99540 + }, + { + "epoch": 14.826481977956508, + "grad_norm": 0.0576171875, + "learning_rate": 0.005710850350929, + "loss": 0.8, + "num_input_tokens_seen": 57798632, + "step": 99545 + }, + { + "epoch": 14.827226690497469, + "grad_norm": 0.05908203125, + "learning_rate": 0.0057093196148526966, + "loss": 0.8424, + "num_input_tokens_seen": 57801352, + "step": 99550 + }, + { + "epoch": 14.827971403038427, + "grad_norm": 0.05615234375, + "learning_rate": 0.005707789035732997, + "loss": 0.786, + "num_input_tokens_seen": 57804296, + "step": 99555 + }, + { + "epoch": 14.828716115579386, + "grad_norm": 0.053955078125, + "learning_rate": 0.005706258613595751, + "loss": 0.8149, + "num_input_tokens_seen": 57807432, + "step": 99560 + }, + { + "epoch": 14.829460828120345, + "grad_norm": 0.06689453125, + "learning_rate": 0.005704728348466825, + "loss": 0.7733, + "num_input_tokens_seen": 57810024, + "step": 99565 + }, + { + "epoch": 14.830205540661305, + "grad_norm": 0.03564453125, + "learning_rate": 0.00570319824037206, + "loss": 0.8072, + "num_input_tokens_seen": 57812808, + "step": 99570 + }, + { + "epoch": 14.830950253202264, + "grad_norm": 0.033935546875, + "learning_rate": 0.005701668289337319, + "loss": 0.8117, + "num_input_tokens_seen": 57815688, + "step": 99575 + }, + { + "epoch": 14.831694965743223, + "grad_norm": 0.0400390625, + "learning_rate": 0.0057001384953884356, + "loss": 0.8088, + "num_input_tokens_seen": 57818664, + "step": 99580 + }, + { + "epoch": 14.832439678284182, + "grad_norm": 0.060791015625, + "learning_rate": 0.0056986088585512665, + "loss": 0.8057, + "num_input_tokens_seen": 57821608, + "step": 99585 + }, + { + "epoch": 14.833184390825142, + "grad_norm": 0.048583984375, + "learning_rate": 0.005697079378851647, + "loss": 0.7985, + "num_input_tokens_seen": 57824264, + "step": 99590 + }, + { + "epoch": 14.833929103366101, + "grad_norm": 0.04541015625, + "learning_rate": 0.0056955500563154165, + "loss": 0.8016, + "num_input_tokens_seen": 57826952, + "step": 99595 + }, + { + "epoch": 14.83467381590706, + "grad_norm": 0.0419921875, + "learning_rate": 0.005694020890968406, + "loss": 0.7827, + "num_input_tokens_seen": 57829864, + "step": 99600 + }, + { + "epoch": 14.835418528448018, + "grad_norm": 0.0478515625, + "learning_rate": 0.005692491882836458, + "loss": 0.7768, + "num_input_tokens_seen": 57832456, + "step": 99605 + }, + { + "epoch": 14.836163240988979, + "grad_norm": 0.048828125, + "learning_rate": 0.0056909630319454015, + "loss": 0.7977, + "num_input_tokens_seen": 57835624, + "step": 99610 + }, + { + "epoch": 14.836907953529938, + "grad_norm": 0.0380859375, + "learning_rate": 0.005689434338321057, + "loss": 0.7913, + "num_input_tokens_seen": 57838504, + "step": 99615 + }, + { + "epoch": 14.837652666070897, + "grad_norm": 0.03857421875, + "learning_rate": 0.005687905801989263, + "loss": 0.7987, + "num_input_tokens_seen": 57841480, + "step": 99620 + }, + { + "epoch": 14.838397378611855, + "grad_norm": 0.05517578125, + "learning_rate": 0.005686377422975831, + "loss": 0.7897, + "num_input_tokens_seen": 57844328, + "step": 99625 + }, + { + "epoch": 14.839142091152816, + "grad_norm": 0.043212890625, + "learning_rate": 0.005684849201306589, + "loss": 0.8049, + "num_input_tokens_seen": 57847240, + "step": 99630 + }, + { + "epoch": 14.839886803693775, + "grad_norm": 0.0615234375, + "learning_rate": 0.005683321137007355, + "loss": 0.7917, + "num_input_tokens_seen": 57850120, + "step": 99635 + }, + { + "epoch": 14.840631516234733, + "grad_norm": 0.036865234375, + "learning_rate": 0.005681793230103935, + "loss": 0.809, + "num_input_tokens_seen": 57853160, + "step": 99640 + }, + { + "epoch": 14.841376228775692, + "grad_norm": 0.035888671875, + "learning_rate": 0.005680265480622153, + "loss": 0.801, + "num_input_tokens_seen": 57856040, + "step": 99645 + }, + { + "epoch": 14.842120941316653, + "grad_norm": 0.05078125, + "learning_rate": 0.005678737888587809, + "loss": 0.7823, + "num_input_tokens_seen": 57859016, + "step": 99650 + }, + { + "epoch": 14.842865653857611, + "grad_norm": 0.0712890625, + "learning_rate": 0.005677210454026719, + "loss": 0.7741, + "num_input_tokens_seen": 57862024, + "step": 99655 + }, + { + "epoch": 14.84361036639857, + "grad_norm": 0.052978515625, + "learning_rate": 0.005675683176964684, + "loss": 0.7886, + "num_input_tokens_seen": 57864968, + "step": 99660 + }, + { + "epoch": 14.844355078939529, + "grad_norm": 0.095703125, + "learning_rate": 0.005674156057427505, + "loss": 0.8143, + "num_input_tokens_seen": 57867656, + "step": 99665 + }, + { + "epoch": 14.84509979148049, + "grad_norm": 0.027587890625, + "learning_rate": 0.005672629095440975, + "loss": 0.8171, + "num_input_tokens_seen": 57870568, + "step": 99670 + }, + { + "epoch": 14.845844504021448, + "grad_norm": 0.05126953125, + "learning_rate": 0.005671102291030902, + "loss": 0.8263, + "num_input_tokens_seen": 57873288, + "step": 99675 + }, + { + "epoch": 14.846589216562407, + "grad_norm": 0.0703125, + "learning_rate": 0.0056695756442230755, + "loss": 0.7668, + "num_input_tokens_seen": 57876328, + "step": 99680 + }, + { + "epoch": 14.847333929103366, + "grad_norm": 0.03564453125, + "learning_rate": 0.005668049155043279, + "loss": 0.8378, + "num_input_tokens_seen": 57879208, + "step": 99685 + }, + { + "epoch": 14.848078641644324, + "grad_norm": 0.04931640625, + "learning_rate": 0.005666522823517313, + "loss": 0.7939, + "num_input_tokens_seen": 57882184, + "step": 99690 + }, + { + "epoch": 14.848823354185285, + "grad_norm": 0.0361328125, + "learning_rate": 0.005664996649670951, + "loss": 0.7918, + "num_input_tokens_seen": 57884808, + "step": 99695 + }, + { + "epoch": 14.849568066726244, + "grad_norm": 0.053955078125, + "learning_rate": 0.00566347063352999, + "loss": 0.8084, + "num_input_tokens_seen": 57888072, + "step": 99700 + }, + { + "epoch": 14.850312779267203, + "grad_norm": 0.034423828125, + "learning_rate": 0.005661944775120198, + "loss": 0.829, + "num_input_tokens_seen": 57890856, + "step": 99705 + }, + { + "epoch": 14.851057491808161, + "grad_norm": 0.046630859375, + "learning_rate": 0.005660419074467364, + "loss": 0.7895, + "num_input_tokens_seen": 57894088, + "step": 99710 + }, + { + "epoch": 14.851802204349122, + "grad_norm": 0.08740234375, + "learning_rate": 0.005658893531597256, + "loss": 0.8454, + "num_input_tokens_seen": 57896776, + "step": 99715 + }, + { + "epoch": 14.85254691689008, + "grad_norm": 0.03515625, + "learning_rate": 0.005657368146535642, + "loss": 0.7994, + "num_input_tokens_seen": 57899976, + "step": 99720 + }, + { + "epoch": 14.85329162943104, + "grad_norm": 0.056640625, + "learning_rate": 0.005655842919308306, + "loss": 0.808, + "num_input_tokens_seen": 57902728, + "step": 99725 + }, + { + "epoch": 14.854036341971998, + "grad_norm": 0.055908203125, + "learning_rate": 0.0056543178499410054, + "loss": 0.79, + "num_input_tokens_seen": 57905800, + "step": 99730 + }, + { + "epoch": 14.854781054512959, + "grad_norm": 0.052978515625, + "learning_rate": 0.005652792938459505, + "loss": 0.7813, + "num_input_tokens_seen": 57908808, + "step": 99735 + }, + { + "epoch": 14.855525767053917, + "grad_norm": 0.041015625, + "learning_rate": 0.005651268184889565, + "loss": 0.8117, + "num_input_tokens_seen": 57911784, + "step": 99740 + }, + { + "epoch": 14.856270479594876, + "grad_norm": 0.039306640625, + "learning_rate": 0.005649743589256951, + "loss": 0.8044, + "num_input_tokens_seen": 57914632, + "step": 99745 + }, + { + "epoch": 14.857015192135835, + "grad_norm": 0.036865234375, + "learning_rate": 0.00564821915158741, + "loss": 0.7957, + "num_input_tokens_seen": 57917448, + "step": 99750 + }, + { + "epoch": 14.857759904676795, + "grad_norm": 0.025390625, + "learning_rate": 0.005646694871906707, + "loss": 0.8091, + "num_input_tokens_seen": 57920104, + "step": 99755 + }, + { + "epoch": 14.858504617217754, + "grad_norm": 0.0380859375, + "learning_rate": 0.005645170750240589, + "loss": 0.8182, + "num_input_tokens_seen": 57922728, + "step": 99760 + }, + { + "epoch": 14.859249329758713, + "grad_norm": 0.057373046875, + "learning_rate": 0.005643646786614797, + "loss": 0.8034, + "num_input_tokens_seen": 57925896, + "step": 99765 + }, + { + "epoch": 14.859994042299672, + "grad_norm": 0.05029296875, + "learning_rate": 0.005642122981055091, + "loss": 0.81, + "num_input_tokens_seen": 57929096, + "step": 99770 + }, + { + "epoch": 14.860738754840632, + "grad_norm": 0.07080078125, + "learning_rate": 0.005640599333587199, + "loss": 0.8061, + "num_input_tokens_seen": 57932136, + "step": 99775 + }, + { + "epoch": 14.861483467381591, + "grad_norm": 0.04248046875, + "learning_rate": 0.005639075844236874, + "loss": 0.7886, + "num_input_tokens_seen": 57935400, + "step": 99780 + }, + { + "epoch": 14.86222817992255, + "grad_norm": 0.0260009765625, + "learning_rate": 0.005637552513029843, + "loss": 0.8186, + "num_input_tokens_seen": 57938152, + "step": 99785 + }, + { + "epoch": 14.862972892463509, + "grad_norm": 0.041748046875, + "learning_rate": 0.005636029339991853, + "loss": 0.7964, + "num_input_tokens_seen": 57940840, + "step": 99790 + }, + { + "epoch": 14.863717605004469, + "grad_norm": 0.0693359375, + "learning_rate": 0.00563450632514863, + "loss": 0.815, + "num_input_tokens_seen": 57943880, + "step": 99795 + }, + { + "epoch": 14.864462317545428, + "grad_norm": 0.04052734375, + "learning_rate": 0.005632983468525904, + "loss": 0.8096, + "num_input_tokens_seen": 57946792, + "step": 99800 + }, + { + "epoch": 14.865207030086387, + "grad_norm": 0.047119140625, + "learning_rate": 0.005631460770149401, + "loss": 0.8133, + "num_input_tokens_seen": 57949672, + "step": 99805 + }, + { + "epoch": 14.865951742627345, + "grad_norm": 0.03515625, + "learning_rate": 0.0056299382300448435, + "loss": 0.7986, + "num_input_tokens_seen": 57952648, + "step": 99810 + }, + { + "epoch": 14.866696455168306, + "grad_norm": 0.0361328125, + "learning_rate": 0.005628415848237959, + "loss": 0.7829, + "num_input_tokens_seen": 57955176, + "step": 99815 + }, + { + "epoch": 14.867441167709265, + "grad_norm": 0.040771484375, + "learning_rate": 0.005626893624754461, + "loss": 0.7986, + "num_input_tokens_seen": 57958280, + "step": 99820 + }, + { + "epoch": 14.868185880250223, + "grad_norm": 0.038330078125, + "learning_rate": 0.005625371559620073, + "loss": 0.7988, + "num_input_tokens_seen": 57961224, + "step": 99825 + }, + { + "epoch": 14.868930592791182, + "grad_norm": 0.033203125, + "learning_rate": 0.0056238496528604985, + "loss": 0.7953, + "num_input_tokens_seen": 57964232, + "step": 99830 + }, + { + "epoch": 14.86967530533214, + "grad_norm": 0.07177734375, + "learning_rate": 0.00562232790450146, + "loss": 0.7908, + "num_input_tokens_seen": 57967176, + "step": 99835 + }, + { + "epoch": 14.870420017873101, + "grad_norm": 0.041259765625, + "learning_rate": 0.005620806314568662, + "loss": 0.8005, + "num_input_tokens_seen": 57970024, + "step": 99840 + }, + { + "epoch": 14.87116473041406, + "grad_norm": 0.036865234375, + "learning_rate": 0.005619284883087802, + "loss": 0.8012, + "num_input_tokens_seen": 57973000, + "step": 99845 + }, + { + "epoch": 14.871909442955019, + "grad_norm": 0.031005859375, + "learning_rate": 0.005617763610084595, + "loss": 0.8194, + "num_input_tokens_seen": 57976296, + "step": 99850 + }, + { + "epoch": 14.87265415549598, + "grad_norm": 0.040283203125, + "learning_rate": 0.005616242495584732, + "loss": 0.8089, + "num_input_tokens_seen": 57979080, + "step": 99855 + }, + { + "epoch": 14.873398868036938, + "grad_norm": 0.0712890625, + "learning_rate": 0.00561472153961392, + "loss": 0.8131, + "num_input_tokens_seen": 57981800, + "step": 99860 + }, + { + "epoch": 14.874143580577897, + "grad_norm": 0.041259765625, + "learning_rate": 0.005613200742197847, + "loss": 0.7774, + "num_input_tokens_seen": 57984424, + "step": 99865 + }, + { + "epoch": 14.874888293118856, + "grad_norm": 0.034912109375, + "learning_rate": 0.00561168010336221, + "loss": 0.7857, + "num_input_tokens_seen": 57987304, + "step": 99870 + }, + { + "epoch": 14.875633005659815, + "grad_norm": 0.03662109375, + "learning_rate": 0.005610159623132693, + "loss": 0.7948, + "num_input_tokens_seen": 57990088, + "step": 99875 + }, + { + "epoch": 14.876377718200775, + "grad_norm": 0.047119140625, + "learning_rate": 0.005608639301534982, + "loss": 0.8058, + "num_input_tokens_seen": 57992904, + "step": 99880 + }, + { + "epoch": 14.877122430741734, + "grad_norm": 0.0693359375, + "learning_rate": 0.00560711913859477, + "loss": 0.8146, + "num_input_tokens_seen": 57995688, + "step": 99885 + }, + { + "epoch": 14.877867143282693, + "grad_norm": 0.040771484375, + "learning_rate": 0.0056055991343377295, + "loss": 0.7995, + "num_input_tokens_seen": 57998632, + "step": 99890 + }, + { + "epoch": 14.878611855823651, + "grad_norm": 0.044677734375, + "learning_rate": 0.0056040792887895475, + "loss": 0.8123, + "num_input_tokens_seen": 58001320, + "step": 99895 + }, + { + "epoch": 14.879356568364612, + "grad_norm": 0.041259765625, + "learning_rate": 0.005602559601975892, + "loss": 0.7994, + "num_input_tokens_seen": 58004520, + "step": 99900 + }, + { + "epoch": 14.88010128090557, + "grad_norm": 0.039794921875, + "learning_rate": 0.005601040073922444, + "loss": 0.8009, + "num_input_tokens_seen": 58007464, + "step": 99905 + }, + { + "epoch": 14.88084599344653, + "grad_norm": 0.06787109375, + "learning_rate": 0.0055995207046548685, + "loss": 0.8164, + "num_input_tokens_seen": 58010312, + "step": 99910 + }, + { + "epoch": 14.881590705987488, + "grad_norm": 0.03662109375, + "learning_rate": 0.005598001494198841, + "loss": 0.7861, + "num_input_tokens_seen": 58013160, + "step": 99915 + }, + { + "epoch": 14.882335418528449, + "grad_norm": 0.041015625, + "learning_rate": 0.005596482442580022, + "loss": 0.7962, + "num_input_tokens_seen": 58016104, + "step": 99920 + }, + { + "epoch": 14.883080131069407, + "grad_norm": 0.055908203125, + "learning_rate": 0.005594963549824076, + "loss": 0.7856, + "num_input_tokens_seen": 58019016, + "step": 99925 + }, + { + "epoch": 14.883824843610366, + "grad_norm": 0.046875, + "learning_rate": 0.005593444815956655, + "loss": 0.7821, + "num_input_tokens_seen": 58021864, + "step": 99930 + }, + { + "epoch": 14.884569556151325, + "grad_norm": 0.037109375, + "learning_rate": 0.0055919262410034296, + "loss": 0.7734, + "num_input_tokens_seen": 58024584, + "step": 99935 + }, + { + "epoch": 14.885314268692285, + "grad_norm": 0.05029296875, + "learning_rate": 0.005590407824990049, + "loss": 0.8027, + "num_input_tokens_seen": 58027624, + "step": 99940 + }, + { + "epoch": 14.886058981233244, + "grad_norm": 0.0419921875, + "learning_rate": 0.005588889567942157, + "loss": 0.8047, + "num_input_tokens_seen": 58030792, + "step": 99945 + }, + { + "epoch": 14.886803693774203, + "grad_norm": 0.060791015625, + "learning_rate": 0.0055873714698854195, + "loss": 0.7945, + "num_input_tokens_seen": 58034056, + "step": 99950 + }, + { + "epoch": 14.887548406315162, + "grad_norm": 0.05029296875, + "learning_rate": 0.005585853530845473, + "loss": 0.7854, + "num_input_tokens_seen": 58036712, + "step": 99955 + }, + { + "epoch": 14.888293118856122, + "grad_norm": 0.033935546875, + "learning_rate": 0.005584335750847957, + "loss": 0.8197, + "num_input_tokens_seen": 58039496, + "step": 99960 + }, + { + "epoch": 14.889037831397081, + "grad_norm": 0.02587890625, + "learning_rate": 0.0055828181299185245, + "loss": 0.7973, + "num_input_tokens_seen": 58042216, + "step": 99965 + }, + { + "epoch": 14.88978254393804, + "grad_norm": 0.03271484375, + "learning_rate": 0.005581300668082803, + "loss": 0.8002, + "num_input_tokens_seen": 58045128, + "step": 99970 + }, + { + "epoch": 14.890527256478999, + "grad_norm": 0.0625, + "learning_rate": 0.00557978336536644, + "loss": 0.8302, + "num_input_tokens_seen": 58047816, + "step": 99975 + }, + { + "epoch": 14.891271969019959, + "grad_norm": 0.056884765625, + "learning_rate": 0.0055782662217950555, + "loss": 0.8083, + "num_input_tokens_seen": 58050376, + "step": 99980 + }, + { + "epoch": 14.892016681560918, + "grad_norm": 0.1103515625, + "learning_rate": 0.005576749237394295, + "loss": 0.8201, + "num_input_tokens_seen": 58053416, + "step": 99985 + }, + { + "epoch": 14.892761394101877, + "grad_norm": 0.06005859375, + "learning_rate": 0.005575232412189777, + "loss": 0.8135, + "num_input_tokens_seen": 58056328, + "step": 99990 + }, + { + "epoch": 14.893506106642835, + "grad_norm": 0.0673828125, + "learning_rate": 0.00557371574620713, + "loss": 0.7955, + "num_input_tokens_seen": 58059368, + "step": 99995 + }, + { + "epoch": 14.894250819183796, + "grad_norm": 0.0654296875, + "learning_rate": 0.005572199239471975, + "loss": 0.7827, + "num_input_tokens_seen": 58062504, + "step": 100000 + }, + { + "epoch": 14.894995531724755, + "grad_norm": 0.047607421875, + "learning_rate": 0.0055706828920099255, + "loss": 0.8077, + "num_input_tokens_seen": 58065608, + "step": 100005 + }, + { + "epoch": 14.895740244265713, + "grad_norm": 0.060546875, + "learning_rate": 0.005569166703846612, + "loss": 0.7922, + "num_input_tokens_seen": 58068616, + "step": 100010 + }, + { + "epoch": 14.896484956806672, + "grad_norm": 0.044677734375, + "learning_rate": 0.0055676506750076335, + "loss": 0.784, + "num_input_tokens_seen": 58071400, + "step": 100015 + }, + { + "epoch": 14.897229669347631, + "grad_norm": 0.042724609375, + "learning_rate": 0.0055661348055186174, + "loss": 0.7949, + "num_input_tokens_seen": 58074344, + "step": 100020 + }, + { + "epoch": 14.897974381888591, + "grad_norm": 0.05126953125, + "learning_rate": 0.005564619095405161, + "loss": 0.8069, + "num_input_tokens_seen": 58077256, + "step": 100025 + }, + { + "epoch": 14.89871909442955, + "grad_norm": 0.056884765625, + "learning_rate": 0.00556310354469288, + "loss": 0.7908, + "num_input_tokens_seen": 58080072, + "step": 100030 + }, + { + "epoch": 14.899463806970509, + "grad_norm": 0.039306640625, + "learning_rate": 0.005561588153407374, + "loss": 0.7779, + "num_input_tokens_seen": 58082696, + "step": 100035 + }, + { + "epoch": 14.900208519511468, + "grad_norm": 0.04833984375, + "learning_rate": 0.005560072921574238, + "loss": 0.7766, + "num_input_tokens_seen": 58085832, + "step": 100040 + }, + { + "epoch": 14.900953232052428, + "grad_norm": 0.052978515625, + "learning_rate": 0.005558557849219079, + "loss": 0.7882, + "num_input_tokens_seen": 58088776, + "step": 100045 + }, + { + "epoch": 14.901697944593387, + "grad_norm": 0.052978515625, + "learning_rate": 0.005557042936367485, + "loss": 0.8007, + "num_input_tokens_seen": 58091688, + "step": 100050 + }, + { + "epoch": 14.902442657134346, + "grad_norm": 0.055908203125, + "learning_rate": 0.0055555281830450606, + "loss": 0.8241, + "num_input_tokens_seen": 58094408, + "step": 100055 + }, + { + "epoch": 14.903187369675305, + "grad_norm": 0.058349609375, + "learning_rate": 0.005554013589277386, + "loss": 0.8028, + "num_input_tokens_seen": 58097288, + "step": 100060 + }, + { + "epoch": 14.903932082216265, + "grad_norm": 0.043212890625, + "learning_rate": 0.005552499155090053, + "loss": 0.8077, + "num_input_tokens_seen": 58100360, + "step": 100065 + }, + { + "epoch": 14.904676794757224, + "grad_norm": 0.042724609375, + "learning_rate": 0.005550984880508639, + "loss": 0.7896, + "num_input_tokens_seen": 58103336, + "step": 100070 + }, + { + "epoch": 14.905421507298183, + "grad_norm": 0.05322265625, + "learning_rate": 0.005549470765558737, + "loss": 0.8151, + "num_input_tokens_seen": 58106120, + "step": 100075 + }, + { + "epoch": 14.906166219839141, + "grad_norm": 0.0390625, + "learning_rate": 0.005547956810265921, + "loss": 0.7795, + "num_input_tokens_seen": 58109224, + "step": 100080 + }, + { + "epoch": 14.906910932380102, + "grad_norm": 0.060546875, + "learning_rate": 0.005546443014655762, + "loss": 0.831, + "num_input_tokens_seen": 58111880, + "step": 100085 + }, + { + "epoch": 14.90765564492106, + "grad_norm": 0.042724609375, + "learning_rate": 0.005544929378753847, + "loss": 0.7764, + "num_input_tokens_seen": 58114728, + "step": 100090 + }, + { + "epoch": 14.90840035746202, + "grad_norm": 0.03955078125, + "learning_rate": 0.005543415902585735, + "loss": 0.7855, + "num_input_tokens_seen": 58117640, + "step": 100095 + }, + { + "epoch": 14.909145070002978, + "grad_norm": 0.0546875, + "learning_rate": 0.005541902586177004, + "loss": 0.7851, + "num_input_tokens_seen": 58120936, + "step": 100100 + }, + { + "epoch": 14.909889782543939, + "grad_norm": 0.0595703125, + "learning_rate": 0.005540389429553212, + "loss": 0.7901, + "num_input_tokens_seen": 58124264, + "step": 100105 + }, + { + "epoch": 14.910634495084897, + "grad_norm": 0.06298828125, + "learning_rate": 0.005538876432739931, + "loss": 0.805, + "num_input_tokens_seen": 58127080, + "step": 100110 + }, + { + "epoch": 14.911379207625856, + "grad_norm": 0.0301513671875, + "learning_rate": 0.005537363595762718, + "loss": 0.8138, + "num_input_tokens_seen": 58129928, + "step": 100115 + }, + { + "epoch": 14.912123920166815, + "grad_norm": 0.0380859375, + "learning_rate": 0.005535850918647124, + "loss": 0.7987, + "num_input_tokens_seen": 58132904, + "step": 100120 + }, + { + "epoch": 14.912868632707776, + "grad_norm": 0.03564453125, + "learning_rate": 0.005534338401418713, + "loss": 0.7997, + "num_input_tokens_seen": 58136008, + "step": 100125 + }, + { + "epoch": 14.913613345248734, + "grad_norm": 0.044921875, + "learning_rate": 0.0055328260441030366, + "loss": 0.7834, + "num_input_tokens_seen": 58138856, + "step": 100130 + }, + { + "epoch": 14.914358057789693, + "grad_norm": 0.056640625, + "learning_rate": 0.00553131384672564, + "loss": 0.8047, + "num_input_tokens_seen": 58141800, + "step": 100135 + }, + { + "epoch": 14.915102770330652, + "grad_norm": 0.057373046875, + "learning_rate": 0.005529801809312069, + "loss": 0.8063, + "num_input_tokens_seen": 58144680, + "step": 100140 + }, + { + "epoch": 14.915847482871612, + "grad_norm": 0.044189453125, + "learning_rate": 0.005528289931887876, + "loss": 0.7886, + "num_input_tokens_seen": 58147624, + "step": 100145 + }, + { + "epoch": 14.916592195412571, + "grad_norm": 0.046630859375, + "learning_rate": 0.005526778214478592, + "loss": 0.7847, + "num_input_tokens_seen": 58150216, + "step": 100150 + }, + { + "epoch": 14.91733690795353, + "grad_norm": 0.0361328125, + "learning_rate": 0.0055252666571097685, + "loss": 0.7907, + "num_input_tokens_seen": 58153096, + "step": 100155 + }, + { + "epoch": 14.918081620494489, + "grad_norm": 0.041748046875, + "learning_rate": 0.005523755259806935, + "loss": 0.7515, + "num_input_tokens_seen": 58155784, + "step": 100160 + }, + { + "epoch": 14.91882633303545, + "grad_norm": 0.0625, + "learning_rate": 0.00552224402259562, + "loss": 0.7907, + "num_input_tokens_seen": 58159016, + "step": 100165 + }, + { + "epoch": 14.919571045576408, + "grad_norm": 0.037353515625, + "learning_rate": 0.005520732945501364, + "loss": 0.789, + "num_input_tokens_seen": 58161736, + "step": 100170 + }, + { + "epoch": 14.920315758117367, + "grad_norm": 0.039306640625, + "learning_rate": 0.005519222028549686, + "loss": 0.7912, + "num_input_tokens_seen": 58164968, + "step": 100175 + }, + { + "epoch": 14.921060470658325, + "grad_norm": 0.0380859375, + "learning_rate": 0.00551771127176612, + "loss": 0.8005, + "num_input_tokens_seen": 58168008, + "step": 100180 + }, + { + "epoch": 14.921805183199286, + "grad_norm": 0.053466796875, + "learning_rate": 0.005516200675176181, + "loss": 0.8052, + "num_input_tokens_seen": 58171112, + "step": 100185 + }, + { + "epoch": 14.922549895740245, + "grad_norm": 0.05224609375, + "learning_rate": 0.005514690238805397, + "loss": 0.7846, + "num_input_tokens_seen": 58173960, + "step": 100190 + }, + { + "epoch": 14.923294608281203, + "grad_norm": 0.1376953125, + "learning_rate": 0.005513179962679281, + "loss": 0.8337, + "num_input_tokens_seen": 58177032, + "step": 100195 + }, + { + "epoch": 14.924039320822162, + "grad_norm": 0.035888671875, + "learning_rate": 0.005511669846823348, + "loss": 0.787, + "num_input_tokens_seen": 58180040, + "step": 100200 + }, + { + "epoch": 14.924784033363121, + "grad_norm": 0.07421875, + "learning_rate": 0.005510159891263109, + "loss": 0.8288, + "num_input_tokens_seen": 58182920, + "step": 100205 + }, + { + "epoch": 14.925528745904082, + "grad_norm": 0.03466796875, + "learning_rate": 0.005508650096024069, + "loss": 0.7891, + "num_input_tokens_seen": 58185672, + "step": 100210 + }, + { + "epoch": 14.92627345844504, + "grad_norm": 0.055419921875, + "learning_rate": 0.005507140461131744, + "loss": 0.7932, + "num_input_tokens_seen": 58188392, + "step": 100215 + }, + { + "epoch": 14.927018170985999, + "grad_norm": 0.044677734375, + "learning_rate": 0.005505630986611627, + "loss": 0.8059, + "num_input_tokens_seen": 58191432, + "step": 100220 + }, + { + "epoch": 14.927762883526958, + "grad_norm": 0.056640625, + "learning_rate": 0.00550412167248923, + "loss": 0.7801, + "num_input_tokens_seen": 58194472, + "step": 100225 + }, + { + "epoch": 14.928507596067918, + "grad_norm": 0.030029296875, + "learning_rate": 0.005502612518790042, + "loss": 0.7864, + "num_input_tokens_seen": 58197384, + "step": 100230 + }, + { + "epoch": 14.929252308608877, + "grad_norm": 0.04736328125, + "learning_rate": 0.005501103525539568, + "loss": 0.8112, + "num_input_tokens_seen": 58200488, + "step": 100235 + }, + { + "epoch": 14.929997021149836, + "grad_norm": 0.059326171875, + "learning_rate": 0.005499594692763294, + "loss": 0.7893, + "num_input_tokens_seen": 58203176, + "step": 100240 + }, + { + "epoch": 14.930741733690795, + "grad_norm": 0.047607421875, + "learning_rate": 0.005498086020486707, + "loss": 0.8107, + "num_input_tokens_seen": 58206152, + "step": 100245 + }, + { + "epoch": 14.931486446231755, + "grad_norm": 0.080078125, + "learning_rate": 0.0054965775087353055, + "loss": 0.792, + "num_input_tokens_seen": 58209384, + "step": 100250 + }, + { + "epoch": 14.932231158772714, + "grad_norm": 0.038818359375, + "learning_rate": 0.005495069157534568, + "loss": 0.773, + "num_input_tokens_seen": 58212168, + "step": 100255 + }, + { + "epoch": 14.932975871313673, + "grad_norm": 0.060302734375, + "learning_rate": 0.005493560966909971, + "loss": 0.7841, + "num_input_tokens_seen": 58215208, + "step": 100260 + }, + { + "epoch": 14.933720583854631, + "grad_norm": 0.04248046875, + "learning_rate": 0.005492052936887007, + "loss": 0.8351, + "num_input_tokens_seen": 58218184, + "step": 100265 + }, + { + "epoch": 14.934465296395592, + "grad_norm": 0.037353515625, + "learning_rate": 0.005490545067491142, + "loss": 0.7943, + "num_input_tokens_seen": 58221256, + "step": 100270 + }, + { + "epoch": 14.93521000893655, + "grad_norm": 0.052490234375, + "learning_rate": 0.00548903735874785, + "loss": 0.808, + "num_input_tokens_seen": 58224008, + "step": 100275 + }, + { + "epoch": 14.93595472147751, + "grad_norm": 0.032470703125, + "learning_rate": 0.005487529810682609, + "loss": 0.8181, + "num_input_tokens_seen": 58226888, + "step": 100280 + }, + { + "epoch": 14.936699434018468, + "grad_norm": 0.04150390625, + "learning_rate": 0.005486022423320885, + "loss": 0.7869, + "num_input_tokens_seen": 58229800, + "step": 100285 + }, + { + "epoch": 14.937444146559429, + "grad_norm": 0.052734375, + "learning_rate": 0.005484515196688139, + "loss": 0.7885, + "num_input_tokens_seen": 58232744, + "step": 100290 + }, + { + "epoch": 14.938188859100388, + "grad_norm": 0.03515625, + "learning_rate": 0.005483008130809839, + "loss": 0.7987, + "num_input_tokens_seen": 58235496, + "step": 100295 + }, + { + "epoch": 14.938933571641346, + "grad_norm": 0.050048828125, + "learning_rate": 0.0054815012257114415, + "loss": 0.7756, + "num_input_tokens_seen": 58238152, + "step": 100300 + }, + { + "epoch": 14.939678284182305, + "grad_norm": 0.054931640625, + "learning_rate": 0.0054799944814184105, + "loss": 0.8069, + "num_input_tokens_seen": 58240872, + "step": 100305 + }, + { + "epoch": 14.940422996723266, + "grad_norm": 0.03271484375, + "learning_rate": 0.005478487897956192, + "loss": 0.828, + "num_input_tokens_seen": 58243624, + "step": 100310 + }, + { + "epoch": 14.941167709264224, + "grad_norm": 0.0498046875, + "learning_rate": 0.0054769814753502484, + "loss": 0.8396, + "num_input_tokens_seen": 58246376, + "step": 100315 + }, + { + "epoch": 14.941912421805183, + "grad_norm": 0.03857421875, + "learning_rate": 0.0054754752136260245, + "loss": 0.7903, + "num_input_tokens_seen": 58249128, + "step": 100320 + }, + { + "epoch": 14.942657134346142, + "grad_norm": 0.058837890625, + "learning_rate": 0.0054739691128089666, + "loss": 0.8128, + "num_input_tokens_seen": 58251848, + "step": 100325 + }, + { + "epoch": 14.943401846887102, + "grad_norm": 0.046875, + "learning_rate": 0.005472463172924519, + "loss": 0.8142, + "num_input_tokens_seen": 58254888, + "step": 100330 + }, + { + "epoch": 14.944146559428061, + "grad_norm": 0.0390625, + "learning_rate": 0.0054709573939981165, + "loss": 0.7929, + "num_input_tokens_seen": 58257864, + "step": 100335 + }, + { + "epoch": 14.94489127196902, + "grad_norm": 0.03125, + "learning_rate": 0.00546945177605521, + "loss": 0.7803, + "num_input_tokens_seen": 58260808, + "step": 100340 + }, + { + "epoch": 14.945635984509979, + "grad_norm": 0.03515625, + "learning_rate": 0.005467946319121223, + "loss": 0.7877, + "num_input_tokens_seen": 58263432, + "step": 100345 + }, + { + "epoch": 14.946380697050937, + "grad_norm": 0.046142578125, + "learning_rate": 0.0054664410232216016, + "loss": 0.7835, + "num_input_tokens_seen": 58266280, + "step": 100350 + }, + { + "epoch": 14.947125409591898, + "grad_norm": 0.0595703125, + "learning_rate": 0.005464935888381765, + "loss": 0.8086, + "num_input_tokens_seen": 58269256, + "step": 100355 + }, + { + "epoch": 14.947870122132857, + "grad_norm": 0.0478515625, + "learning_rate": 0.005463430914627148, + "loss": 0.8013, + "num_input_tokens_seen": 58272072, + "step": 100360 + }, + { + "epoch": 14.948614834673815, + "grad_norm": 0.035400390625, + "learning_rate": 0.005461926101983175, + "loss": 0.8042, + "num_input_tokens_seen": 58275144, + "step": 100365 + }, + { + "epoch": 14.949359547214776, + "grad_norm": 0.051513671875, + "learning_rate": 0.005460421450475261, + "loss": 0.8296, + "num_input_tokens_seen": 58277768, + "step": 100370 + }, + { + "epoch": 14.950104259755735, + "grad_norm": 0.057373046875, + "learning_rate": 0.005458916960128837, + "loss": 0.7923, + "num_input_tokens_seen": 58280328, + "step": 100375 + }, + { + "epoch": 14.950848972296694, + "grad_norm": 0.0341796875, + "learning_rate": 0.005457412630969308, + "loss": 0.8019, + "num_input_tokens_seen": 58283304, + "step": 100380 + }, + { + "epoch": 14.951593684837652, + "grad_norm": 0.037109375, + "learning_rate": 0.0054559084630221, + "loss": 0.7888, + "num_input_tokens_seen": 58286216, + "step": 100385 + }, + { + "epoch": 14.952338397378611, + "grad_norm": 0.052978515625, + "learning_rate": 0.0054544044563126165, + "loss": 0.7859, + "num_input_tokens_seen": 58289064, + "step": 100390 + }, + { + "epoch": 14.953083109919572, + "grad_norm": 0.040283203125, + "learning_rate": 0.005452900610866269, + "loss": 0.7923, + "num_input_tokens_seen": 58292072, + "step": 100395 + }, + { + "epoch": 14.95382782246053, + "grad_norm": 0.043701171875, + "learning_rate": 0.005451396926708462, + "loss": 0.7975, + "num_input_tokens_seen": 58294632, + "step": 100400 + }, + { + "epoch": 14.954572535001489, + "grad_norm": 0.044189453125, + "learning_rate": 0.005449893403864594, + "loss": 0.7888, + "num_input_tokens_seen": 58297576, + "step": 100405 + }, + { + "epoch": 14.955317247542448, + "grad_norm": 0.027587890625, + "learning_rate": 0.005448390042360076, + "loss": 0.8125, + "num_input_tokens_seen": 58300392, + "step": 100410 + }, + { + "epoch": 14.956061960083408, + "grad_norm": 0.03369140625, + "learning_rate": 0.005446886842220297, + "loss": 0.8093, + "num_input_tokens_seen": 58303272, + "step": 100415 + }, + { + "epoch": 14.956806672624367, + "grad_norm": 0.07763671875, + "learning_rate": 0.005445383803470658, + "loss": 0.82, + "num_input_tokens_seen": 58306472, + "step": 100420 + }, + { + "epoch": 14.957551385165326, + "grad_norm": 0.0264892578125, + "learning_rate": 0.005443880926136544, + "loss": 0.7998, + "num_input_tokens_seen": 58309224, + "step": 100425 + }, + { + "epoch": 14.958296097706285, + "grad_norm": 0.046630859375, + "learning_rate": 0.005442378210243355, + "loss": 0.7934, + "num_input_tokens_seen": 58311912, + "step": 100430 + }, + { + "epoch": 14.959040810247245, + "grad_norm": 0.024169921875, + "learning_rate": 0.005440875655816467, + "loss": 0.7962, + "num_input_tokens_seen": 58315048, + "step": 100435 + }, + { + "epoch": 14.959785522788204, + "grad_norm": 0.03173828125, + "learning_rate": 0.005439373262881276, + "loss": 0.795, + "num_input_tokens_seen": 58318024, + "step": 100440 + }, + { + "epoch": 14.960530235329163, + "grad_norm": 0.04345703125, + "learning_rate": 0.005437871031463154, + "loss": 0.7942, + "num_input_tokens_seen": 58320712, + "step": 100445 + }, + { + "epoch": 14.961274947870121, + "grad_norm": 0.1318359375, + "learning_rate": 0.0054363689615874785, + "loss": 0.8345, + "num_input_tokens_seen": 58323272, + "step": 100450 + }, + { + "epoch": 14.962019660411082, + "grad_norm": 0.07568359375, + "learning_rate": 0.0054348670532796345, + "loss": 0.7954, + "num_input_tokens_seen": 58326056, + "step": 100455 + }, + { + "epoch": 14.96276437295204, + "grad_norm": 0.048095703125, + "learning_rate": 0.00543336530656499, + "loss": 0.7953, + "num_input_tokens_seen": 58328936, + "step": 100460 + }, + { + "epoch": 14.963509085493, + "grad_norm": 0.028564453125, + "learning_rate": 0.005431863721468916, + "loss": 0.8036, + "num_input_tokens_seen": 58331944, + "step": 100465 + }, + { + "epoch": 14.964253798033958, + "grad_norm": 0.042724609375, + "learning_rate": 0.0054303622980167755, + "loss": 0.7761, + "num_input_tokens_seen": 58334952, + "step": 100470 + }, + { + "epoch": 14.964998510574919, + "grad_norm": 0.0311279296875, + "learning_rate": 0.00542886103623394, + "loss": 0.7993, + "num_input_tokens_seen": 58337864, + "step": 100475 + }, + { + "epoch": 14.965743223115878, + "grad_norm": 0.049560546875, + "learning_rate": 0.005427359936145772, + "loss": 0.7739, + "num_input_tokens_seen": 58340744, + "step": 100480 + }, + { + "epoch": 14.966487935656836, + "grad_norm": 0.039794921875, + "learning_rate": 0.005425858997777624, + "loss": 0.7849, + "num_input_tokens_seen": 58343624, + "step": 100485 + }, + { + "epoch": 14.967232648197795, + "grad_norm": 0.0634765625, + "learning_rate": 0.005424358221154861, + "loss": 0.8039, + "num_input_tokens_seen": 58346376, + "step": 100490 + }, + { + "epoch": 14.967977360738756, + "grad_norm": 0.03564453125, + "learning_rate": 0.005422857606302829, + "loss": 0.8137, + "num_input_tokens_seen": 58349032, + "step": 100495 + }, + { + "epoch": 14.968722073279714, + "grad_norm": 0.033935546875, + "learning_rate": 0.005421357153246889, + "loss": 0.7866, + "num_input_tokens_seen": 58351816, + "step": 100500 + }, + { + "epoch": 14.969466785820673, + "grad_norm": 0.03125, + "learning_rate": 0.00541985686201238, + "loss": 0.8129, + "num_input_tokens_seen": 58354792, + "step": 100505 + }, + { + "epoch": 14.970211498361632, + "grad_norm": 0.05126953125, + "learning_rate": 0.005418356732624658, + "loss": 0.7855, + "num_input_tokens_seen": 58357672, + "step": 100510 + }, + { + "epoch": 14.970956210902592, + "grad_norm": 0.0498046875, + "learning_rate": 0.0054168567651090545, + "loss": 0.7976, + "num_input_tokens_seen": 58360552, + "step": 100515 + }, + { + "epoch": 14.971700923443551, + "grad_norm": 0.052978515625, + "learning_rate": 0.005415356959490922, + "loss": 0.7997, + "num_input_tokens_seen": 58363528, + "step": 100520 + }, + { + "epoch": 14.97244563598451, + "grad_norm": 0.050048828125, + "learning_rate": 0.005413857315795591, + "loss": 0.7922, + "num_input_tokens_seen": 58366088, + "step": 100525 + }, + { + "epoch": 14.973190348525469, + "grad_norm": 0.05029296875, + "learning_rate": 0.0054123578340483985, + "loss": 0.8072, + "num_input_tokens_seen": 58368840, + "step": 100530 + }, + { + "epoch": 14.973935061066427, + "grad_norm": 0.044189453125, + "learning_rate": 0.005410858514274676, + "loss": 0.8012, + "num_input_tokens_seen": 58371592, + "step": 100535 + }, + { + "epoch": 14.974679773607388, + "grad_norm": 0.026611328125, + "learning_rate": 0.005409359356499747, + "loss": 0.8019, + "num_input_tokens_seen": 58374376, + "step": 100540 + }, + { + "epoch": 14.975424486148347, + "grad_norm": 0.080078125, + "learning_rate": 0.005407860360748951, + "loss": 0.8234, + "num_input_tokens_seen": 58377288, + "step": 100545 + }, + { + "epoch": 14.976169198689306, + "grad_norm": 0.038818359375, + "learning_rate": 0.005406361527047599, + "loss": 0.7813, + "num_input_tokens_seen": 58380136, + "step": 100550 + }, + { + "epoch": 14.976913911230266, + "grad_norm": 0.04052734375, + "learning_rate": 0.005404862855421024, + "loss": 0.7912, + "num_input_tokens_seen": 58383336, + "step": 100555 + }, + { + "epoch": 14.977658623771225, + "grad_norm": 0.037109375, + "learning_rate": 0.005403364345894539, + "loss": 0.8025, + "num_input_tokens_seen": 58386408, + "step": 100560 + }, + { + "epoch": 14.978403336312184, + "grad_norm": 0.0458984375, + "learning_rate": 0.005401865998493455, + "loss": 0.7756, + "num_input_tokens_seen": 58389288, + "step": 100565 + }, + { + "epoch": 14.979148048853142, + "grad_norm": 0.09619140625, + "learning_rate": 0.005400367813243094, + "loss": 0.7696, + "num_input_tokens_seen": 58392072, + "step": 100570 + }, + { + "epoch": 14.979892761394101, + "grad_norm": 0.04443359375, + "learning_rate": 0.005398869790168757, + "loss": 0.7996, + "num_input_tokens_seen": 58395240, + "step": 100575 + }, + { + "epoch": 14.980637473935062, + "grad_norm": 0.051025390625, + "learning_rate": 0.005397371929295763, + "loss": 0.7911, + "num_input_tokens_seen": 58398056, + "step": 100580 + }, + { + "epoch": 14.98138218647602, + "grad_norm": 0.03955078125, + "learning_rate": 0.0053958742306494095, + "loss": 0.7734, + "num_input_tokens_seen": 58400904, + "step": 100585 + }, + { + "epoch": 14.98212689901698, + "grad_norm": 0.03125, + "learning_rate": 0.005394376694254996, + "loss": 0.8302, + "num_input_tokens_seen": 58403880, + "step": 100590 + }, + { + "epoch": 14.982871611557938, + "grad_norm": 0.042236328125, + "learning_rate": 0.005392879320137828, + "loss": 0.7916, + "num_input_tokens_seen": 58406728, + "step": 100595 + }, + { + "epoch": 14.983616324098898, + "grad_norm": 0.050537109375, + "learning_rate": 0.005391382108323201, + "loss": 0.7952, + "num_input_tokens_seen": 58409352, + "step": 100600 + }, + { + "epoch": 14.984361036639857, + "grad_norm": 0.042724609375, + "learning_rate": 0.0053898850588364085, + "loss": 0.7745, + "num_input_tokens_seen": 58412072, + "step": 100605 + }, + { + "epoch": 14.985105749180816, + "grad_norm": 0.0262451171875, + "learning_rate": 0.005388388171702735, + "loss": 0.8188, + "num_input_tokens_seen": 58414536, + "step": 100610 + }, + { + "epoch": 14.985850461721775, + "grad_norm": 0.052978515625, + "learning_rate": 0.0053868914469474794, + "loss": 0.7943, + "num_input_tokens_seen": 58417576, + "step": 100615 + }, + { + "epoch": 14.986595174262735, + "grad_norm": 0.07958984375, + "learning_rate": 0.005385394884595917, + "loss": 0.8049, + "num_input_tokens_seen": 58420744, + "step": 100620 + }, + { + "epoch": 14.987339886803694, + "grad_norm": 0.042236328125, + "learning_rate": 0.005383898484673341, + "loss": 0.8041, + "num_input_tokens_seen": 58423560, + "step": 100625 + }, + { + "epoch": 14.988084599344653, + "grad_norm": 0.049072265625, + "learning_rate": 0.005382402247205023, + "loss": 0.7841, + "num_input_tokens_seen": 58426408, + "step": 100630 + }, + { + "epoch": 14.988829311885612, + "grad_norm": 0.0595703125, + "learning_rate": 0.0053809061722162495, + "loss": 0.8152, + "num_input_tokens_seen": 58429544, + "step": 100635 + }, + { + "epoch": 14.989574024426572, + "grad_norm": 0.08544921875, + "learning_rate": 0.005379410259732288, + "loss": 0.7977, + "num_input_tokens_seen": 58432520, + "step": 100640 + }, + { + "epoch": 14.99031873696753, + "grad_norm": 0.09619140625, + "learning_rate": 0.005377914509778409, + "loss": 0.8179, + "num_input_tokens_seen": 58435528, + "step": 100645 + }, + { + "epoch": 14.99106344950849, + "grad_norm": 0.0311279296875, + "learning_rate": 0.005376418922379888, + "loss": 0.8351, + "num_input_tokens_seen": 58438536, + "step": 100650 + }, + { + "epoch": 14.991808162049448, + "grad_norm": 0.06005859375, + "learning_rate": 0.005374923497561989, + "loss": 0.8065, + "num_input_tokens_seen": 58441512, + "step": 100655 + }, + { + "epoch": 14.992552874590409, + "grad_norm": 0.0625, + "learning_rate": 0.005373428235349977, + "loss": 0.7941, + "num_input_tokens_seen": 58444360, + "step": 100660 + }, + { + "epoch": 14.993297587131368, + "grad_norm": 0.030029296875, + "learning_rate": 0.005371933135769104, + "loss": 0.7948, + "num_input_tokens_seen": 58447464, + "step": 100665 + }, + { + "epoch": 14.994042299672326, + "grad_norm": 0.044189453125, + "learning_rate": 0.0053704381988446395, + "loss": 0.8017, + "num_input_tokens_seen": 58450440, + "step": 100670 + }, + { + "epoch": 14.994787012213285, + "grad_norm": 0.08447265625, + "learning_rate": 0.00536894342460183, + "loss": 0.8004, + "num_input_tokens_seen": 58453224, + "step": 100675 + }, + { + "epoch": 14.995531724754246, + "grad_norm": 0.03369140625, + "learning_rate": 0.005367448813065939, + "loss": 0.8096, + "num_input_tokens_seen": 58455976, + "step": 100680 + }, + { + "epoch": 14.996276437295204, + "grad_norm": 0.037353515625, + "learning_rate": 0.00536595436426221, + "loss": 0.8289, + "num_input_tokens_seen": 58459016, + "step": 100685 + }, + { + "epoch": 14.997021149836163, + "grad_norm": 0.03662109375, + "learning_rate": 0.005364460078215885, + "loss": 0.8089, + "num_input_tokens_seen": 58461800, + "step": 100690 + }, + { + "epoch": 14.997765862377122, + "grad_norm": 0.072265625, + "learning_rate": 0.005362965954952218, + "loss": 0.784, + "num_input_tokens_seen": 58464648, + "step": 100695 + }, + { + "epoch": 14.998510574918082, + "grad_norm": 0.05908203125, + "learning_rate": 0.005361471994496442, + "loss": 0.8199, + "num_input_tokens_seen": 58467688, + "step": 100700 + }, + { + "epoch": 14.999255287459041, + "grad_norm": 0.056884765625, + "learning_rate": 0.005359978196873806, + "loss": 0.7957, + "num_input_tokens_seen": 58470376, + "step": 100705 + }, + { + "epoch": 15.0, + "grad_norm": 0.1181640625, + "learning_rate": 0.005358484562109536, + "loss": 0.8029, + "num_input_tokens_seen": 58472760, + "step": 100710 + }, + { + "epoch": 15.0, + "eval_loss": 0.8003412485122681, + "eval_runtime": 70.6033, + "eval_samples_per_second": 42.264, + "eval_steps_per_second": 10.566, + "num_input_tokens_seen": 58472760, + "step": 100710 + }, + { + "epoch": 15.000744712540959, + "grad_norm": 0.053466796875, + "learning_rate": 0.005356991090228875, + "loss": 0.7785, + "num_input_tokens_seen": 58475768, + "step": 100715 + }, + { + "epoch": 15.001489425081918, + "grad_norm": 0.045166015625, + "learning_rate": 0.0053554977812570485, + "loss": 0.8096, + "num_input_tokens_seen": 58478936, + "step": 100720 + }, + { + "epoch": 15.002234137622878, + "grad_norm": 0.0576171875, + "learning_rate": 0.005354004635219285, + "loss": 0.8076, + "num_input_tokens_seen": 58481688, + "step": 100725 + }, + { + "epoch": 15.002978850163837, + "grad_norm": 0.051025390625, + "learning_rate": 0.005352511652140809, + "loss": 0.8007, + "num_input_tokens_seen": 58484536, + "step": 100730 + }, + { + "epoch": 15.003723562704796, + "grad_norm": 0.052734375, + "learning_rate": 0.005351018832046837, + "loss": 0.7867, + "num_input_tokens_seen": 58487544, + "step": 100735 + }, + { + "epoch": 15.004468275245754, + "grad_norm": 0.037109375, + "learning_rate": 0.005349526174962603, + "loss": 0.8126, + "num_input_tokens_seen": 58490200, + "step": 100740 + }, + { + "epoch": 15.005212987786715, + "grad_norm": 0.04052734375, + "learning_rate": 0.005348033680913309, + "loss": 0.8048, + "num_input_tokens_seen": 58493336, + "step": 100745 + }, + { + "epoch": 15.005957700327674, + "grad_norm": 0.035888671875, + "learning_rate": 0.0053465413499241814, + "loss": 0.7925, + "num_input_tokens_seen": 58496184, + "step": 100750 + }, + { + "epoch": 15.006702412868632, + "grad_norm": 0.044677734375, + "learning_rate": 0.005345049182020422, + "loss": 0.7839, + "num_input_tokens_seen": 58499128, + "step": 100755 + }, + { + "epoch": 15.007447125409591, + "grad_norm": 0.04345703125, + "learning_rate": 0.005343557177227249, + "loss": 0.7975, + "num_input_tokens_seen": 58502200, + "step": 100760 + }, + { + "epoch": 15.008191837950552, + "grad_norm": 0.034423828125, + "learning_rate": 0.005342065335569862, + "loss": 0.7999, + "num_input_tokens_seen": 58505048, + "step": 100765 + }, + { + "epoch": 15.00893655049151, + "grad_norm": 0.042724609375, + "learning_rate": 0.00534057365707346, + "loss": 0.799, + "num_input_tokens_seen": 58507576, + "step": 100770 + }, + { + "epoch": 15.00968126303247, + "grad_norm": 0.0380859375, + "learning_rate": 0.005339082141763254, + "loss": 0.7849, + "num_input_tokens_seen": 58510840, + "step": 100775 + }, + { + "epoch": 15.010425975573428, + "grad_norm": 0.037841796875, + "learning_rate": 0.0053375907896644325, + "loss": 0.7711, + "num_input_tokens_seen": 58513688, + "step": 100780 + }, + { + "epoch": 15.011170688114388, + "grad_norm": 0.328125, + "learning_rate": 0.0053360996008021975, + "loss": 0.8082, + "num_input_tokens_seen": 58516632, + "step": 100785 + }, + { + "epoch": 15.011915400655347, + "grad_norm": 0.0478515625, + "learning_rate": 0.00533460857520174, + "loss": 0.7896, + "num_input_tokens_seen": 58519352, + "step": 100790 + }, + { + "epoch": 15.012660113196306, + "grad_norm": 0.07275390625, + "learning_rate": 0.005333117712888245, + "loss": 0.7957, + "num_input_tokens_seen": 58522040, + "step": 100795 + }, + { + "epoch": 15.013404825737265, + "grad_norm": 0.05224609375, + "learning_rate": 0.005331627013886897, + "loss": 0.7873, + "num_input_tokens_seen": 58524888, + "step": 100800 + }, + { + "epoch": 15.014149538278225, + "grad_norm": 0.054931640625, + "learning_rate": 0.00533013647822289, + "loss": 0.8019, + "num_input_tokens_seen": 58527896, + "step": 100805 + }, + { + "epoch": 15.014894250819184, + "grad_norm": 0.07861328125, + "learning_rate": 0.005328646105921399, + "loss": 0.8116, + "num_input_tokens_seen": 58530744, + "step": 100810 + }, + { + "epoch": 15.015638963360143, + "grad_norm": 0.048095703125, + "learning_rate": 0.005327155897007596, + "loss": 0.7883, + "num_input_tokens_seen": 58533656, + "step": 100815 + }, + { + "epoch": 15.016383675901102, + "grad_norm": 0.06494140625, + "learning_rate": 0.005325665851506669, + "loss": 0.7976, + "num_input_tokens_seen": 58536792, + "step": 100820 + }, + { + "epoch": 15.017128388442062, + "grad_norm": 0.0361328125, + "learning_rate": 0.005324175969443779, + "loss": 0.8137, + "num_input_tokens_seen": 58539640, + "step": 100825 + }, + { + "epoch": 15.01787310098302, + "grad_norm": 0.03173828125, + "learning_rate": 0.005322686250844109, + "loss": 0.7979, + "num_input_tokens_seen": 58542328, + "step": 100830 + }, + { + "epoch": 15.01861781352398, + "grad_norm": 0.0634765625, + "learning_rate": 0.005321196695732813, + "loss": 0.8003, + "num_input_tokens_seen": 58545208, + "step": 100835 + }, + { + "epoch": 15.019362526064938, + "grad_norm": 0.0242919921875, + "learning_rate": 0.005319707304135067, + "loss": 0.8059, + "num_input_tokens_seen": 58548152, + "step": 100840 + }, + { + "epoch": 15.020107238605899, + "grad_norm": 0.037109375, + "learning_rate": 0.005318218076076028, + "loss": 0.782, + "num_input_tokens_seen": 58551160, + "step": 100845 + }, + { + "epoch": 15.020851951146858, + "grad_norm": 0.04541015625, + "learning_rate": 0.0053167290115808475, + "loss": 0.7809, + "num_input_tokens_seen": 58554040, + "step": 100850 + }, + { + "epoch": 15.021596663687816, + "grad_norm": 0.0478515625, + "learning_rate": 0.005315240110674696, + "loss": 0.8056, + "num_input_tokens_seen": 58556952, + "step": 100855 + }, + { + "epoch": 15.022341376228775, + "grad_norm": 0.034912109375, + "learning_rate": 0.005313751373382718, + "loss": 0.7931, + "num_input_tokens_seen": 58559960, + "step": 100860 + }, + { + "epoch": 15.023086088769736, + "grad_norm": 0.037841796875, + "learning_rate": 0.005312262799730067, + "loss": 0.7942, + "num_input_tokens_seen": 58563160, + "step": 100865 + }, + { + "epoch": 15.023830801310694, + "grad_norm": 0.036865234375, + "learning_rate": 0.005310774389741884, + "loss": 0.7882, + "num_input_tokens_seen": 58566136, + "step": 100870 + }, + { + "epoch": 15.024575513851653, + "grad_norm": 0.0341796875, + "learning_rate": 0.005309286143443325, + "loss": 0.7812, + "num_input_tokens_seen": 58569080, + "step": 100875 + }, + { + "epoch": 15.025320226392612, + "grad_norm": 0.047607421875, + "learning_rate": 0.005307798060859524, + "loss": 0.7785, + "num_input_tokens_seen": 58571736, + "step": 100880 + }, + { + "epoch": 15.02606493893357, + "grad_norm": 0.032958984375, + "learning_rate": 0.005306310142015626, + "loss": 0.8242, + "num_input_tokens_seen": 58574520, + "step": 100885 + }, + { + "epoch": 15.026809651474531, + "grad_norm": 0.045654296875, + "learning_rate": 0.005304822386936768, + "loss": 0.7848, + "num_input_tokens_seen": 58577176, + "step": 100890 + }, + { + "epoch": 15.02755436401549, + "grad_norm": 0.054443359375, + "learning_rate": 0.005303334795648077, + "loss": 0.7746, + "num_input_tokens_seen": 58580056, + "step": 100895 + }, + { + "epoch": 15.028299076556449, + "grad_norm": 0.053466796875, + "learning_rate": 0.005301847368174695, + "loss": 0.8004, + "num_input_tokens_seen": 58582776, + "step": 100900 + }, + { + "epoch": 15.029043789097408, + "grad_norm": 0.0322265625, + "learning_rate": 0.005300360104541738, + "loss": 0.7981, + "num_input_tokens_seen": 58585816, + "step": 100905 + }, + { + "epoch": 15.029788501638368, + "grad_norm": 0.031982421875, + "learning_rate": 0.005298873004774345, + "loss": 0.7938, + "num_input_tokens_seen": 58588632, + "step": 100910 + }, + { + "epoch": 15.030533214179327, + "grad_norm": 0.05078125, + "learning_rate": 0.0052973860688976335, + "loss": 0.7902, + "num_input_tokens_seen": 58591416, + "step": 100915 + }, + { + "epoch": 15.031277926720286, + "grad_norm": 0.0791015625, + "learning_rate": 0.005295899296936716, + "loss": 0.7995, + "num_input_tokens_seen": 58594328, + "step": 100920 + }, + { + "epoch": 15.032022639261244, + "grad_norm": 0.06494140625, + "learning_rate": 0.005294412688916725, + "loss": 0.8017, + "num_input_tokens_seen": 58597400, + "step": 100925 + }, + { + "epoch": 15.032767351802205, + "grad_norm": 0.0537109375, + "learning_rate": 0.005292926244862764, + "loss": 0.8083, + "num_input_tokens_seen": 58600120, + "step": 100930 + }, + { + "epoch": 15.033512064343164, + "grad_norm": 0.06591796875, + "learning_rate": 0.00529143996479995, + "loss": 0.8046, + "num_input_tokens_seen": 58603032, + "step": 100935 + }, + { + "epoch": 15.034256776884122, + "grad_norm": 0.0322265625, + "learning_rate": 0.005289953848753385, + "loss": 0.7959, + "num_input_tokens_seen": 58605880, + "step": 100940 + }, + { + "epoch": 15.035001489425081, + "grad_norm": 0.047119140625, + "learning_rate": 0.005288467896748187, + "loss": 0.7957, + "num_input_tokens_seen": 58608664, + "step": 100945 + }, + { + "epoch": 15.035746201966042, + "grad_norm": 0.038818359375, + "learning_rate": 0.005286982108809448, + "loss": 0.8016, + "num_input_tokens_seen": 58611288, + "step": 100950 + }, + { + "epoch": 15.036490914507, + "grad_norm": 0.03564453125, + "learning_rate": 0.005285496484962278, + "loss": 0.8398, + "num_input_tokens_seen": 58613944, + "step": 100955 + }, + { + "epoch": 15.03723562704796, + "grad_norm": 0.04150390625, + "learning_rate": 0.005284011025231768, + "loss": 0.795, + "num_input_tokens_seen": 58616696, + "step": 100960 + }, + { + "epoch": 15.037980339588918, + "grad_norm": 0.043212890625, + "learning_rate": 0.005282525729643022, + "loss": 0.7944, + "num_input_tokens_seen": 58619704, + "step": 100965 + }, + { + "epoch": 15.038725052129879, + "grad_norm": 0.07861328125, + "learning_rate": 0.005281040598221126, + "loss": 0.7944, + "num_input_tokens_seen": 58622936, + "step": 100970 + }, + { + "epoch": 15.039469764670837, + "grad_norm": 0.052001953125, + "learning_rate": 0.005279555630991168, + "loss": 0.7797, + "num_input_tokens_seen": 58625912, + "step": 100975 + }, + { + "epoch": 15.040214477211796, + "grad_norm": 0.030029296875, + "learning_rate": 0.0052780708279782425, + "loss": 0.8104, + "num_input_tokens_seen": 58628888, + "step": 100980 + }, + { + "epoch": 15.040959189752755, + "grad_norm": 0.042236328125, + "learning_rate": 0.005276586189207427, + "loss": 0.8042, + "num_input_tokens_seen": 58631992, + "step": 100985 + }, + { + "epoch": 15.041703902293715, + "grad_norm": 0.06298828125, + "learning_rate": 0.00527510171470381, + "loss": 0.8044, + "num_input_tokens_seen": 58634968, + "step": 100990 + }, + { + "epoch": 15.042448614834674, + "grad_norm": 0.046142578125, + "learning_rate": 0.005273617404492456, + "loss": 0.7681, + "num_input_tokens_seen": 58637816, + "step": 100995 + }, + { + "epoch": 15.043193327375633, + "grad_norm": 0.040283203125, + "learning_rate": 0.0052721332585984595, + "loss": 0.8128, + "num_input_tokens_seen": 58640536, + "step": 101000 + }, + { + "epoch": 15.043938039916592, + "grad_norm": 0.03173828125, + "learning_rate": 0.0052706492770468825, + "loss": 0.7952, + "num_input_tokens_seen": 58643448, + "step": 101005 + }, + { + "epoch": 15.044682752457552, + "grad_norm": 0.04931640625, + "learning_rate": 0.005269165459862791, + "loss": 0.7831, + "num_input_tokens_seen": 58646808, + "step": 101010 + }, + { + "epoch": 15.045427464998511, + "grad_norm": 0.042724609375, + "learning_rate": 0.005267681807071267, + "loss": 0.8093, + "num_input_tokens_seen": 58649656, + "step": 101015 + }, + { + "epoch": 15.04617217753947, + "grad_norm": 0.06494140625, + "learning_rate": 0.005266198318697362, + "loss": 0.7712, + "num_input_tokens_seen": 58652632, + "step": 101020 + }, + { + "epoch": 15.046916890080428, + "grad_norm": 0.068359375, + "learning_rate": 0.005264714994766146, + "loss": 0.781, + "num_input_tokens_seen": 58655480, + "step": 101025 + }, + { + "epoch": 15.047661602621389, + "grad_norm": 0.04345703125, + "learning_rate": 0.0052632318353026735, + "loss": 0.7967, + "num_input_tokens_seen": 58658520, + "step": 101030 + }, + { + "epoch": 15.048406315162348, + "grad_norm": 0.054443359375, + "learning_rate": 0.005261748840332006, + "loss": 0.7964, + "num_input_tokens_seen": 58661432, + "step": 101035 + }, + { + "epoch": 15.049151027703306, + "grad_norm": 0.038818359375, + "learning_rate": 0.005260266009879191, + "loss": 0.7907, + "num_input_tokens_seen": 58664408, + "step": 101040 + }, + { + "epoch": 15.049895740244265, + "grad_norm": 0.08740234375, + "learning_rate": 0.0052587833439692875, + "loss": 0.8015, + "num_input_tokens_seen": 58667544, + "step": 101045 + }, + { + "epoch": 15.050640452785226, + "grad_norm": 0.049072265625, + "learning_rate": 0.005257300842627339, + "loss": 0.7966, + "num_input_tokens_seen": 58670776, + "step": 101050 + }, + { + "epoch": 15.051385165326185, + "grad_norm": 0.033935546875, + "learning_rate": 0.005255818505878388, + "loss": 0.8019, + "num_input_tokens_seen": 58673496, + "step": 101055 + }, + { + "epoch": 15.052129877867143, + "grad_norm": 0.05224609375, + "learning_rate": 0.005254336333747484, + "loss": 0.7852, + "num_input_tokens_seen": 58676600, + "step": 101060 + }, + { + "epoch": 15.052874590408102, + "grad_norm": 0.023681640625, + "learning_rate": 0.005252854326259653, + "loss": 0.7836, + "num_input_tokens_seen": 58679480, + "step": 101065 + }, + { + "epoch": 15.05361930294906, + "grad_norm": 0.0673828125, + "learning_rate": 0.00525137248343995, + "loss": 0.7831, + "num_input_tokens_seen": 58682648, + "step": 101070 + }, + { + "epoch": 15.054364015490021, + "grad_norm": 0.044677734375, + "learning_rate": 0.005249890805313395, + "loss": 0.8128, + "num_input_tokens_seen": 58685336, + "step": 101075 + }, + { + "epoch": 15.05510872803098, + "grad_norm": 0.037841796875, + "learning_rate": 0.005248409291905031, + "loss": 0.7889, + "num_input_tokens_seen": 58688280, + "step": 101080 + }, + { + "epoch": 15.055853440571939, + "grad_norm": 0.0498046875, + "learning_rate": 0.00524692794323988, + "loss": 0.7915, + "num_input_tokens_seen": 58691032, + "step": 101085 + }, + { + "epoch": 15.056598153112898, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0052454467593429635, + "loss": 0.832, + "num_input_tokens_seen": 58693720, + "step": 101090 + }, + { + "epoch": 15.057342865653858, + "grad_norm": 0.039794921875, + "learning_rate": 0.005243965740239315, + "loss": 0.8045, + "num_input_tokens_seen": 58696440, + "step": 101095 + }, + { + "epoch": 15.058087578194817, + "grad_norm": 0.05078125, + "learning_rate": 0.005242484885953946, + "loss": 0.7916, + "num_input_tokens_seen": 58699480, + "step": 101100 + }, + { + "epoch": 15.058832290735776, + "grad_norm": 0.04638671875, + "learning_rate": 0.005241004196511882, + "loss": 0.7844, + "num_input_tokens_seen": 58702488, + "step": 101105 + }, + { + "epoch": 15.059577003276734, + "grad_norm": 0.03466796875, + "learning_rate": 0.005239523671938129, + "loss": 0.7887, + "num_input_tokens_seen": 58705368, + "step": 101110 + }, + { + "epoch": 15.060321715817695, + "grad_norm": 0.035400390625, + "learning_rate": 0.005238043312257706, + "loss": 0.7956, + "num_input_tokens_seen": 58708088, + "step": 101115 + }, + { + "epoch": 15.061066428358654, + "grad_norm": 0.0390625, + "learning_rate": 0.005236563117495622, + "loss": 0.7886, + "num_input_tokens_seen": 58711128, + "step": 101120 + }, + { + "epoch": 15.061811140899612, + "grad_norm": 0.0361328125, + "learning_rate": 0.005235083087676881, + "loss": 0.7903, + "num_input_tokens_seen": 58714008, + "step": 101125 + }, + { + "epoch": 15.062555853440571, + "grad_norm": 0.0419921875, + "learning_rate": 0.005233603222826486, + "loss": 0.7781, + "num_input_tokens_seen": 58716984, + "step": 101130 + }, + { + "epoch": 15.063300565981532, + "grad_norm": 0.042724609375, + "learning_rate": 0.0052321235229694325, + "loss": 0.7878, + "num_input_tokens_seen": 58719832, + "step": 101135 + }, + { + "epoch": 15.06404527852249, + "grad_norm": 0.0380859375, + "learning_rate": 0.005230643988130732, + "loss": 0.7792, + "num_input_tokens_seen": 58722712, + "step": 101140 + }, + { + "epoch": 15.06478999106345, + "grad_norm": 0.04150390625, + "learning_rate": 0.005229164618335365, + "loss": 0.7985, + "num_input_tokens_seen": 58725816, + "step": 101145 + }, + { + "epoch": 15.065534703604408, + "grad_norm": 0.042724609375, + "learning_rate": 0.005227685413608337, + "loss": 0.8006, + "num_input_tokens_seen": 58728568, + "step": 101150 + }, + { + "epoch": 15.066279416145369, + "grad_norm": 0.042236328125, + "learning_rate": 0.005226206373974626, + "loss": 0.8095, + "num_input_tokens_seen": 58731384, + "step": 101155 + }, + { + "epoch": 15.067024128686327, + "grad_norm": 0.03662109375, + "learning_rate": 0.00522472749945923, + "loss": 0.8, + "num_input_tokens_seen": 58734392, + "step": 101160 + }, + { + "epoch": 15.067768841227286, + "grad_norm": 0.030029296875, + "learning_rate": 0.005223248790087129, + "loss": 0.7992, + "num_input_tokens_seen": 58737144, + "step": 101165 + }, + { + "epoch": 15.068513553768245, + "grad_norm": 0.0390625, + "learning_rate": 0.005221770245883296, + "loss": 0.792, + "num_input_tokens_seen": 58739992, + "step": 101170 + }, + { + "epoch": 15.069258266309205, + "grad_norm": 0.028076171875, + "learning_rate": 0.005220291866872723, + "loss": 0.8125, + "num_input_tokens_seen": 58742840, + "step": 101175 + }, + { + "epoch": 15.070002978850164, + "grad_norm": 0.0517578125, + "learning_rate": 0.005218813653080375, + "loss": 0.7799, + "num_input_tokens_seen": 58747128, + "step": 101180 + }, + { + "epoch": 15.070747691391123, + "grad_norm": 0.046630859375, + "learning_rate": 0.0052173356045312315, + "loss": 0.7888, + "num_input_tokens_seen": 58750200, + "step": 101185 + }, + { + "epoch": 15.071492403932082, + "grad_norm": 0.04052734375, + "learning_rate": 0.0052158577212502635, + "loss": 0.7838, + "num_input_tokens_seen": 58753304, + "step": 101190 + }, + { + "epoch": 15.072237116473042, + "grad_norm": 0.0478515625, + "learning_rate": 0.005214380003262432, + "loss": 0.7949, + "num_input_tokens_seen": 58756344, + "step": 101195 + }, + { + "epoch": 15.072981829014001, + "grad_norm": 0.07421875, + "learning_rate": 0.005212902450592701, + "loss": 0.7865, + "num_input_tokens_seen": 58759032, + "step": 101200 + }, + { + "epoch": 15.07372654155496, + "grad_norm": 0.029296875, + "learning_rate": 0.005211425063266042, + "loss": 0.782, + "num_input_tokens_seen": 58762072, + "step": 101205 + }, + { + "epoch": 15.074471254095918, + "grad_norm": 0.06494140625, + "learning_rate": 0.005209947841307407, + "loss": 0.8028, + "num_input_tokens_seen": 58764472, + "step": 101210 + }, + { + "epoch": 15.075215966636879, + "grad_norm": 0.032958984375, + "learning_rate": 0.005208470784741748, + "loss": 0.8334, + "num_input_tokens_seen": 58767128, + "step": 101215 + }, + { + "epoch": 15.075960679177838, + "grad_norm": 0.07275390625, + "learning_rate": 0.005206993893594029, + "loss": 0.7951, + "num_input_tokens_seen": 58770040, + "step": 101220 + }, + { + "epoch": 15.076705391718797, + "grad_norm": 0.033935546875, + "learning_rate": 0.005205517167889188, + "loss": 0.7927, + "num_input_tokens_seen": 58772600, + "step": 101225 + }, + { + "epoch": 15.077450104259755, + "grad_norm": 0.047607421875, + "learning_rate": 0.005204040607652187, + "loss": 0.8248, + "num_input_tokens_seen": 58775608, + "step": 101230 + }, + { + "epoch": 15.078194816800714, + "grad_norm": 0.05712890625, + "learning_rate": 0.0052025642129079576, + "loss": 0.764, + "num_input_tokens_seen": 58778456, + "step": 101235 + }, + { + "epoch": 15.078939529341675, + "grad_norm": 0.04931640625, + "learning_rate": 0.005201087983681454, + "loss": 0.7794, + "num_input_tokens_seen": 58781080, + "step": 101240 + }, + { + "epoch": 15.079684241882633, + "grad_norm": 0.04296875, + "learning_rate": 0.00519961191999761, + "loss": 0.809, + "num_input_tokens_seen": 58784120, + "step": 101245 + }, + { + "epoch": 15.080428954423592, + "grad_norm": 0.042236328125, + "learning_rate": 0.005198136021881355, + "loss": 0.7881, + "num_input_tokens_seen": 58787032, + "step": 101250 + }, + { + "epoch": 15.08117366696455, + "grad_norm": 0.04541015625, + "learning_rate": 0.005196660289357637, + "loss": 0.7999, + "num_input_tokens_seen": 58789880, + "step": 101255 + }, + { + "epoch": 15.081918379505511, + "grad_norm": 0.035400390625, + "learning_rate": 0.005195184722451379, + "loss": 0.7944, + "num_input_tokens_seen": 58793016, + "step": 101260 + }, + { + "epoch": 15.08266309204647, + "grad_norm": 0.038330078125, + "learning_rate": 0.005193709321187509, + "loss": 0.8031, + "num_input_tokens_seen": 58796024, + "step": 101265 + }, + { + "epoch": 15.083407804587429, + "grad_norm": 0.033447265625, + "learning_rate": 0.005192234085590949, + "loss": 0.7824, + "num_input_tokens_seen": 58798840, + "step": 101270 + }, + { + "epoch": 15.084152517128388, + "grad_norm": 0.054443359375, + "learning_rate": 0.005190759015686631, + "loss": 0.8004, + "num_input_tokens_seen": 58801592, + "step": 101275 + }, + { + "epoch": 15.084897229669348, + "grad_norm": 0.080078125, + "learning_rate": 0.005189284111499465, + "loss": 0.8134, + "num_input_tokens_seen": 58804440, + "step": 101280 + }, + { + "epoch": 15.085641942210307, + "grad_norm": 0.045654296875, + "learning_rate": 0.005187809373054378, + "loss": 0.7725, + "num_input_tokens_seen": 58807544, + "step": 101285 + }, + { + "epoch": 15.086386654751266, + "grad_norm": 0.043212890625, + "learning_rate": 0.005186334800376277, + "loss": 0.7897, + "num_input_tokens_seen": 58810424, + "step": 101290 + }, + { + "epoch": 15.087131367292224, + "grad_norm": 0.031005859375, + "learning_rate": 0.0051848603934900725, + "loss": 0.801, + "num_input_tokens_seen": 58813496, + "step": 101295 + }, + { + "epoch": 15.087876079833185, + "grad_norm": 0.0322265625, + "learning_rate": 0.005183386152420681, + "loss": 0.7964, + "num_input_tokens_seen": 58816536, + "step": 101300 + }, + { + "epoch": 15.088620792374144, + "grad_norm": 0.056396484375, + "learning_rate": 0.005181912077192997, + "loss": 0.7955, + "num_input_tokens_seen": 58819672, + "step": 101305 + }, + { + "epoch": 15.089365504915103, + "grad_norm": 0.0498046875, + "learning_rate": 0.005180438167831937, + "loss": 0.7855, + "num_input_tokens_seen": 58822904, + "step": 101310 + }, + { + "epoch": 15.090110217456061, + "grad_norm": 0.038818359375, + "learning_rate": 0.005178964424362392, + "loss": 0.7955, + "num_input_tokens_seen": 58825688, + "step": 101315 + }, + { + "epoch": 15.090854929997022, + "grad_norm": 0.09814453125, + "learning_rate": 0.005177490846809262, + "loss": 0.8023, + "num_input_tokens_seen": 58828568, + "step": 101320 + }, + { + "epoch": 15.09159964253798, + "grad_norm": 0.046630859375, + "learning_rate": 0.005176017435197435, + "loss": 0.8142, + "num_input_tokens_seen": 58831448, + "step": 101325 + }, + { + "epoch": 15.09234435507894, + "grad_norm": 0.0419921875, + "learning_rate": 0.005174544189551816, + "loss": 0.779, + "num_input_tokens_seen": 58834136, + "step": 101330 + }, + { + "epoch": 15.093089067619898, + "grad_norm": 0.039306640625, + "learning_rate": 0.0051730711098972836, + "loss": 0.7855, + "num_input_tokens_seen": 58837336, + "step": 101335 + }, + { + "epoch": 15.093833780160859, + "grad_norm": 0.061279296875, + "learning_rate": 0.005171598196258724, + "loss": 0.7959, + "num_input_tokens_seen": 58840376, + "step": 101340 + }, + { + "epoch": 15.094578492701817, + "grad_norm": 0.04443359375, + "learning_rate": 0.005170125448661029, + "loss": 0.7884, + "num_input_tokens_seen": 58843608, + "step": 101345 + }, + { + "epoch": 15.095323205242776, + "grad_norm": 0.05859375, + "learning_rate": 0.005168652867129068, + "loss": 0.7961, + "num_input_tokens_seen": 58846488, + "step": 101350 + }, + { + "epoch": 15.096067917783735, + "grad_norm": 0.0390625, + "learning_rate": 0.005167180451687731, + "loss": 0.8193, + "num_input_tokens_seen": 58849624, + "step": 101355 + }, + { + "epoch": 15.096812630324695, + "grad_norm": 0.034423828125, + "learning_rate": 0.005165708202361881, + "loss": 0.8067, + "num_input_tokens_seen": 58852696, + "step": 101360 + }, + { + "epoch": 15.097557342865654, + "grad_norm": 0.0517578125, + "learning_rate": 0.005164236119176401, + "loss": 0.8154, + "num_input_tokens_seen": 58855288, + "step": 101365 + }, + { + "epoch": 15.098302055406613, + "grad_norm": 0.036865234375, + "learning_rate": 0.005162764202156155, + "loss": 0.8044, + "num_input_tokens_seen": 58858008, + "step": 101370 + }, + { + "epoch": 15.099046767947572, + "grad_norm": 0.057861328125, + "learning_rate": 0.005161292451326004, + "loss": 0.7881, + "num_input_tokens_seen": 58860696, + "step": 101375 + }, + { + "epoch": 15.099791480488532, + "grad_norm": 0.040771484375, + "learning_rate": 0.005159820866710824, + "loss": 0.7967, + "num_input_tokens_seen": 58863448, + "step": 101380 + }, + { + "epoch": 15.100536193029491, + "grad_norm": 0.0361328125, + "learning_rate": 0.005158349448335468, + "loss": 0.7892, + "num_input_tokens_seen": 58866104, + "step": 101385 + }, + { + "epoch": 15.10128090557045, + "grad_norm": 0.0458984375, + "learning_rate": 0.005156878196224795, + "loss": 0.7826, + "num_input_tokens_seen": 58868952, + "step": 101390 + }, + { + "epoch": 15.102025618111409, + "grad_norm": 0.03515625, + "learning_rate": 0.005155407110403657, + "loss": 0.8004, + "num_input_tokens_seen": 58871800, + "step": 101395 + }, + { + "epoch": 15.102770330652369, + "grad_norm": 0.05859375, + "learning_rate": 0.005153936190896915, + "loss": 0.8031, + "num_input_tokens_seen": 58874488, + "step": 101400 + }, + { + "epoch": 15.103515043193328, + "grad_norm": 0.037841796875, + "learning_rate": 0.0051524654377294084, + "loss": 0.7992, + "num_input_tokens_seen": 58877528, + "step": 101405 + }, + { + "epoch": 15.104259755734287, + "grad_norm": 0.037841796875, + "learning_rate": 0.0051509948509259945, + "loss": 0.7856, + "num_input_tokens_seen": 58880632, + "step": 101410 + }, + { + "epoch": 15.105004468275245, + "grad_norm": 0.04345703125, + "learning_rate": 0.005149524430511514, + "loss": 0.7942, + "num_input_tokens_seen": 58883480, + "step": 101415 + }, + { + "epoch": 15.105749180816204, + "grad_norm": 0.05322265625, + "learning_rate": 0.0051480541765108, + "loss": 0.7968, + "num_input_tokens_seen": 58886392, + "step": 101420 + }, + { + "epoch": 15.106493893357165, + "grad_norm": 0.048095703125, + "learning_rate": 0.0051465840889487035, + "loss": 0.7812, + "num_input_tokens_seen": 58889272, + "step": 101425 + }, + { + "epoch": 15.107238605898123, + "grad_norm": 0.037109375, + "learning_rate": 0.00514511416785005, + "loss": 0.8003, + "num_input_tokens_seen": 58892216, + "step": 101430 + }, + { + "epoch": 15.107983318439082, + "grad_norm": 0.055419921875, + "learning_rate": 0.00514364441323968, + "loss": 0.8139, + "num_input_tokens_seen": 58895096, + "step": 101435 + }, + { + "epoch": 15.108728030980041, + "grad_norm": 0.05029296875, + "learning_rate": 0.0051421748251424156, + "loss": 0.7784, + "num_input_tokens_seen": 58897880, + "step": 101440 + }, + { + "epoch": 15.109472743521001, + "grad_norm": 0.07275390625, + "learning_rate": 0.005140705403583094, + "loss": 0.8077, + "num_input_tokens_seen": 58900440, + "step": 101445 + }, + { + "epoch": 15.11021745606196, + "grad_norm": 0.044921875, + "learning_rate": 0.005139236148586534, + "loss": 0.7804, + "num_input_tokens_seen": 58903352, + "step": 101450 + }, + { + "epoch": 15.110962168602919, + "grad_norm": 0.05078125, + "learning_rate": 0.005137767060177556, + "loss": 0.8202, + "num_input_tokens_seen": 58906360, + "step": 101455 + }, + { + "epoch": 15.111706881143878, + "grad_norm": 0.038818359375, + "learning_rate": 0.005136298138380982, + "loss": 0.7972, + "num_input_tokens_seen": 58909272, + "step": 101460 + }, + { + "epoch": 15.112451593684838, + "grad_norm": 0.0400390625, + "learning_rate": 0.005134829383221618, + "loss": 0.7721, + "num_input_tokens_seen": 58911896, + "step": 101465 + }, + { + "epoch": 15.113196306225797, + "grad_norm": 0.06982421875, + "learning_rate": 0.0051333607947242925, + "loss": 0.7955, + "num_input_tokens_seen": 58914680, + "step": 101470 + }, + { + "epoch": 15.113941018766756, + "grad_norm": 0.181640625, + "learning_rate": 0.005131892372913802, + "loss": 0.8567, + "num_input_tokens_seen": 58917688, + "step": 101475 + }, + { + "epoch": 15.114685731307715, + "grad_norm": 0.05810546875, + "learning_rate": 0.005130424117814967, + "loss": 0.7983, + "num_input_tokens_seen": 58920792, + "step": 101480 + }, + { + "epoch": 15.115430443848675, + "grad_norm": 0.06787109375, + "learning_rate": 0.00512895602945258, + "loss": 0.8217, + "num_input_tokens_seen": 58923576, + "step": 101485 + }, + { + "epoch": 15.116175156389634, + "grad_norm": 0.038330078125, + "learning_rate": 0.005127488107851454, + "loss": 0.785, + "num_input_tokens_seen": 58926584, + "step": 101490 + }, + { + "epoch": 15.116919868930593, + "grad_norm": 0.083984375, + "learning_rate": 0.005126020353036381, + "loss": 0.787, + "num_input_tokens_seen": 58929432, + "step": 101495 + }, + { + "epoch": 15.117664581471551, + "grad_norm": 0.055908203125, + "learning_rate": 0.005124552765032153, + "loss": 0.7904, + "num_input_tokens_seen": 58932088, + "step": 101500 + }, + { + "epoch": 15.118409294012512, + "grad_norm": 0.034423828125, + "learning_rate": 0.005123085343863576, + "loss": 0.8045, + "num_input_tokens_seen": 58935224, + "step": 101505 + }, + { + "epoch": 15.11915400655347, + "grad_norm": 0.0625, + "learning_rate": 0.005121618089555428, + "loss": 0.8085, + "num_input_tokens_seen": 58938296, + "step": 101510 + }, + { + "epoch": 15.11989871909443, + "grad_norm": 0.048583984375, + "learning_rate": 0.005120151002132507, + "loss": 0.8114, + "num_input_tokens_seen": 58941336, + "step": 101515 + }, + { + "epoch": 15.120643431635388, + "grad_norm": 0.047119140625, + "learning_rate": 0.005118684081619595, + "loss": 0.8066, + "num_input_tokens_seen": 58944216, + "step": 101520 + }, + { + "epoch": 15.121388144176349, + "grad_norm": 0.039306640625, + "learning_rate": 0.00511721732804147, + "loss": 0.7839, + "num_input_tokens_seen": 58947000, + "step": 101525 + }, + { + "epoch": 15.122132856717307, + "grad_norm": 0.040283203125, + "learning_rate": 0.005115750741422916, + "loss": 0.7975, + "num_input_tokens_seen": 58949496, + "step": 101530 + }, + { + "epoch": 15.122877569258266, + "grad_norm": 0.05615234375, + "learning_rate": 0.005114284321788701, + "loss": 0.7936, + "num_input_tokens_seen": 58952376, + "step": 101535 + }, + { + "epoch": 15.123622281799225, + "grad_norm": 0.044921875, + "learning_rate": 0.0051128180691636105, + "loss": 0.7909, + "num_input_tokens_seen": 58955480, + "step": 101540 + }, + { + "epoch": 15.124366994340185, + "grad_norm": 0.05712890625, + "learning_rate": 0.005111351983572405, + "loss": 0.7797, + "num_input_tokens_seen": 58958552, + "step": 101545 + }, + { + "epoch": 15.125111706881144, + "grad_norm": 0.0390625, + "learning_rate": 0.005109886065039862, + "loss": 0.8039, + "num_input_tokens_seen": 58961752, + "step": 101550 + }, + { + "epoch": 15.125856419422103, + "grad_norm": 0.06591796875, + "learning_rate": 0.005108420313590737, + "loss": 0.7867, + "num_input_tokens_seen": 58964632, + "step": 101555 + }, + { + "epoch": 15.126601131963062, + "grad_norm": 0.037353515625, + "learning_rate": 0.005106954729249803, + "loss": 0.8033, + "num_input_tokens_seen": 58967832, + "step": 101560 + }, + { + "epoch": 15.127345844504022, + "grad_norm": 0.055419921875, + "learning_rate": 0.005105489312041809, + "loss": 0.7843, + "num_input_tokens_seen": 58970776, + "step": 101565 + }, + { + "epoch": 15.128090557044981, + "grad_norm": 0.060546875, + "learning_rate": 0.005104024061991522, + "loss": 0.7991, + "num_input_tokens_seen": 58974008, + "step": 101570 + }, + { + "epoch": 15.12883526958594, + "grad_norm": 0.05224609375, + "learning_rate": 0.005102558979123689, + "loss": 0.7928, + "num_input_tokens_seen": 58976632, + "step": 101575 + }, + { + "epoch": 15.129579982126899, + "grad_norm": 0.0595703125, + "learning_rate": 0.00510109406346306, + "loss": 0.7732, + "num_input_tokens_seen": 58979352, + "step": 101580 + }, + { + "epoch": 15.130324694667857, + "grad_norm": 0.04052734375, + "learning_rate": 0.005099629315034388, + "loss": 0.7878, + "num_input_tokens_seen": 58982264, + "step": 101585 + }, + { + "epoch": 15.131069407208818, + "grad_norm": 0.040771484375, + "learning_rate": 0.005098164733862419, + "loss": 0.8385, + "num_input_tokens_seen": 58985112, + "step": 101590 + }, + { + "epoch": 15.131814119749777, + "grad_norm": 0.0279541015625, + "learning_rate": 0.005096700319971892, + "loss": 0.7794, + "num_input_tokens_seen": 58988024, + "step": 101595 + }, + { + "epoch": 15.132558832290735, + "grad_norm": 0.057861328125, + "learning_rate": 0.005095236073387543, + "loss": 0.7907, + "num_input_tokens_seen": 58991192, + "step": 101600 + }, + { + "epoch": 15.133303544831694, + "grad_norm": 0.1025390625, + "learning_rate": 0.005093771994134119, + "loss": 0.7746, + "num_input_tokens_seen": 58994040, + "step": 101605 + }, + { + "epoch": 15.134048257372655, + "grad_norm": 0.0478515625, + "learning_rate": 0.005092308082236349, + "loss": 0.7769, + "num_input_tokens_seen": 58997176, + "step": 101610 + }, + { + "epoch": 15.134792969913613, + "grad_norm": 0.060302734375, + "learning_rate": 0.0050908443377189585, + "loss": 0.7828, + "num_input_tokens_seen": 58999736, + "step": 101615 + }, + { + "epoch": 15.135537682454572, + "grad_norm": 0.05078125, + "learning_rate": 0.005089380760606688, + "loss": 0.8026, + "num_input_tokens_seen": 59002776, + "step": 101620 + }, + { + "epoch": 15.136282394995531, + "grad_norm": 0.09716796875, + "learning_rate": 0.0050879173509242506, + "loss": 0.788, + "num_input_tokens_seen": 59005656, + "step": 101625 + }, + { + "epoch": 15.137027107536491, + "grad_norm": 0.10791015625, + "learning_rate": 0.005086454108696381, + "loss": 0.7929, + "num_input_tokens_seen": 59008728, + "step": 101630 + }, + { + "epoch": 15.13777182007745, + "grad_norm": 0.04736328125, + "learning_rate": 0.005084991033947789, + "loss": 0.8391, + "num_input_tokens_seen": 59011704, + "step": 101635 + }, + { + "epoch": 15.138516532618409, + "grad_norm": 0.04541015625, + "learning_rate": 0.005083528126703201, + "loss": 0.8311, + "num_input_tokens_seen": 59014616, + "step": 101640 + }, + { + "epoch": 15.139261245159368, + "grad_norm": 0.049072265625, + "learning_rate": 0.005082065386987326, + "loss": 0.7982, + "num_input_tokens_seen": 59017368, + "step": 101645 + }, + { + "epoch": 15.140005957700328, + "grad_norm": 0.05810546875, + "learning_rate": 0.005080602814824875, + "loss": 0.8003, + "num_input_tokens_seen": 59020120, + "step": 101650 + }, + { + "epoch": 15.140750670241287, + "grad_norm": 0.07763671875, + "learning_rate": 0.005079140410240556, + "loss": 0.8139, + "num_input_tokens_seen": 59023064, + "step": 101655 + }, + { + "epoch": 15.141495382782246, + "grad_norm": 0.06640625, + "learning_rate": 0.00507767817325908, + "loss": 0.7768, + "num_input_tokens_seen": 59025944, + "step": 101660 + }, + { + "epoch": 15.142240095323205, + "grad_norm": 0.049072265625, + "learning_rate": 0.005076216103905146, + "loss": 0.8157, + "num_input_tokens_seen": 59028696, + "step": 101665 + }, + { + "epoch": 15.142984807864165, + "grad_norm": 0.059326171875, + "learning_rate": 0.005074754202203451, + "loss": 0.8001, + "num_input_tokens_seen": 59031704, + "step": 101670 + }, + { + "epoch": 15.143729520405124, + "grad_norm": 0.05224609375, + "learning_rate": 0.005073292468178701, + "loss": 0.7925, + "num_input_tokens_seen": 59034520, + "step": 101675 + }, + { + "epoch": 15.144474232946083, + "grad_norm": 0.048095703125, + "learning_rate": 0.005071830901855579, + "loss": 0.7997, + "num_input_tokens_seen": 59037368, + "step": 101680 + }, + { + "epoch": 15.145218945487041, + "grad_norm": 0.041015625, + "learning_rate": 0.005070369503258791, + "loss": 0.7999, + "num_input_tokens_seen": 59039960, + "step": 101685 + }, + { + "epoch": 15.145963658028002, + "grad_norm": 0.03662109375, + "learning_rate": 0.005068908272413017, + "loss": 0.7972, + "num_input_tokens_seen": 59042808, + "step": 101690 + }, + { + "epoch": 15.14670837056896, + "grad_norm": 0.0517578125, + "learning_rate": 0.005067447209342939, + "loss": 0.7843, + "num_input_tokens_seen": 59045592, + "step": 101695 + }, + { + "epoch": 15.14745308310992, + "grad_norm": 0.0439453125, + "learning_rate": 0.00506598631407325, + "loss": 0.8024, + "num_input_tokens_seen": 59048216, + "step": 101700 + }, + { + "epoch": 15.148197795650878, + "grad_norm": 0.05078125, + "learning_rate": 0.005064525586628621, + "loss": 0.8024, + "num_input_tokens_seen": 59050936, + "step": 101705 + }, + { + "epoch": 15.148942508191839, + "grad_norm": 0.0693359375, + "learning_rate": 0.005063065027033739, + "loss": 0.7962, + "num_input_tokens_seen": 59053784, + "step": 101710 + }, + { + "epoch": 15.149687220732797, + "grad_norm": 0.049072265625, + "learning_rate": 0.005061604635313276, + "loss": 0.7935, + "num_input_tokens_seen": 59056568, + "step": 101715 + }, + { + "epoch": 15.150431933273756, + "grad_norm": 0.028564453125, + "learning_rate": 0.0050601444114919, + "loss": 0.7915, + "num_input_tokens_seen": 59059576, + "step": 101720 + }, + { + "epoch": 15.151176645814715, + "grad_norm": 0.064453125, + "learning_rate": 0.005058684355594278, + "loss": 0.7966, + "num_input_tokens_seen": 59062296, + "step": 101725 + }, + { + "epoch": 15.151921358355676, + "grad_norm": 0.050537109375, + "learning_rate": 0.005057224467645082, + "loss": 0.791, + "num_input_tokens_seen": 59065304, + "step": 101730 + }, + { + "epoch": 15.152666070896634, + "grad_norm": 0.037109375, + "learning_rate": 0.005055764747668975, + "loss": 0.8071, + "num_input_tokens_seen": 59068184, + "step": 101735 + }, + { + "epoch": 15.153410783437593, + "grad_norm": 0.0546875, + "learning_rate": 0.005054305195690612, + "loss": 0.8094, + "num_input_tokens_seen": 59070840, + "step": 101740 + }, + { + "epoch": 15.154155495978552, + "grad_norm": 0.053466796875, + "learning_rate": 0.005052845811734657, + "loss": 0.7936, + "num_input_tokens_seen": 59073624, + "step": 101745 + }, + { + "epoch": 15.15490020851951, + "grad_norm": 0.0625, + "learning_rate": 0.00505138659582576, + "loss": 0.7777, + "num_input_tokens_seen": 59076664, + "step": 101750 + }, + { + "epoch": 15.155644921060471, + "grad_norm": 0.03857421875, + "learning_rate": 0.005049927547988578, + "loss": 0.7986, + "num_input_tokens_seen": 59079416, + "step": 101755 + }, + { + "epoch": 15.15638963360143, + "grad_norm": 0.03857421875, + "learning_rate": 0.005048468668247753, + "loss": 0.7955, + "num_input_tokens_seen": 59082296, + "step": 101760 + }, + { + "epoch": 15.157134346142389, + "grad_norm": 0.040283203125, + "learning_rate": 0.005047009956627941, + "loss": 0.7854, + "num_input_tokens_seen": 59085272, + "step": 101765 + }, + { + "epoch": 15.157879058683347, + "grad_norm": 0.0654296875, + "learning_rate": 0.005045551413153778, + "loss": 0.8013, + "num_input_tokens_seen": 59088216, + "step": 101770 + }, + { + "epoch": 15.158623771224308, + "grad_norm": 0.0615234375, + "learning_rate": 0.005044093037849903, + "loss": 0.7873, + "num_input_tokens_seen": 59091000, + "step": 101775 + }, + { + "epoch": 15.159368483765267, + "grad_norm": 0.037353515625, + "learning_rate": 0.005042634830740963, + "loss": 0.7936, + "num_input_tokens_seen": 59093880, + "step": 101780 + }, + { + "epoch": 15.160113196306225, + "grad_norm": 0.046630859375, + "learning_rate": 0.005041176791851587, + "loss": 0.7893, + "num_input_tokens_seen": 59096984, + "step": 101785 + }, + { + "epoch": 15.160857908847184, + "grad_norm": 0.033935546875, + "learning_rate": 0.0050397189212064075, + "loss": 0.8013, + "num_input_tokens_seen": 59100184, + "step": 101790 + }, + { + "epoch": 15.161602621388145, + "grad_norm": 0.08056640625, + "learning_rate": 0.0050382612188300474, + "loss": 0.7729, + "num_input_tokens_seen": 59103224, + "step": 101795 + }, + { + "epoch": 15.162347333929103, + "grad_norm": 0.0498046875, + "learning_rate": 0.0050368036847471456, + "loss": 0.7951, + "num_input_tokens_seen": 59106424, + "step": 101800 + }, + { + "epoch": 15.163092046470062, + "grad_norm": 0.04638671875, + "learning_rate": 0.005035346318982315, + "loss": 0.771, + "num_input_tokens_seen": 59109080, + "step": 101805 + }, + { + "epoch": 15.163836759011021, + "grad_norm": 0.053466796875, + "learning_rate": 0.0050338891215601835, + "loss": 0.8312, + "num_input_tokens_seen": 59111672, + "step": 101810 + }, + { + "epoch": 15.164581471551982, + "grad_norm": 0.037109375, + "learning_rate": 0.005032432092505368, + "loss": 0.7882, + "num_input_tokens_seen": 59114456, + "step": 101815 + }, + { + "epoch": 15.16532618409294, + "grad_norm": 0.052490234375, + "learning_rate": 0.005030975231842475, + "loss": 0.7922, + "num_input_tokens_seen": 59117432, + "step": 101820 + }, + { + "epoch": 15.166070896633899, + "grad_norm": 0.1318359375, + "learning_rate": 0.00502951853959613, + "loss": 0.8338, + "num_input_tokens_seen": 59120184, + "step": 101825 + }, + { + "epoch": 15.166815609174858, + "grad_norm": 0.059326171875, + "learning_rate": 0.005028062015790932, + "loss": 0.808, + "num_input_tokens_seen": 59123256, + "step": 101830 + }, + { + "epoch": 15.167560321715818, + "grad_norm": 0.02587890625, + "learning_rate": 0.005026605660451494, + "loss": 0.8074, + "num_input_tokens_seen": 59126008, + "step": 101835 + }, + { + "epoch": 15.168305034256777, + "grad_norm": 0.047607421875, + "learning_rate": 0.005025149473602411, + "loss": 0.7747, + "num_input_tokens_seen": 59129368, + "step": 101840 + }, + { + "epoch": 15.169049746797736, + "grad_norm": 0.03173828125, + "learning_rate": 0.005023693455268297, + "loss": 0.8116, + "num_input_tokens_seen": 59132056, + "step": 101845 + }, + { + "epoch": 15.169794459338695, + "grad_norm": 0.03857421875, + "learning_rate": 0.005022237605473741, + "loss": 0.8159, + "num_input_tokens_seen": 59134872, + "step": 101850 + }, + { + "epoch": 15.170539171879655, + "grad_norm": 0.06005859375, + "learning_rate": 0.00502078192424334, + "loss": 0.8195, + "num_input_tokens_seen": 59137752, + "step": 101855 + }, + { + "epoch": 15.171283884420614, + "grad_norm": 0.039306640625, + "learning_rate": 0.005019326411601688, + "loss": 0.8005, + "num_input_tokens_seen": 59140696, + "step": 101860 + }, + { + "epoch": 15.172028596961573, + "grad_norm": 0.0390625, + "learning_rate": 0.005017871067573365, + "loss": 0.8025, + "num_input_tokens_seen": 59144312, + "step": 101865 + }, + { + "epoch": 15.172773309502531, + "grad_norm": 0.048583984375, + "learning_rate": 0.0050164158921829705, + "loss": 0.7992, + "num_input_tokens_seen": 59147512, + "step": 101870 + }, + { + "epoch": 15.173518022043492, + "grad_norm": 0.037109375, + "learning_rate": 0.005014960885455077, + "loss": 0.7817, + "num_input_tokens_seen": 59150360, + "step": 101875 + }, + { + "epoch": 15.17426273458445, + "grad_norm": 0.05810546875, + "learning_rate": 0.005013506047414278, + "loss": 0.8237, + "num_input_tokens_seen": 59153208, + "step": 101880 + }, + { + "epoch": 15.17500744712541, + "grad_norm": 0.0576171875, + "learning_rate": 0.005012051378085139, + "loss": 0.7885, + "num_input_tokens_seen": 59156088, + "step": 101885 + }, + { + "epoch": 15.175752159666368, + "grad_norm": 0.06689453125, + "learning_rate": 0.005010596877492244, + "loss": 0.7905, + "num_input_tokens_seen": 59159000, + "step": 101890 + }, + { + "epoch": 15.176496872207329, + "grad_norm": 0.05322265625, + "learning_rate": 0.005009142545660163, + "loss": 0.7979, + "num_input_tokens_seen": 59162072, + "step": 101895 + }, + { + "epoch": 15.177241584748288, + "grad_norm": 0.046875, + "learning_rate": 0.00500768838261346, + "loss": 0.8116, + "num_input_tokens_seen": 59164792, + "step": 101900 + }, + { + "epoch": 15.177986297289246, + "grad_norm": 0.03271484375, + "learning_rate": 0.005006234388376709, + "loss": 0.7946, + "num_input_tokens_seen": 59167640, + "step": 101905 + }, + { + "epoch": 15.178731009830205, + "grad_norm": 0.060546875, + "learning_rate": 0.005004780562974468, + "loss": 0.8042, + "num_input_tokens_seen": 59170648, + "step": 101910 + }, + { + "epoch": 15.179475722371166, + "grad_norm": 0.052001953125, + "learning_rate": 0.005003326906431305, + "loss": 0.7814, + "num_input_tokens_seen": 59173432, + "step": 101915 + }, + { + "epoch": 15.180220434912124, + "grad_norm": 0.047119140625, + "learning_rate": 0.005001873418771773, + "loss": 0.8269, + "num_input_tokens_seen": 59176120, + "step": 101920 + }, + { + "epoch": 15.180965147453083, + "grad_norm": 0.03857421875, + "learning_rate": 0.00500042010002043, + "loss": 0.8004, + "num_input_tokens_seen": 59178904, + "step": 101925 + }, + { + "epoch": 15.181709859994042, + "grad_norm": 0.061767578125, + "learning_rate": 0.004998966950201819, + "loss": 0.799, + "num_input_tokens_seen": 59181624, + "step": 101930 + }, + { + "epoch": 15.182454572535, + "grad_norm": 0.046630859375, + "learning_rate": 0.004997513969340503, + "loss": 0.8, + "num_input_tokens_seen": 59184504, + "step": 101935 + }, + { + "epoch": 15.183199285075961, + "grad_norm": 0.048095703125, + "learning_rate": 0.004996061157461021, + "loss": 0.7912, + "num_input_tokens_seen": 59187128, + "step": 101940 + }, + { + "epoch": 15.18394399761692, + "grad_norm": 0.032470703125, + "learning_rate": 0.004994608514587915, + "loss": 0.7957, + "num_input_tokens_seen": 59189944, + "step": 101945 + }, + { + "epoch": 15.184688710157879, + "grad_norm": 0.041015625, + "learning_rate": 0.004993156040745734, + "loss": 0.7848, + "num_input_tokens_seen": 59192952, + "step": 101950 + }, + { + "epoch": 15.185433422698837, + "grad_norm": 0.03173828125, + "learning_rate": 0.004991703735959005, + "loss": 0.7936, + "num_input_tokens_seen": 59195672, + "step": 101955 + }, + { + "epoch": 15.186178135239798, + "grad_norm": 0.04931640625, + "learning_rate": 0.004990251600252275, + "loss": 0.7939, + "num_input_tokens_seen": 59198616, + "step": 101960 + }, + { + "epoch": 15.186922847780757, + "grad_norm": 0.047119140625, + "learning_rate": 0.004988799633650067, + "loss": 0.7987, + "num_input_tokens_seen": 59201464, + "step": 101965 + }, + { + "epoch": 15.187667560321715, + "grad_norm": 0.033935546875, + "learning_rate": 0.004987347836176916, + "loss": 0.8161, + "num_input_tokens_seen": 59204792, + "step": 101970 + }, + { + "epoch": 15.188412272862674, + "grad_norm": 0.026611328125, + "learning_rate": 0.0049858962078573495, + "loss": 0.7841, + "num_input_tokens_seen": 59207864, + "step": 101975 + }, + { + "epoch": 15.189156985403635, + "grad_norm": 0.044189453125, + "learning_rate": 0.004984444748715887, + "loss": 0.7799, + "num_input_tokens_seen": 59210680, + "step": 101980 + }, + { + "epoch": 15.189901697944594, + "grad_norm": 0.076171875, + "learning_rate": 0.004982993458777049, + "loss": 0.8125, + "num_input_tokens_seen": 59213464, + "step": 101985 + }, + { + "epoch": 15.190646410485552, + "grad_norm": 0.04345703125, + "learning_rate": 0.004981542338065358, + "loss": 0.7956, + "num_input_tokens_seen": 59216376, + "step": 101990 + }, + { + "epoch": 15.191391123026511, + "grad_norm": 0.06591796875, + "learning_rate": 0.004980091386605329, + "loss": 0.8066, + "num_input_tokens_seen": 59219320, + "step": 101995 + }, + { + "epoch": 15.192135835567472, + "grad_norm": 0.095703125, + "learning_rate": 0.0049786406044214655, + "loss": 0.8153, + "num_input_tokens_seen": 59222072, + "step": 102000 + }, + { + "epoch": 15.19288054810843, + "grad_norm": 0.0380859375, + "learning_rate": 0.00497718999153829, + "loss": 0.7938, + "num_input_tokens_seen": 59225304, + "step": 102005 + }, + { + "epoch": 15.19362526064939, + "grad_norm": 0.06689453125, + "learning_rate": 0.004975739547980296, + "loss": 0.7716, + "num_input_tokens_seen": 59228120, + "step": 102010 + }, + { + "epoch": 15.194369973190348, + "grad_norm": 0.052490234375, + "learning_rate": 0.004974289273772001, + "loss": 0.7836, + "num_input_tokens_seen": 59230904, + "step": 102015 + }, + { + "epoch": 15.195114685731308, + "grad_norm": 0.053466796875, + "learning_rate": 0.004972839168937898, + "loss": 0.7939, + "num_input_tokens_seen": 59233752, + "step": 102020 + }, + { + "epoch": 15.195859398272267, + "grad_norm": 0.052734375, + "learning_rate": 0.004971389233502481, + "loss": 0.7984, + "num_input_tokens_seen": 59236888, + "step": 102025 + }, + { + "epoch": 15.196604110813226, + "grad_norm": 0.034423828125, + "learning_rate": 0.004969939467490255, + "loss": 0.7977, + "num_input_tokens_seen": 59239864, + "step": 102030 + }, + { + "epoch": 15.197348823354185, + "grad_norm": 0.040771484375, + "learning_rate": 0.004968489870925704, + "loss": 0.7899, + "num_input_tokens_seen": 59242648, + "step": 102035 + }, + { + "epoch": 15.198093535895145, + "grad_norm": 0.04150390625, + "learning_rate": 0.004967040443833327, + "loss": 0.7891, + "num_input_tokens_seen": 59245592, + "step": 102040 + }, + { + "epoch": 15.198838248436104, + "grad_norm": 0.029052734375, + "learning_rate": 0.004965591186237602, + "loss": 0.8115, + "num_input_tokens_seen": 59248344, + "step": 102045 + }, + { + "epoch": 15.199582960977063, + "grad_norm": 0.05712890625, + "learning_rate": 0.004964142098163015, + "loss": 0.7946, + "num_input_tokens_seen": 59251224, + "step": 102050 + }, + { + "epoch": 15.200327673518021, + "grad_norm": 0.05517578125, + "learning_rate": 0.004962693179634049, + "loss": 0.8123, + "num_input_tokens_seen": 59253880, + "step": 102055 + }, + { + "epoch": 15.201072386058982, + "grad_norm": 0.037353515625, + "learning_rate": 0.004961244430675175, + "loss": 0.7914, + "num_input_tokens_seen": 59256664, + "step": 102060 + }, + { + "epoch": 15.20181709859994, + "grad_norm": 0.053955078125, + "learning_rate": 0.00495979585131088, + "loss": 0.7983, + "num_input_tokens_seen": 59259384, + "step": 102065 + }, + { + "epoch": 15.2025618111409, + "grad_norm": 0.056396484375, + "learning_rate": 0.004958347441565621, + "loss": 0.785, + "num_input_tokens_seen": 59262392, + "step": 102070 + }, + { + "epoch": 15.203306523681858, + "grad_norm": 0.051513671875, + "learning_rate": 0.004956899201463884, + "loss": 0.8092, + "num_input_tokens_seen": 59265432, + "step": 102075 + }, + { + "epoch": 15.204051236222819, + "grad_norm": 0.0294189453125, + "learning_rate": 0.004955451131030121, + "loss": 0.7882, + "num_input_tokens_seen": 59268184, + "step": 102080 + }, + { + "epoch": 15.204795948763778, + "grad_norm": 0.0291748046875, + "learning_rate": 0.004954003230288808, + "loss": 0.8007, + "num_input_tokens_seen": 59271000, + "step": 102085 + }, + { + "epoch": 15.205540661304736, + "grad_norm": 0.055419921875, + "learning_rate": 0.004952555499264396, + "loss": 0.8053, + "num_input_tokens_seen": 59273720, + "step": 102090 + }, + { + "epoch": 15.206285373845695, + "grad_norm": 0.027099609375, + "learning_rate": 0.004951107937981352, + "loss": 0.7888, + "num_input_tokens_seen": 59276856, + "step": 102095 + }, + { + "epoch": 15.207030086386654, + "grad_norm": 0.061279296875, + "learning_rate": 0.004949660546464124, + "loss": 0.8187, + "num_input_tokens_seen": 59279736, + "step": 102100 + }, + { + "epoch": 15.207774798927614, + "grad_norm": 0.06591796875, + "learning_rate": 0.004948213324737162, + "loss": 0.7743, + "num_input_tokens_seen": 59282648, + "step": 102105 + }, + { + "epoch": 15.208519511468573, + "grad_norm": 0.0634765625, + "learning_rate": 0.004946766272824927, + "loss": 0.7869, + "num_input_tokens_seen": 59285592, + "step": 102110 + }, + { + "epoch": 15.209264224009532, + "grad_norm": 0.05224609375, + "learning_rate": 0.004945319390751856, + "loss": 0.7776, + "num_input_tokens_seen": 59288664, + "step": 102115 + }, + { + "epoch": 15.21000893655049, + "grad_norm": 0.056396484375, + "learning_rate": 0.004943872678542396, + "loss": 0.8144, + "num_input_tokens_seen": 59291512, + "step": 102120 + }, + { + "epoch": 15.210753649091451, + "grad_norm": 0.035400390625, + "learning_rate": 0.0049424261362209804, + "loss": 0.7883, + "num_input_tokens_seen": 59294232, + "step": 102125 + }, + { + "epoch": 15.21149836163241, + "grad_norm": 0.057373046875, + "learning_rate": 0.0049409797638120595, + "loss": 0.8098, + "num_input_tokens_seen": 59297208, + "step": 102130 + }, + { + "epoch": 15.212243074173369, + "grad_norm": 0.0361328125, + "learning_rate": 0.004939533561340061, + "loss": 0.7899, + "num_input_tokens_seen": 59300024, + "step": 102135 + }, + { + "epoch": 15.212987786714327, + "grad_norm": 0.048095703125, + "learning_rate": 0.004938087528829413, + "loss": 0.8018, + "num_input_tokens_seen": 59302712, + "step": 102140 + }, + { + "epoch": 15.213732499255288, + "grad_norm": 0.033935546875, + "learning_rate": 0.004936641666304554, + "loss": 0.7833, + "num_input_tokens_seen": 59305496, + "step": 102145 + }, + { + "epoch": 15.214477211796247, + "grad_norm": 0.047607421875, + "learning_rate": 0.004935195973789904, + "loss": 0.7883, + "num_input_tokens_seen": 59308472, + "step": 102150 + }, + { + "epoch": 15.215221924337206, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00493375045130989, + "loss": 0.7976, + "num_input_tokens_seen": 59311544, + "step": 102155 + }, + { + "epoch": 15.215966636878164, + "grad_norm": 0.0498046875, + "learning_rate": 0.004932305098888928, + "loss": 0.8137, + "num_input_tokens_seen": 59314648, + "step": 102160 + }, + { + "epoch": 15.216711349419125, + "grad_norm": 0.0634765625, + "learning_rate": 0.004930859916551443, + "loss": 0.808, + "num_input_tokens_seen": 59317176, + "step": 102165 + }, + { + "epoch": 15.217456061960084, + "grad_norm": 0.07666015625, + "learning_rate": 0.004929414904321842, + "loss": 0.7784, + "num_input_tokens_seen": 59320248, + "step": 102170 + }, + { + "epoch": 15.218200774501042, + "grad_norm": 0.053466796875, + "learning_rate": 0.004927970062224546, + "loss": 0.8104, + "num_input_tokens_seen": 59323096, + "step": 102175 + }, + { + "epoch": 15.218945487042001, + "grad_norm": 0.083984375, + "learning_rate": 0.004926525390283959, + "loss": 0.7986, + "num_input_tokens_seen": 59326232, + "step": 102180 + }, + { + "epoch": 15.219690199582962, + "grad_norm": 0.05517578125, + "learning_rate": 0.004925080888524488, + "loss": 0.7992, + "num_input_tokens_seen": 59329240, + "step": 102185 + }, + { + "epoch": 15.22043491212392, + "grad_norm": 0.04931640625, + "learning_rate": 0.004923636556970533, + "loss": 0.8026, + "num_input_tokens_seen": 59332056, + "step": 102190 + }, + { + "epoch": 15.22117962466488, + "grad_norm": 0.04248046875, + "learning_rate": 0.004922192395646495, + "loss": 0.797, + "num_input_tokens_seen": 59335096, + "step": 102195 + }, + { + "epoch": 15.221924337205838, + "grad_norm": 0.09130859375, + "learning_rate": 0.004920748404576777, + "loss": 0.8064, + "num_input_tokens_seen": 59337976, + "step": 102200 + }, + { + "epoch": 15.222669049746798, + "grad_norm": 0.0478515625, + "learning_rate": 0.004919304583785767, + "loss": 0.7931, + "num_input_tokens_seen": 59340696, + "step": 102205 + }, + { + "epoch": 15.223413762287757, + "grad_norm": 0.06787109375, + "learning_rate": 0.004917860933297864, + "loss": 0.8238, + "num_input_tokens_seen": 59343416, + "step": 102210 + }, + { + "epoch": 15.224158474828716, + "grad_norm": 0.0849609375, + "learning_rate": 0.004916417453137454, + "loss": 0.7926, + "num_input_tokens_seen": 59346168, + "step": 102215 + }, + { + "epoch": 15.224903187369675, + "grad_norm": 0.0341796875, + "learning_rate": 0.004914974143328919, + "loss": 0.795, + "num_input_tokens_seen": 59348696, + "step": 102220 + }, + { + "epoch": 15.225647899910635, + "grad_norm": 0.03271484375, + "learning_rate": 0.00491353100389665, + "loss": 0.8008, + "num_input_tokens_seen": 59351736, + "step": 102225 + }, + { + "epoch": 15.226392612451594, + "grad_norm": 0.059814453125, + "learning_rate": 0.004912088034865018, + "loss": 0.7868, + "num_input_tokens_seen": 59354680, + "step": 102230 + }, + { + "epoch": 15.227137324992553, + "grad_norm": 0.042724609375, + "learning_rate": 0.004910645236258412, + "loss": 0.7942, + "num_input_tokens_seen": 59357720, + "step": 102235 + }, + { + "epoch": 15.227882037533512, + "grad_norm": 0.047119140625, + "learning_rate": 0.004909202608101193, + "loss": 0.7922, + "num_input_tokens_seen": 59360376, + "step": 102240 + }, + { + "epoch": 15.228626750074472, + "grad_norm": 0.039794921875, + "learning_rate": 0.0049077601504177466, + "loss": 0.8074, + "num_input_tokens_seen": 59363256, + "step": 102245 + }, + { + "epoch": 15.22937146261543, + "grad_norm": 0.05322265625, + "learning_rate": 0.004906317863232435, + "loss": 0.8027, + "num_input_tokens_seen": 59366200, + "step": 102250 + }, + { + "epoch": 15.23011617515639, + "grad_norm": 0.0419921875, + "learning_rate": 0.004904875746569624, + "loss": 0.7957, + "num_input_tokens_seen": 59368824, + "step": 102255 + }, + { + "epoch": 15.230860887697348, + "grad_norm": 0.07421875, + "learning_rate": 0.004903433800453678, + "loss": 0.805, + "num_input_tokens_seen": 59371704, + "step": 102260 + }, + { + "epoch": 15.231605600238307, + "grad_norm": 0.0576171875, + "learning_rate": 0.00490199202490895, + "loss": 0.7969, + "num_input_tokens_seen": 59374616, + "step": 102265 + }, + { + "epoch": 15.232350312779268, + "grad_norm": 0.041015625, + "learning_rate": 0.004900550419959808, + "loss": 0.8139, + "num_input_tokens_seen": 59377432, + "step": 102270 + }, + { + "epoch": 15.233095025320226, + "grad_norm": 0.040771484375, + "learning_rate": 0.0048991089856306, + "loss": 0.7956, + "num_input_tokens_seen": 59380312, + "step": 102275 + }, + { + "epoch": 15.233839737861185, + "grad_norm": 0.06884765625, + "learning_rate": 0.004897667721945682, + "loss": 0.8298, + "num_input_tokens_seen": 59383448, + "step": 102280 + }, + { + "epoch": 15.234584450402144, + "grad_norm": 0.04248046875, + "learning_rate": 0.0048962266289293965, + "loss": 0.7989, + "num_input_tokens_seen": 59386360, + "step": 102285 + }, + { + "epoch": 15.235329162943104, + "grad_norm": 0.05810546875, + "learning_rate": 0.004894785706606097, + "loss": 0.7936, + "num_input_tokens_seen": 59389208, + "step": 102290 + }, + { + "epoch": 15.236073875484063, + "grad_norm": 0.045166015625, + "learning_rate": 0.004893344955000123, + "loss": 0.8033, + "num_input_tokens_seen": 59392184, + "step": 102295 + }, + { + "epoch": 15.236818588025022, + "grad_norm": 0.0400390625, + "learning_rate": 0.00489190437413581, + "loss": 0.7913, + "num_input_tokens_seen": 59395352, + "step": 102300 + }, + { + "epoch": 15.23756330056598, + "grad_norm": 0.0390625, + "learning_rate": 0.004890463964037504, + "loss": 0.8022, + "num_input_tokens_seen": 59398232, + "step": 102305 + }, + { + "epoch": 15.238308013106941, + "grad_norm": 0.07177734375, + "learning_rate": 0.0048890237247295335, + "loss": 0.7979, + "num_input_tokens_seen": 59401016, + "step": 102310 + }, + { + "epoch": 15.2390527256479, + "grad_norm": 0.06884765625, + "learning_rate": 0.004887583656236226, + "loss": 0.7763, + "num_input_tokens_seen": 59403800, + "step": 102315 + }, + { + "epoch": 15.239797438188859, + "grad_norm": 0.06103515625, + "learning_rate": 0.00488614375858192, + "loss": 0.8144, + "num_input_tokens_seen": 59406456, + "step": 102320 + }, + { + "epoch": 15.240542150729818, + "grad_norm": 0.06201171875, + "learning_rate": 0.0048847040317909355, + "loss": 0.7983, + "num_input_tokens_seen": 59409880, + "step": 102325 + }, + { + "epoch": 15.241286863270778, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0048832644758875915, + "loss": 0.8068, + "num_input_tokens_seen": 59412632, + "step": 102330 + }, + { + "epoch": 15.242031575811737, + "grad_norm": 0.036376953125, + "learning_rate": 0.004881825090896216, + "loss": 0.8042, + "num_input_tokens_seen": 59415192, + "step": 102335 + }, + { + "epoch": 15.242776288352696, + "grad_norm": 0.036865234375, + "learning_rate": 0.004880385876841122, + "loss": 0.8124, + "num_input_tokens_seen": 59417784, + "step": 102340 + }, + { + "epoch": 15.243521000893654, + "grad_norm": 0.05126953125, + "learning_rate": 0.004878946833746619, + "loss": 0.7952, + "num_input_tokens_seen": 59420728, + "step": 102345 + }, + { + "epoch": 15.244265713434615, + "grad_norm": 0.0517578125, + "learning_rate": 0.004877507961637028, + "loss": 0.8101, + "num_input_tokens_seen": 59423448, + "step": 102350 + }, + { + "epoch": 15.245010425975574, + "grad_norm": 0.05615234375, + "learning_rate": 0.004876069260536646, + "loss": 0.7867, + "num_input_tokens_seen": 59426392, + "step": 102355 + }, + { + "epoch": 15.245755138516532, + "grad_norm": 0.04345703125, + "learning_rate": 0.00487463073046979, + "loss": 0.7874, + "num_input_tokens_seen": 59429336, + "step": 102360 + }, + { + "epoch": 15.246499851057491, + "grad_norm": 0.044921875, + "learning_rate": 0.004873192371460752, + "loss": 0.7988, + "num_input_tokens_seen": 59432472, + "step": 102365 + }, + { + "epoch": 15.247244563598452, + "grad_norm": 0.038818359375, + "learning_rate": 0.0048717541835338425, + "loss": 0.8135, + "num_input_tokens_seen": 59435352, + "step": 102370 + }, + { + "epoch": 15.24798927613941, + "grad_norm": 0.0556640625, + "learning_rate": 0.00487031616671335, + "loss": 0.7808, + "num_input_tokens_seen": 59438072, + "step": 102375 + }, + { + "epoch": 15.24873398868037, + "grad_norm": 0.031982421875, + "learning_rate": 0.0048688783210235715, + "loss": 0.8021, + "num_input_tokens_seen": 59441432, + "step": 102380 + }, + { + "epoch": 15.249478701221328, + "grad_norm": 0.038818359375, + "learning_rate": 0.004867440646488797, + "loss": 0.8138, + "num_input_tokens_seen": 59444152, + "step": 102385 + }, + { + "epoch": 15.250223413762289, + "grad_norm": 0.059814453125, + "learning_rate": 0.004866003143133311, + "loss": 0.8152, + "num_input_tokens_seen": 59447032, + "step": 102390 + }, + { + "epoch": 15.250968126303247, + "grad_norm": 0.035400390625, + "learning_rate": 0.004864565810981405, + "loss": 0.8058, + "num_input_tokens_seen": 59449624, + "step": 102395 + }, + { + "epoch": 15.251712838844206, + "grad_norm": 0.05322265625, + "learning_rate": 0.004863128650057353, + "loss": 0.7985, + "num_input_tokens_seen": 59452760, + "step": 102400 + }, + { + "epoch": 15.252457551385165, + "grad_norm": 0.041748046875, + "learning_rate": 0.004861691660385446, + "loss": 0.8062, + "num_input_tokens_seen": 59456152, + "step": 102405 + }, + { + "epoch": 15.253202263926125, + "grad_norm": 0.052490234375, + "learning_rate": 0.00486025484198995, + "loss": 0.7917, + "num_input_tokens_seen": 59459000, + "step": 102410 + }, + { + "epoch": 15.253946976467084, + "grad_norm": 0.05517578125, + "learning_rate": 0.004858818194895147, + "loss": 0.788, + "num_input_tokens_seen": 59461656, + "step": 102415 + }, + { + "epoch": 15.254691689008043, + "grad_norm": 0.049072265625, + "learning_rate": 0.004857381719125303, + "loss": 0.8121, + "num_input_tokens_seen": 59464408, + "step": 102420 + }, + { + "epoch": 15.255436401549002, + "grad_norm": 0.05712890625, + "learning_rate": 0.004855945414704681, + "loss": 0.7898, + "num_input_tokens_seen": 59467288, + "step": 102425 + }, + { + "epoch": 15.256181114089962, + "grad_norm": 0.056396484375, + "learning_rate": 0.004854509281657557, + "loss": 0.7973, + "num_input_tokens_seen": 59470136, + "step": 102430 + }, + { + "epoch": 15.256925826630921, + "grad_norm": 0.057861328125, + "learning_rate": 0.004853073320008183, + "loss": 0.7963, + "num_input_tokens_seen": 59472568, + "step": 102435 + }, + { + "epoch": 15.25767053917188, + "grad_norm": 0.0341796875, + "learning_rate": 0.0048516375297808275, + "loss": 0.7962, + "num_input_tokens_seen": 59475576, + "step": 102440 + }, + { + "epoch": 15.258415251712838, + "grad_norm": 0.0244140625, + "learning_rate": 0.00485020191099974, + "loss": 0.7896, + "num_input_tokens_seen": 59478232, + "step": 102445 + }, + { + "epoch": 15.259159964253797, + "grad_norm": 0.0771484375, + "learning_rate": 0.004848766463689176, + "loss": 0.8054, + "num_input_tokens_seen": 59481336, + "step": 102450 + }, + { + "epoch": 15.259904676794758, + "grad_norm": 0.04345703125, + "learning_rate": 0.004847331187873386, + "loss": 0.7867, + "num_input_tokens_seen": 59484216, + "step": 102455 + }, + { + "epoch": 15.260649389335716, + "grad_norm": 0.052978515625, + "learning_rate": 0.0048458960835766115, + "loss": 0.7986, + "num_input_tokens_seen": 59487384, + "step": 102460 + }, + { + "epoch": 15.261394101876675, + "grad_norm": 0.0751953125, + "learning_rate": 0.004844461150823107, + "loss": 0.8065, + "num_input_tokens_seen": 59490328, + "step": 102465 + }, + { + "epoch": 15.262138814417634, + "grad_norm": 0.04296875, + "learning_rate": 0.004843026389637107, + "loss": 0.799, + "num_input_tokens_seen": 59493304, + "step": 102470 + }, + { + "epoch": 15.262883526958595, + "grad_norm": 0.08154296875, + "learning_rate": 0.004841591800042855, + "loss": 0.7973, + "num_input_tokens_seen": 59496184, + "step": 102475 + }, + { + "epoch": 15.263628239499553, + "grad_norm": 0.041259765625, + "learning_rate": 0.004840157382064582, + "loss": 0.7796, + "num_input_tokens_seen": 59499128, + "step": 102480 + }, + { + "epoch": 15.264372952040512, + "grad_norm": 0.052978515625, + "learning_rate": 0.004838723135726528, + "loss": 0.7998, + "num_input_tokens_seen": 59502104, + "step": 102485 + }, + { + "epoch": 15.26511766458147, + "grad_norm": 0.03759765625, + "learning_rate": 0.004837289061052913, + "loss": 0.8033, + "num_input_tokens_seen": 59504824, + "step": 102490 + }, + { + "epoch": 15.265862377122431, + "grad_norm": 0.046142578125, + "learning_rate": 0.004835855158067979, + "loss": 0.781, + "num_input_tokens_seen": 59507896, + "step": 102495 + }, + { + "epoch": 15.26660708966339, + "grad_norm": 0.07421875, + "learning_rate": 0.004834421426795939, + "loss": 0.8045, + "num_input_tokens_seen": 59511032, + "step": 102500 + }, + { + "epoch": 15.267351802204349, + "grad_norm": 0.0537109375, + "learning_rate": 0.004832987867261012, + "loss": 0.7913, + "num_input_tokens_seen": 59513688, + "step": 102505 + }, + { + "epoch": 15.268096514745308, + "grad_norm": 0.051513671875, + "learning_rate": 0.004831554479487429, + "loss": 0.8, + "num_input_tokens_seen": 59516824, + "step": 102510 + }, + { + "epoch": 15.268841227286268, + "grad_norm": 0.037109375, + "learning_rate": 0.004830121263499396, + "loss": 0.8067, + "num_input_tokens_seen": 59519608, + "step": 102515 + }, + { + "epoch": 15.269585939827227, + "grad_norm": 0.050537109375, + "learning_rate": 0.004828688219321128, + "loss": 0.7914, + "num_input_tokens_seen": 59522520, + "step": 102520 + }, + { + "epoch": 15.270330652368186, + "grad_norm": 0.039306640625, + "learning_rate": 0.004827255346976831, + "loss": 0.8016, + "num_input_tokens_seen": 59525848, + "step": 102525 + }, + { + "epoch": 15.271075364909144, + "grad_norm": 0.04150390625, + "learning_rate": 0.004825822646490721, + "loss": 0.7884, + "num_input_tokens_seen": 59529016, + "step": 102530 + }, + { + "epoch": 15.271820077450105, + "grad_norm": 0.039794921875, + "learning_rate": 0.004824390117886991, + "loss": 0.8116, + "num_input_tokens_seen": 59531640, + "step": 102535 + }, + { + "epoch": 15.272564789991064, + "grad_norm": 0.060302734375, + "learning_rate": 0.0048229577611898534, + "loss": 0.7864, + "num_input_tokens_seen": 59534360, + "step": 102540 + }, + { + "epoch": 15.273309502532022, + "grad_norm": 0.048583984375, + "learning_rate": 0.004821525576423501, + "loss": 0.7884, + "num_input_tokens_seen": 59537016, + "step": 102545 + }, + { + "epoch": 15.274054215072981, + "grad_norm": 0.0947265625, + "learning_rate": 0.004820093563612123, + "loss": 0.8113, + "num_input_tokens_seen": 59539672, + "step": 102550 + }, + { + "epoch": 15.274798927613942, + "grad_norm": 0.0546875, + "learning_rate": 0.004818661722779925, + "loss": 0.7858, + "num_input_tokens_seen": 59542552, + "step": 102555 + }, + { + "epoch": 15.2755436401549, + "grad_norm": 0.0634765625, + "learning_rate": 0.004817230053951083, + "loss": 0.8057, + "num_input_tokens_seen": 59545688, + "step": 102560 + }, + { + "epoch": 15.27628835269586, + "grad_norm": 0.03662109375, + "learning_rate": 0.004815798557149795, + "loss": 0.8032, + "num_input_tokens_seen": 59548376, + "step": 102565 + }, + { + "epoch": 15.277033065236818, + "grad_norm": 0.041015625, + "learning_rate": 0.0048143672324002336, + "loss": 0.7894, + "num_input_tokens_seen": 59551576, + "step": 102570 + }, + { + "epoch": 15.277777777777779, + "grad_norm": 0.07373046875, + "learning_rate": 0.004812936079726592, + "loss": 0.7898, + "num_input_tokens_seen": 59554552, + "step": 102575 + }, + { + "epoch": 15.278522490318737, + "grad_norm": 0.03759765625, + "learning_rate": 0.004811505099153041, + "loss": 0.7725, + "num_input_tokens_seen": 59557368, + "step": 102580 + }, + { + "epoch": 15.279267202859696, + "grad_norm": 0.056396484375, + "learning_rate": 0.004810074290703756, + "loss": 0.7935, + "num_input_tokens_seen": 59560120, + "step": 102585 + }, + { + "epoch": 15.280011915400655, + "grad_norm": 0.047607421875, + "learning_rate": 0.00480864365440291, + "loss": 0.8044, + "num_input_tokens_seen": 59563032, + "step": 102590 + }, + { + "epoch": 15.280756627941615, + "grad_norm": 0.0308837890625, + "learning_rate": 0.004807213190274667, + "loss": 0.7973, + "num_input_tokens_seen": 59565976, + "step": 102595 + }, + { + "epoch": 15.281501340482574, + "grad_norm": 0.04150390625, + "learning_rate": 0.004805782898343202, + "loss": 0.8017, + "num_input_tokens_seen": 59568760, + "step": 102600 + }, + { + "epoch": 15.282246053023533, + "grad_norm": 0.0751953125, + "learning_rate": 0.004804352778632668, + "loss": 0.8114, + "num_input_tokens_seen": 59571672, + "step": 102605 + }, + { + "epoch": 15.282990765564492, + "grad_norm": 0.03466796875, + "learning_rate": 0.004802922831167239, + "loss": 0.7965, + "num_input_tokens_seen": 59574488, + "step": 102610 + }, + { + "epoch": 15.283735478105452, + "grad_norm": 0.04638671875, + "learning_rate": 0.004801493055971057, + "loss": 0.8076, + "num_input_tokens_seen": 59577176, + "step": 102615 + }, + { + "epoch": 15.284480190646411, + "grad_norm": 0.0458984375, + "learning_rate": 0.004800063453068291, + "loss": 0.8031, + "num_input_tokens_seen": 59580472, + "step": 102620 + }, + { + "epoch": 15.28522490318737, + "grad_norm": 0.05712890625, + "learning_rate": 0.004798634022483086, + "loss": 0.7995, + "num_input_tokens_seen": 59583192, + "step": 102625 + }, + { + "epoch": 15.285969615728328, + "grad_norm": 0.031494140625, + "learning_rate": 0.004797204764239585, + "loss": 0.7984, + "num_input_tokens_seen": 59586232, + "step": 102630 + }, + { + "epoch": 15.286714328269287, + "grad_norm": 0.05517578125, + "learning_rate": 0.004795775678361946, + "loss": 0.8036, + "num_input_tokens_seen": 59589144, + "step": 102635 + }, + { + "epoch": 15.287459040810248, + "grad_norm": 0.051513671875, + "learning_rate": 0.004794346764874305, + "loss": 0.8115, + "num_input_tokens_seen": 59592056, + "step": 102640 + }, + { + "epoch": 15.288203753351207, + "grad_norm": 0.0498046875, + "learning_rate": 0.004792918023800799, + "loss": 0.7914, + "num_input_tokens_seen": 59594936, + "step": 102645 + }, + { + "epoch": 15.288948465892165, + "grad_norm": 0.05078125, + "learning_rate": 0.004791489455165573, + "loss": 0.7895, + "num_input_tokens_seen": 59597560, + "step": 102650 + }, + { + "epoch": 15.289693178433124, + "grad_norm": 0.06591796875, + "learning_rate": 0.004790061058992757, + "loss": 0.8049, + "num_input_tokens_seen": 59600344, + "step": 102655 + }, + { + "epoch": 15.290437890974085, + "grad_norm": 0.0732421875, + "learning_rate": 0.004788632835306483, + "loss": 0.8033, + "num_input_tokens_seen": 59604216, + "step": 102660 + }, + { + "epoch": 15.291182603515043, + "grad_norm": 0.034423828125, + "learning_rate": 0.004787204784130875, + "loss": 0.779, + "num_input_tokens_seen": 59607032, + "step": 102665 + }, + { + "epoch": 15.291927316056002, + "grad_norm": 0.047607421875, + "learning_rate": 0.004785776905490067, + "loss": 0.7892, + "num_input_tokens_seen": 59609912, + "step": 102670 + }, + { + "epoch": 15.29267202859696, + "grad_norm": 0.06787109375, + "learning_rate": 0.00478434919940817, + "loss": 0.8219, + "num_input_tokens_seen": 59613016, + "step": 102675 + }, + { + "epoch": 15.293416741137921, + "grad_norm": 0.049072265625, + "learning_rate": 0.004782921665909317, + "loss": 0.8005, + "num_input_tokens_seen": 59616184, + "step": 102680 + }, + { + "epoch": 15.29416145367888, + "grad_norm": 0.049560546875, + "learning_rate": 0.004781494305017612, + "loss": 0.7827, + "num_input_tokens_seen": 59618968, + "step": 102685 + }, + { + "epoch": 15.294906166219839, + "grad_norm": 0.038818359375, + "learning_rate": 0.004780067116757182, + "loss": 0.7945, + "num_input_tokens_seen": 59621784, + "step": 102690 + }, + { + "epoch": 15.295650878760798, + "grad_norm": 0.060546875, + "learning_rate": 0.004778640101152125, + "loss": 0.8092, + "num_input_tokens_seen": 59624824, + "step": 102695 + }, + { + "epoch": 15.296395591301758, + "grad_norm": 0.046630859375, + "learning_rate": 0.00477721325822656, + "loss": 0.7854, + "num_input_tokens_seen": 59627480, + "step": 102700 + }, + { + "epoch": 15.297140303842717, + "grad_norm": 0.062255859375, + "learning_rate": 0.004775786588004586, + "loss": 0.7856, + "num_input_tokens_seen": 59630552, + "step": 102705 + }, + { + "epoch": 15.297885016383676, + "grad_norm": 0.05615234375, + "learning_rate": 0.0047743600905103085, + "loss": 0.7971, + "num_input_tokens_seen": 59633304, + "step": 102710 + }, + { + "epoch": 15.298629728924634, + "grad_norm": 0.051025390625, + "learning_rate": 0.004772933765767823, + "loss": 0.7923, + "num_input_tokens_seen": 59636120, + "step": 102715 + }, + { + "epoch": 15.299374441465595, + "grad_norm": 0.062255859375, + "learning_rate": 0.004771507613801222, + "loss": 0.7794, + "num_input_tokens_seen": 59639000, + "step": 102720 + }, + { + "epoch": 15.300119154006554, + "grad_norm": 0.078125, + "learning_rate": 0.004770081634634609, + "loss": 0.8033, + "num_input_tokens_seen": 59641912, + "step": 102725 + }, + { + "epoch": 15.300863866547513, + "grad_norm": 0.04443359375, + "learning_rate": 0.004768655828292066, + "loss": 0.7962, + "num_input_tokens_seen": 59645048, + "step": 102730 + }, + { + "epoch": 15.301608579088471, + "grad_norm": 0.037109375, + "learning_rate": 0.004767230194797688, + "loss": 0.7851, + "num_input_tokens_seen": 59647896, + "step": 102735 + }, + { + "epoch": 15.302353291629432, + "grad_norm": 0.05712890625, + "learning_rate": 0.004765804734175555, + "loss": 0.7904, + "num_input_tokens_seen": 59650840, + "step": 102740 + }, + { + "epoch": 15.30309800417039, + "grad_norm": 0.058837890625, + "learning_rate": 0.004764379446449745, + "loss": 0.8132, + "num_input_tokens_seen": 59654040, + "step": 102745 + }, + { + "epoch": 15.30384271671135, + "grad_norm": 0.051025390625, + "learning_rate": 0.004762954331644348, + "loss": 0.8055, + "num_input_tokens_seen": 59657240, + "step": 102750 + }, + { + "epoch": 15.304587429252308, + "grad_norm": 0.05078125, + "learning_rate": 0.004761529389783426, + "loss": 0.7959, + "num_input_tokens_seen": 59660248, + "step": 102755 + }, + { + "epoch": 15.305332141793269, + "grad_norm": 0.11669921875, + "learning_rate": 0.004760104620891064, + "loss": 0.8073, + "num_input_tokens_seen": 59663064, + "step": 102760 + }, + { + "epoch": 15.306076854334227, + "grad_norm": 0.04541015625, + "learning_rate": 0.004758680024991323, + "loss": 0.8042, + "num_input_tokens_seen": 59665816, + "step": 102765 + }, + { + "epoch": 15.306821566875186, + "grad_norm": 0.033447265625, + "learning_rate": 0.004757255602108279, + "loss": 0.7978, + "num_input_tokens_seen": 59668792, + "step": 102770 + }, + { + "epoch": 15.307566279416145, + "grad_norm": 0.032958984375, + "learning_rate": 0.004755831352265991, + "loss": 0.808, + "num_input_tokens_seen": 59671768, + "step": 102775 + }, + { + "epoch": 15.308310991957104, + "grad_norm": 0.047607421875, + "learning_rate": 0.004754407275488522, + "loss": 0.771, + "num_input_tokens_seen": 59674616, + "step": 102780 + }, + { + "epoch": 15.309055704498064, + "grad_norm": 0.0498046875, + "learning_rate": 0.004752983371799927, + "loss": 0.7839, + "num_input_tokens_seen": 59677528, + "step": 102785 + }, + { + "epoch": 15.309800417039023, + "grad_norm": 0.0238037109375, + "learning_rate": 0.004751559641224258, + "loss": 0.7946, + "num_input_tokens_seen": 59680760, + "step": 102790 + }, + { + "epoch": 15.310545129579982, + "grad_norm": 0.042724609375, + "learning_rate": 0.0047501360837855795, + "loss": 0.7967, + "num_input_tokens_seen": 59683768, + "step": 102795 + }, + { + "epoch": 15.31128984212094, + "grad_norm": 0.043701171875, + "learning_rate": 0.004748712699507929, + "loss": 0.7994, + "num_input_tokens_seen": 59686744, + "step": 102800 + }, + { + "epoch": 15.312034554661901, + "grad_norm": 0.0556640625, + "learning_rate": 0.004747289488415363, + "loss": 0.7835, + "num_input_tokens_seen": 59689816, + "step": 102805 + }, + { + "epoch": 15.31277926720286, + "grad_norm": 0.0306396484375, + "learning_rate": 0.004745866450531916, + "loss": 0.7891, + "num_input_tokens_seen": 59692600, + "step": 102810 + }, + { + "epoch": 15.313523979743819, + "grad_norm": 0.040283203125, + "learning_rate": 0.0047444435858816384, + "loss": 0.793, + "num_input_tokens_seen": 59695480, + "step": 102815 + }, + { + "epoch": 15.314268692284777, + "grad_norm": 0.0517578125, + "learning_rate": 0.004743020894488563, + "loss": 0.7736, + "num_input_tokens_seen": 59698360, + "step": 102820 + }, + { + "epoch": 15.315013404825738, + "grad_norm": 0.0478515625, + "learning_rate": 0.00474159837637672, + "loss": 0.796, + "num_input_tokens_seen": 59701368, + "step": 102825 + }, + { + "epoch": 15.315758117366697, + "grad_norm": 0.058837890625, + "learning_rate": 0.0047401760315701515, + "loss": 0.7978, + "num_input_tokens_seen": 59703992, + "step": 102830 + }, + { + "epoch": 15.316502829907655, + "grad_norm": 0.0908203125, + "learning_rate": 0.004738753860092876, + "loss": 0.809, + "num_input_tokens_seen": 59706968, + "step": 102835 + }, + { + "epoch": 15.317247542448614, + "grad_norm": 0.04736328125, + "learning_rate": 0.004737331861968931, + "loss": 0.8057, + "num_input_tokens_seen": 59710008, + "step": 102840 + }, + { + "epoch": 15.317992254989575, + "grad_norm": 0.08203125, + "learning_rate": 0.004735910037222332, + "loss": 0.7998, + "num_input_tokens_seen": 59712728, + "step": 102845 + }, + { + "epoch": 15.318736967530533, + "grad_norm": 0.041748046875, + "learning_rate": 0.004734488385877101, + "loss": 0.8084, + "num_input_tokens_seen": 59715864, + "step": 102850 + }, + { + "epoch": 15.319481680071492, + "grad_norm": 0.0341796875, + "learning_rate": 0.004733066907957253, + "loss": 0.8235, + "num_input_tokens_seen": 59718616, + "step": 102855 + }, + { + "epoch": 15.320226392612451, + "grad_norm": 0.040771484375, + "learning_rate": 0.004731645603486808, + "loss": 0.863, + "num_input_tokens_seen": 59721400, + "step": 102860 + }, + { + "epoch": 15.320971105153411, + "grad_norm": 0.041015625, + "learning_rate": 0.004730224472489773, + "loss": 0.8138, + "num_input_tokens_seen": 59724440, + "step": 102865 + }, + { + "epoch": 15.32171581769437, + "grad_norm": 0.034912109375, + "learning_rate": 0.004728803514990153, + "loss": 0.7985, + "num_input_tokens_seen": 59727352, + "step": 102870 + }, + { + "epoch": 15.322460530235329, + "grad_norm": 0.044921875, + "learning_rate": 0.004727382731011965, + "loss": 0.8026, + "num_input_tokens_seen": 59730200, + "step": 102875 + }, + { + "epoch": 15.323205242776288, + "grad_norm": 0.0390625, + "learning_rate": 0.004725962120579199, + "loss": 0.7794, + "num_input_tokens_seen": 59732824, + "step": 102880 + }, + { + "epoch": 15.323949955317248, + "grad_norm": 0.048583984375, + "learning_rate": 0.0047245416837158665, + "loss": 0.7988, + "num_input_tokens_seen": 59735864, + "step": 102885 + }, + { + "epoch": 15.324694667858207, + "grad_norm": 0.042724609375, + "learning_rate": 0.0047231214204459535, + "loss": 0.7983, + "num_input_tokens_seen": 59739192, + "step": 102890 + }, + { + "epoch": 15.325439380399166, + "grad_norm": 0.0556640625, + "learning_rate": 0.004721701330793465, + "loss": 0.8188, + "num_input_tokens_seen": 59742168, + "step": 102895 + }, + { + "epoch": 15.326184092940125, + "grad_norm": 0.038818359375, + "learning_rate": 0.004720281414782385, + "loss": 0.797, + "num_input_tokens_seen": 59745432, + "step": 102900 + }, + { + "epoch": 15.326928805481085, + "grad_norm": 0.057861328125, + "learning_rate": 0.004718861672436698, + "loss": 0.8158, + "num_input_tokens_seen": 59748760, + "step": 102905 + }, + { + "epoch": 15.327673518022044, + "grad_norm": 0.0517578125, + "learning_rate": 0.004717442103780398, + "loss": 0.8152, + "num_input_tokens_seen": 59751544, + "step": 102910 + }, + { + "epoch": 15.328418230563003, + "grad_norm": 0.054443359375, + "learning_rate": 0.004716022708837464, + "loss": 0.8145, + "num_input_tokens_seen": 59754392, + "step": 102915 + }, + { + "epoch": 15.329162943103961, + "grad_norm": 0.0869140625, + "learning_rate": 0.0047146034876318745, + "loss": 0.8043, + "num_input_tokens_seen": 59757208, + "step": 102920 + }, + { + "epoch": 15.329907655644922, + "grad_norm": 0.047607421875, + "learning_rate": 0.0047131844401875995, + "loss": 0.7735, + "num_input_tokens_seen": 59760472, + "step": 102925 + }, + { + "epoch": 15.33065236818588, + "grad_norm": 0.02880859375, + "learning_rate": 0.004711765566528623, + "loss": 0.811, + "num_input_tokens_seen": 59763288, + "step": 102930 + }, + { + "epoch": 15.33139708072684, + "grad_norm": 0.04638671875, + "learning_rate": 0.004710346866678907, + "loss": 0.7958, + "num_input_tokens_seen": 59766168, + "step": 102935 + }, + { + "epoch": 15.332141793267798, + "grad_norm": 0.032470703125, + "learning_rate": 0.004708928340662427, + "loss": 0.8055, + "num_input_tokens_seen": 59769112, + "step": 102940 + }, + { + "epoch": 15.332886505808759, + "grad_norm": 0.03271484375, + "learning_rate": 0.004707509988503144, + "loss": 0.7891, + "num_input_tokens_seen": 59772568, + "step": 102945 + }, + { + "epoch": 15.333631218349717, + "grad_norm": 0.03857421875, + "learning_rate": 0.004706091810225014, + "loss": 0.8063, + "num_input_tokens_seen": 59775384, + "step": 102950 + }, + { + "epoch": 15.334375930890676, + "grad_norm": 0.05322265625, + "learning_rate": 0.0047046738058520035, + "loss": 0.812, + "num_input_tokens_seen": 59778488, + "step": 102955 + }, + { + "epoch": 15.335120643431635, + "grad_norm": 0.045654296875, + "learning_rate": 0.004703255975408063, + "loss": 0.7838, + "num_input_tokens_seen": 59781272, + "step": 102960 + }, + { + "epoch": 15.335865355972594, + "grad_norm": 0.037109375, + "learning_rate": 0.004701838318917151, + "loss": 0.7981, + "num_input_tokens_seen": 59784504, + "step": 102965 + }, + { + "epoch": 15.336610068513554, + "grad_norm": 0.036865234375, + "learning_rate": 0.0047004208364032135, + "loss": 0.8067, + "num_input_tokens_seen": 59787448, + "step": 102970 + }, + { + "epoch": 15.337354781054513, + "grad_norm": 0.024658203125, + "learning_rate": 0.004699003527890195, + "loss": 0.7877, + "num_input_tokens_seen": 59790296, + "step": 102975 + }, + { + "epoch": 15.338099493595472, + "grad_norm": 0.03271484375, + "learning_rate": 0.004697586393402046, + "loss": 0.7947, + "num_input_tokens_seen": 59792984, + "step": 102980 + }, + { + "epoch": 15.33884420613643, + "grad_norm": 0.03857421875, + "learning_rate": 0.004696169432962704, + "loss": 0.7983, + "num_input_tokens_seen": 59796120, + "step": 102985 + }, + { + "epoch": 15.339588918677391, + "grad_norm": 0.038330078125, + "learning_rate": 0.004694752646596108, + "loss": 0.8152, + "num_input_tokens_seen": 59799352, + "step": 102990 + }, + { + "epoch": 15.34033363121835, + "grad_norm": 0.0419921875, + "learning_rate": 0.004693336034326186, + "loss": 0.8177, + "num_input_tokens_seen": 59802264, + "step": 102995 + }, + { + "epoch": 15.341078343759309, + "grad_norm": 0.041015625, + "learning_rate": 0.004691919596176883, + "loss": 0.8023, + "num_input_tokens_seen": 59805112, + "step": 103000 + }, + { + "epoch": 15.341823056300267, + "grad_norm": 0.045654296875, + "learning_rate": 0.004690503332172116, + "loss": 0.7973, + "num_input_tokens_seen": 59807832, + "step": 103005 + }, + { + "epoch": 15.342567768841228, + "grad_norm": 0.0625, + "learning_rate": 0.004689087242335822, + "loss": 0.8048, + "num_input_tokens_seen": 59810712, + "step": 103010 + }, + { + "epoch": 15.343312481382187, + "grad_norm": 0.0272216796875, + "learning_rate": 0.004687671326691915, + "loss": 0.7979, + "num_input_tokens_seen": 59813432, + "step": 103015 + }, + { + "epoch": 15.344057193923145, + "grad_norm": 0.035888671875, + "learning_rate": 0.004686255585264324, + "loss": 0.7871, + "num_input_tokens_seen": 59816280, + "step": 103020 + }, + { + "epoch": 15.344801906464104, + "grad_norm": 0.04931640625, + "learning_rate": 0.004684840018076963, + "loss": 0.8015, + "num_input_tokens_seen": 59819160, + "step": 103025 + }, + { + "epoch": 15.345546619005065, + "grad_norm": 0.0400390625, + "learning_rate": 0.004683424625153741, + "loss": 0.8014, + "num_input_tokens_seen": 59822104, + "step": 103030 + }, + { + "epoch": 15.346291331546023, + "grad_norm": 0.072265625, + "learning_rate": 0.004682009406518579, + "loss": 0.7912, + "num_input_tokens_seen": 59825080, + "step": 103035 + }, + { + "epoch": 15.347036044086982, + "grad_norm": 0.072265625, + "learning_rate": 0.004680594362195381, + "loss": 0.8019, + "num_input_tokens_seen": 59827960, + "step": 103040 + }, + { + "epoch": 15.347780756627941, + "grad_norm": 0.068359375, + "learning_rate": 0.004679179492208056, + "loss": 0.8041, + "num_input_tokens_seen": 59831224, + "step": 103045 + }, + { + "epoch": 15.348525469168901, + "grad_norm": 0.06494140625, + "learning_rate": 0.004677764796580497, + "loss": 0.7906, + "num_input_tokens_seen": 59834008, + "step": 103050 + }, + { + "epoch": 15.34927018170986, + "grad_norm": 0.038330078125, + "learning_rate": 0.004676350275336616, + "loss": 0.7917, + "num_input_tokens_seen": 59836760, + "step": 103055 + }, + { + "epoch": 15.350014894250819, + "grad_norm": 0.04931640625, + "learning_rate": 0.004674935928500303, + "loss": 0.8027, + "num_input_tokens_seen": 59839448, + "step": 103060 + }, + { + "epoch": 15.350759606791778, + "grad_norm": 0.06494140625, + "learning_rate": 0.00467352175609545, + "loss": 0.8054, + "num_input_tokens_seen": 59842424, + "step": 103065 + }, + { + "epoch": 15.351504319332738, + "grad_norm": 0.03857421875, + "learning_rate": 0.004672107758145957, + "loss": 0.798, + "num_input_tokens_seen": 59845304, + "step": 103070 + }, + { + "epoch": 15.352249031873697, + "grad_norm": 0.07177734375, + "learning_rate": 0.004670693934675701, + "loss": 0.8103, + "num_input_tokens_seen": 59848344, + "step": 103075 + }, + { + "epoch": 15.352993744414656, + "grad_norm": 0.037353515625, + "learning_rate": 0.004669280285708578, + "loss": 0.8399, + "num_input_tokens_seen": 59851224, + "step": 103080 + }, + { + "epoch": 15.353738456955615, + "grad_norm": 0.05615234375, + "learning_rate": 0.004667866811268458, + "loss": 0.8015, + "num_input_tokens_seen": 59854264, + "step": 103085 + }, + { + "epoch": 15.354483169496575, + "grad_norm": 0.059326171875, + "learning_rate": 0.004666453511379234, + "loss": 0.8175, + "num_input_tokens_seen": 59857240, + "step": 103090 + }, + { + "epoch": 15.355227882037534, + "grad_norm": 0.0302734375, + "learning_rate": 0.004665040386064772, + "loss": 0.8019, + "num_input_tokens_seen": 59860504, + "step": 103095 + }, + { + "epoch": 15.355972594578493, + "grad_norm": 0.02294921875, + "learning_rate": 0.0046636274353489505, + "loss": 0.795, + "num_input_tokens_seen": 59863480, + "step": 103100 + }, + { + "epoch": 15.356717307119451, + "grad_norm": 0.047119140625, + "learning_rate": 0.00466221465925564, + "loss": 0.8004, + "num_input_tokens_seen": 59866456, + "step": 103105 + }, + { + "epoch": 15.357462019660412, + "grad_norm": 0.0245361328125, + "learning_rate": 0.004660802057808705, + "loss": 0.785, + "num_input_tokens_seen": 59868856, + "step": 103110 + }, + { + "epoch": 15.35820673220137, + "grad_norm": 0.04638671875, + "learning_rate": 0.004659389631032011, + "loss": 0.8015, + "num_input_tokens_seen": 59871864, + "step": 103115 + }, + { + "epoch": 15.35895144474233, + "grad_norm": 0.039306640625, + "learning_rate": 0.004657977378949415, + "loss": 0.7875, + "num_input_tokens_seen": 59874712, + "step": 103120 + }, + { + "epoch": 15.359696157283288, + "grad_norm": 0.0439453125, + "learning_rate": 0.004656565301584785, + "loss": 0.8, + "num_input_tokens_seen": 59877528, + "step": 103125 + }, + { + "epoch": 15.360440869824249, + "grad_norm": 0.0478515625, + "learning_rate": 0.004655153398961966, + "loss": 0.8046, + "num_input_tokens_seen": 59880888, + "step": 103130 + }, + { + "epoch": 15.361185582365207, + "grad_norm": 0.058349609375, + "learning_rate": 0.004653741671104821, + "loss": 0.7792, + "num_input_tokens_seen": 59883672, + "step": 103135 + }, + { + "epoch": 15.361930294906166, + "grad_norm": 0.037841796875, + "learning_rate": 0.004652330118037192, + "loss": 0.7938, + "num_input_tokens_seen": 59886680, + "step": 103140 + }, + { + "epoch": 15.362675007447125, + "grad_norm": 0.0306396484375, + "learning_rate": 0.004650918739782932, + "loss": 0.8031, + "num_input_tokens_seen": 59889912, + "step": 103145 + }, + { + "epoch": 15.363419719988084, + "grad_norm": 0.0419921875, + "learning_rate": 0.004649507536365882, + "loss": 0.7882, + "num_input_tokens_seen": 59892952, + "step": 103150 + }, + { + "epoch": 15.364164432529044, + "grad_norm": 0.04736328125, + "learning_rate": 0.0046480965078098775, + "loss": 0.7819, + "num_input_tokens_seen": 59896056, + "step": 103155 + }, + { + "epoch": 15.364909145070003, + "grad_norm": 0.02294921875, + "learning_rate": 0.004646685654138765, + "loss": 0.7869, + "num_input_tokens_seen": 59898616, + "step": 103160 + }, + { + "epoch": 15.365653857610962, + "grad_norm": 0.054443359375, + "learning_rate": 0.004645274975376373, + "loss": 0.8044, + "num_input_tokens_seen": 59901912, + "step": 103165 + }, + { + "epoch": 15.36639857015192, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00464386447154654, + "loss": 0.7915, + "num_input_tokens_seen": 59904952, + "step": 103170 + }, + { + "epoch": 15.367143282692881, + "grad_norm": 0.042236328125, + "learning_rate": 0.004642454142673091, + "loss": 0.7922, + "num_input_tokens_seen": 59907704, + "step": 103175 + }, + { + "epoch": 15.36788799523384, + "grad_norm": 0.04541015625, + "learning_rate": 0.004641043988779853, + "loss": 0.8119, + "num_input_tokens_seen": 59910328, + "step": 103180 + }, + { + "epoch": 15.368632707774799, + "grad_norm": 0.037841796875, + "learning_rate": 0.0046396340098906474, + "loss": 0.8096, + "num_input_tokens_seen": 59913656, + "step": 103185 + }, + { + "epoch": 15.369377420315757, + "grad_norm": 0.037109375, + "learning_rate": 0.004638224206029292, + "loss": 0.8192, + "num_input_tokens_seen": 59916600, + "step": 103190 + }, + { + "epoch": 15.370122132856718, + "grad_norm": 0.044677734375, + "learning_rate": 0.00463681457721961, + "loss": 0.7994, + "num_input_tokens_seen": 59919416, + "step": 103195 + }, + { + "epoch": 15.370866845397677, + "grad_norm": 0.039794921875, + "learning_rate": 0.004635405123485411, + "loss": 0.7895, + "num_input_tokens_seen": 59922424, + "step": 103200 + }, + { + "epoch": 15.371611557938635, + "grad_norm": 0.033203125, + "learning_rate": 0.004633995844850511, + "loss": 0.7871, + "num_input_tokens_seen": 59925496, + "step": 103205 + }, + { + "epoch": 15.372356270479594, + "grad_norm": 0.043212890625, + "learning_rate": 0.004632586741338713, + "loss": 0.796, + "num_input_tokens_seen": 59928152, + "step": 103210 + }, + { + "epoch": 15.373100983020555, + "grad_norm": 0.042236328125, + "learning_rate": 0.004631177812973827, + "loss": 0.7867, + "num_input_tokens_seen": 59931064, + "step": 103215 + }, + { + "epoch": 15.373845695561513, + "grad_norm": 0.0498046875, + "learning_rate": 0.004629769059779651, + "loss": 0.7751, + "num_input_tokens_seen": 59934040, + "step": 103220 + }, + { + "epoch": 15.374590408102472, + "grad_norm": 0.040771484375, + "learning_rate": 0.004628360481779991, + "loss": 0.8081, + "num_input_tokens_seen": 59936600, + "step": 103225 + }, + { + "epoch": 15.375335120643431, + "grad_norm": 0.048828125, + "learning_rate": 0.004626952078998639, + "loss": 0.7992, + "num_input_tokens_seen": 59939320, + "step": 103230 + }, + { + "epoch": 15.376079833184392, + "grad_norm": 0.048095703125, + "learning_rate": 0.004625543851459385, + "loss": 0.7956, + "num_input_tokens_seen": 59942232, + "step": 103235 + }, + { + "epoch": 15.37682454572535, + "grad_norm": 0.043701171875, + "learning_rate": 0.004624135799186028, + "loss": 0.7968, + "num_input_tokens_seen": 59945304, + "step": 103240 + }, + { + "epoch": 15.377569258266309, + "grad_norm": 0.034423828125, + "learning_rate": 0.00462272792220235, + "loss": 0.8108, + "num_input_tokens_seen": 59948344, + "step": 103245 + }, + { + "epoch": 15.378313970807268, + "grad_norm": 0.053955078125, + "learning_rate": 0.004621320220532138, + "loss": 0.8076, + "num_input_tokens_seen": 59951256, + "step": 103250 + }, + { + "epoch": 15.379058683348228, + "grad_norm": 0.030029296875, + "learning_rate": 0.004619912694199168, + "loss": 0.7923, + "num_input_tokens_seen": 59954200, + "step": 103255 + }, + { + "epoch": 15.379803395889187, + "grad_norm": 0.055908203125, + "learning_rate": 0.004618505343227226, + "loss": 0.7935, + "num_input_tokens_seen": 59957048, + "step": 103260 + }, + { + "epoch": 15.380548108430146, + "grad_norm": 0.03369140625, + "learning_rate": 0.004617098167640088, + "loss": 0.7963, + "num_input_tokens_seen": 59959608, + "step": 103265 + }, + { + "epoch": 15.381292820971105, + "grad_norm": 0.046875, + "learning_rate": 0.004615691167461517, + "loss": 0.8199, + "num_input_tokens_seen": 59962328, + "step": 103270 + }, + { + "epoch": 15.382037533512065, + "grad_norm": 0.0289306640625, + "learning_rate": 0.004614284342715294, + "loss": 0.7793, + "num_input_tokens_seen": 59965560, + "step": 103275 + }, + { + "epoch": 15.382782246053024, + "grad_norm": 0.041748046875, + "learning_rate": 0.004612877693425177, + "loss": 0.7662, + "num_input_tokens_seen": 59968280, + "step": 103280 + }, + { + "epoch": 15.383526958593983, + "grad_norm": 0.06884765625, + "learning_rate": 0.00461147121961494, + "loss": 0.8054, + "num_input_tokens_seen": 59971256, + "step": 103285 + }, + { + "epoch": 15.384271671134941, + "grad_norm": 0.0400390625, + "learning_rate": 0.004610064921308332, + "loss": 0.7766, + "num_input_tokens_seen": 59974168, + "step": 103290 + }, + { + "epoch": 15.3850163836759, + "grad_norm": 0.02587890625, + "learning_rate": 0.004608658798529123, + "loss": 0.7961, + "num_input_tokens_seen": 59977400, + "step": 103295 + }, + { + "epoch": 15.38576109621686, + "grad_norm": 0.0869140625, + "learning_rate": 0.004607252851301063, + "loss": 0.7926, + "num_input_tokens_seen": 59979960, + "step": 103300 + }, + { + "epoch": 15.38650580875782, + "grad_norm": 0.049560546875, + "learning_rate": 0.004605847079647897, + "loss": 0.7931, + "num_input_tokens_seen": 59982936, + "step": 103305 + }, + { + "epoch": 15.387250521298778, + "grad_norm": 0.048828125, + "learning_rate": 0.0046044414835933875, + "loss": 0.7822, + "num_input_tokens_seen": 59985592, + "step": 103310 + }, + { + "epoch": 15.387995233839739, + "grad_norm": 0.041748046875, + "learning_rate": 0.004603036063161272, + "loss": 0.806, + "num_input_tokens_seen": 59988568, + "step": 103315 + }, + { + "epoch": 15.388739946380698, + "grad_norm": 0.04541015625, + "learning_rate": 0.004601630818375295, + "loss": 0.8167, + "num_input_tokens_seen": 59991352, + "step": 103320 + }, + { + "epoch": 15.389484658921656, + "grad_norm": 0.036865234375, + "learning_rate": 0.004600225749259192, + "loss": 0.7884, + "num_input_tokens_seen": 59994360, + "step": 103325 + }, + { + "epoch": 15.390229371462615, + "grad_norm": 0.07275390625, + "learning_rate": 0.0045988208558367105, + "loss": 0.7818, + "num_input_tokens_seen": 59997528, + "step": 103330 + }, + { + "epoch": 15.390974084003574, + "grad_norm": 0.0537109375, + "learning_rate": 0.004597416138131575, + "loss": 0.816, + "num_input_tokens_seen": 60000248, + "step": 103335 + }, + { + "epoch": 15.391718796544534, + "grad_norm": 0.033935546875, + "learning_rate": 0.004596011596167525, + "loss": 0.7889, + "num_input_tokens_seen": 60003128, + "step": 103340 + }, + { + "epoch": 15.392463509085493, + "grad_norm": 0.06982421875, + "learning_rate": 0.0045946072299682846, + "loss": 0.8012, + "num_input_tokens_seen": 60005784, + "step": 103345 + }, + { + "epoch": 15.393208221626452, + "grad_norm": 0.034423828125, + "learning_rate": 0.004593203039557576, + "loss": 0.8104, + "num_input_tokens_seen": 60008952, + "step": 103350 + }, + { + "epoch": 15.39395293416741, + "grad_norm": 0.046630859375, + "learning_rate": 0.004591799024959129, + "loss": 0.7805, + "num_input_tokens_seen": 60011768, + "step": 103355 + }, + { + "epoch": 15.394697646708371, + "grad_norm": 0.04833984375, + "learning_rate": 0.004590395186196654, + "loss": 0.8057, + "num_input_tokens_seen": 60014680, + "step": 103360 + }, + { + "epoch": 15.39544235924933, + "grad_norm": 0.037841796875, + "learning_rate": 0.0045889915232938784, + "loss": 0.8105, + "num_input_tokens_seen": 60017624, + "step": 103365 + }, + { + "epoch": 15.396187071790289, + "grad_norm": 0.036865234375, + "learning_rate": 0.0045875880362745095, + "loss": 0.833, + "num_input_tokens_seen": 60020600, + "step": 103370 + }, + { + "epoch": 15.396931784331247, + "grad_norm": 0.04345703125, + "learning_rate": 0.004586184725162258, + "loss": 0.8077, + "num_input_tokens_seen": 60023480, + "step": 103375 + }, + { + "epoch": 15.397676496872208, + "grad_norm": 0.05810546875, + "learning_rate": 0.004584781589980826, + "loss": 0.7956, + "num_input_tokens_seen": 60026520, + "step": 103380 + }, + { + "epoch": 15.398421209413167, + "grad_norm": 0.044677734375, + "learning_rate": 0.004583378630753928, + "loss": 0.7999, + "num_input_tokens_seen": 60029400, + "step": 103385 + }, + { + "epoch": 15.399165921954125, + "grad_norm": 0.049560546875, + "learning_rate": 0.004581975847505262, + "loss": 0.7879, + "num_input_tokens_seen": 60032216, + "step": 103390 + }, + { + "epoch": 15.399910634495084, + "grad_norm": 0.0546875, + "learning_rate": 0.00458057324025852, + "loss": 0.7963, + "num_input_tokens_seen": 60035416, + "step": 103395 + }, + { + "epoch": 15.400655347036045, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00457917080903741, + "loss": 0.7994, + "num_input_tokens_seen": 60038456, + "step": 103400 + }, + { + "epoch": 15.401400059577004, + "grad_norm": 0.04248046875, + "learning_rate": 0.004577768553865611, + "loss": 0.7796, + "num_input_tokens_seen": 60041496, + "step": 103405 + }, + { + "epoch": 15.402144772117962, + "grad_norm": 0.05126953125, + "learning_rate": 0.004576366474766823, + "loss": 0.806, + "num_input_tokens_seen": 60044472, + "step": 103410 + }, + { + "epoch": 15.402889484658921, + "grad_norm": 0.1591796875, + "learning_rate": 0.004574964571764728, + "loss": 0.8424, + "num_input_tokens_seen": 60047224, + "step": 103415 + }, + { + "epoch": 15.403634197199882, + "grad_norm": 0.0458984375, + "learning_rate": 0.004573562844883015, + "loss": 0.7908, + "num_input_tokens_seen": 60050200, + "step": 103420 + }, + { + "epoch": 15.40437890974084, + "grad_norm": 0.07470703125, + "learning_rate": 0.004572161294145359, + "loss": 0.7803, + "num_input_tokens_seen": 60053304, + "step": 103425 + }, + { + "epoch": 15.405123622281799, + "grad_norm": 0.045166015625, + "learning_rate": 0.004570759919575437, + "loss": 0.7908, + "num_input_tokens_seen": 60056280, + "step": 103430 + }, + { + "epoch": 15.405868334822758, + "grad_norm": 0.040283203125, + "learning_rate": 0.00456935872119693, + "loss": 0.8126, + "num_input_tokens_seen": 60059224, + "step": 103435 + }, + { + "epoch": 15.406613047363718, + "grad_norm": 0.0458984375, + "learning_rate": 0.004567957699033505, + "loss": 0.7975, + "num_input_tokens_seen": 60062072, + "step": 103440 + }, + { + "epoch": 15.407357759904677, + "grad_norm": 0.04541015625, + "learning_rate": 0.004566556853108834, + "loss": 0.7916, + "num_input_tokens_seen": 60064696, + "step": 103445 + }, + { + "epoch": 15.408102472445636, + "grad_norm": 0.03466796875, + "learning_rate": 0.0045651561834465755, + "loss": 0.7826, + "num_input_tokens_seen": 60067736, + "step": 103450 + }, + { + "epoch": 15.408847184986595, + "grad_norm": 0.05322265625, + "learning_rate": 0.004563755690070402, + "loss": 0.7831, + "num_input_tokens_seen": 60070392, + "step": 103455 + }, + { + "epoch": 15.409591897527555, + "grad_norm": 0.07275390625, + "learning_rate": 0.004562355373003964, + "loss": 0.7885, + "num_input_tokens_seen": 60073080, + "step": 103460 + }, + { + "epoch": 15.410336610068514, + "grad_norm": 0.045166015625, + "learning_rate": 0.00456095523227093, + "loss": 0.775, + "num_input_tokens_seen": 60076120, + "step": 103465 + }, + { + "epoch": 15.411081322609473, + "grad_norm": 0.056884765625, + "learning_rate": 0.0045595552678949465, + "loss": 0.7782, + "num_input_tokens_seen": 60079256, + "step": 103470 + }, + { + "epoch": 15.411826035150431, + "grad_norm": 0.053466796875, + "learning_rate": 0.004558155479899661, + "loss": 0.8356, + "num_input_tokens_seen": 60082072, + "step": 103475 + }, + { + "epoch": 15.41257074769139, + "grad_norm": 0.08203125, + "learning_rate": 0.004556755868308731, + "loss": 0.8082, + "num_input_tokens_seen": 60084920, + "step": 103480 + }, + { + "epoch": 15.41331546023235, + "grad_norm": 0.05908203125, + "learning_rate": 0.004555356433145793, + "loss": 0.8157, + "num_input_tokens_seen": 60087448, + "step": 103485 + }, + { + "epoch": 15.41406017277331, + "grad_norm": 0.04638671875, + "learning_rate": 0.0045539571744344955, + "loss": 0.7977, + "num_input_tokens_seen": 60090360, + "step": 103490 + }, + { + "epoch": 15.414804885314268, + "grad_norm": 0.048095703125, + "learning_rate": 0.004552558092198469, + "loss": 0.7979, + "num_input_tokens_seen": 60093336, + "step": 103495 + }, + { + "epoch": 15.415549597855227, + "grad_norm": 0.04833984375, + "learning_rate": 0.004551159186461363, + "loss": 0.7823, + "num_input_tokens_seen": 60096152, + "step": 103500 + }, + { + "epoch": 15.416294310396188, + "grad_norm": 0.04443359375, + "learning_rate": 0.004549760457246802, + "loss": 0.7811, + "num_input_tokens_seen": 60098840, + "step": 103505 + }, + { + "epoch": 15.417039022937146, + "grad_norm": 0.040771484375, + "learning_rate": 0.0045483619045784155, + "loss": 0.7874, + "num_input_tokens_seen": 60101752, + "step": 103510 + }, + { + "epoch": 15.417783735478105, + "grad_norm": 0.041748046875, + "learning_rate": 0.004546963528479833, + "loss": 0.774, + "num_input_tokens_seen": 60104536, + "step": 103515 + }, + { + "epoch": 15.418528448019064, + "grad_norm": 0.0269775390625, + "learning_rate": 0.004545565328974672, + "loss": 0.8113, + "num_input_tokens_seen": 60107576, + "step": 103520 + }, + { + "epoch": 15.419273160560024, + "grad_norm": 0.03759765625, + "learning_rate": 0.004544167306086566, + "loss": 0.8169, + "num_input_tokens_seen": 60110776, + "step": 103525 + }, + { + "epoch": 15.420017873100983, + "grad_norm": 0.032958984375, + "learning_rate": 0.004542769459839119, + "loss": 0.8034, + "num_input_tokens_seen": 60113592, + "step": 103530 + }, + { + "epoch": 15.420762585641942, + "grad_norm": 0.052001953125, + "learning_rate": 0.004541371790255961, + "loss": 0.7623, + "num_input_tokens_seen": 60116568, + "step": 103535 + }, + { + "epoch": 15.4215072981829, + "grad_norm": 0.046142578125, + "learning_rate": 0.004539974297360691, + "loss": 0.8051, + "num_input_tokens_seen": 60119320, + "step": 103540 + }, + { + "epoch": 15.422252010723861, + "grad_norm": 0.09521484375, + "learning_rate": 0.0045385769811769295, + "loss": 0.7642, + "num_input_tokens_seen": 60122168, + "step": 103545 + }, + { + "epoch": 15.42299672326482, + "grad_norm": 0.0556640625, + "learning_rate": 0.004537179841728276, + "loss": 0.8329, + "num_input_tokens_seen": 60125240, + "step": 103550 + }, + { + "epoch": 15.423741435805779, + "grad_norm": 0.04638671875, + "learning_rate": 0.004535782879038332, + "loss": 0.8006, + "num_input_tokens_seen": 60128312, + "step": 103555 + }, + { + "epoch": 15.424486148346737, + "grad_norm": 0.05810546875, + "learning_rate": 0.004534386093130705, + "loss": 0.7728, + "num_input_tokens_seen": 60131160, + "step": 103560 + }, + { + "epoch": 15.425230860887698, + "grad_norm": 0.046875, + "learning_rate": 0.004532989484028983, + "loss": 0.7858, + "num_input_tokens_seen": 60133656, + "step": 103565 + }, + { + "epoch": 15.425975573428657, + "grad_norm": 0.0439453125, + "learning_rate": 0.004531593051756772, + "loss": 0.7841, + "num_input_tokens_seen": 60136632, + "step": 103570 + }, + { + "epoch": 15.426720285969616, + "grad_norm": 0.08447265625, + "learning_rate": 0.004530196796337655, + "loss": 0.8028, + "num_input_tokens_seen": 60139320, + "step": 103575 + }, + { + "epoch": 15.427464998510574, + "grad_norm": 0.062255859375, + "learning_rate": 0.004528800717795223, + "loss": 0.8184, + "num_input_tokens_seen": 60142200, + "step": 103580 + }, + { + "epoch": 15.428209711051535, + "grad_norm": 0.1796875, + "learning_rate": 0.00452740481615306, + "loss": 0.8388, + "num_input_tokens_seen": 60145304, + "step": 103585 + }, + { + "epoch": 15.428954423592494, + "grad_norm": 0.0546875, + "learning_rate": 0.004526009091434743, + "loss": 0.798, + "num_input_tokens_seen": 60148248, + "step": 103590 + }, + { + "epoch": 15.429699136133452, + "grad_norm": 0.03515625, + "learning_rate": 0.004524613543663864, + "loss": 0.7983, + "num_input_tokens_seen": 60151352, + "step": 103595 + }, + { + "epoch": 15.430443848674411, + "grad_norm": 0.046142578125, + "learning_rate": 0.004523218172863986, + "loss": 0.7977, + "num_input_tokens_seen": 60154200, + "step": 103600 + }, + { + "epoch": 15.431188561215372, + "grad_norm": 0.04931640625, + "learning_rate": 0.004521822979058694, + "loss": 0.793, + "num_input_tokens_seen": 60156824, + "step": 103605 + }, + { + "epoch": 15.43193327375633, + "grad_norm": 0.036376953125, + "learning_rate": 0.004520427962271549, + "loss": 0.7985, + "num_input_tokens_seen": 60159512, + "step": 103610 + }, + { + "epoch": 15.43267798629729, + "grad_norm": 0.045166015625, + "learning_rate": 0.004519033122526127, + "loss": 0.7908, + "num_input_tokens_seen": 60162456, + "step": 103615 + }, + { + "epoch": 15.433422698838248, + "grad_norm": 0.039794921875, + "learning_rate": 0.0045176384598459825, + "loss": 0.7698, + "num_input_tokens_seen": 60165144, + "step": 103620 + }, + { + "epoch": 15.434167411379208, + "grad_norm": 0.055419921875, + "learning_rate": 0.004516243974254688, + "loss": 0.8043, + "num_input_tokens_seen": 60168120, + "step": 103625 + }, + { + "epoch": 15.434912123920167, + "grad_norm": 0.03173828125, + "learning_rate": 0.004514849665775796, + "loss": 0.8148, + "num_input_tokens_seen": 60170680, + "step": 103630 + }, + { + "epoch": 15.435656836461126, + "grad_norm": 0.052001953125, + "learning_rate": 0.004513455534432859, + "loss": 0.7951, + "num_input_tokens_seen": 60173816, + "step": 103635 + }, + { + "epoch": 15.436401549002085, + "grad_norm": 0.05078125, + "learning_rate": 0.004512061580249436, + "loss": 0.8554, + "num_input_tokens_seen": 60176728, + "step": 103640 + }, + { + "epoch": 15.437146261543045, + "grad_norm": 0.05615234375, + "learning_rate": 0.0045106678032490735, + "loss": 0.7948, + "num_input_tokens_seen": 60179608, + "step": 103645 + }, + { + "epoch": 15.437890974084004, + "grad_norm": 0.039794921875, + "learning_rate": 0.0045092742034553174, + "loss": 0.7684, + "num_input_tokens_seen": 60182808, + "step": 103650 + }, + { + "epoch": 15.438635686624963, + "grad_norm": 0.0673828125, + "learning_rate": 0.004507880780891706, + "loss": 0.8075, + "num_input_tokens_seen": 60186008, + "step": 103655 + }, + { + "epoch": 15.439380399165922, + "grad_norm": 0.051513671875, + "learning_rate": 0.004506487535581791, + "loss": 0.802, + "num_input_tokens_seen": 60188856, + "step": 103660 + }, + { + "epoch": 15.44012511170688, + "grad_norm": 0.04833984375, + "learning_rate": 0.004505094467549103, + "loss": 0.7954, + "num_input_tokens_seen": 60191864, + "step": 103665 + }, + { + "epoch": 15.44086982424784, + "grad_norm": 0.06884765625, + "learning_rate": 0.004503701576817174, + "loss": 0.8018, + "num_input_tokens_seen": 60194584, + "step": 103670 + }, + { + "epoch": 15.4416145367888, + "grad_norm": 0.03564453125, + "learning_rate": 0.004502308863409542, + "loss": 0.7877, + "num_input_tokens_seen": 60197624, + "step": 103675 + }, + { + "epoch": 15.442359249329758, + "grad_norm": 0.038330078125, + "learning_rate": 0.004500916327349729, + "loss": 0.804, + "num_input_tokens_seen": 60200632, + "step": 103680 + }, + { + "epoch": 15.443103961870717, + "grad_norm": 0.04541015625, + "learning_rate": 0.004499523968661268, + "loss": 0.7887, + "num_input_tokens_seen": 60203608, + "step": 103685 + }, + { + "epoch": 15.443848674411678, + "grad_norm": 0.026611328125, + "learning_rate": 0.004498131787367673, + "loss": 0.8005, + "num_input_tokens_seen": 60206264, + "step": 103690 + }, + { + "epoch": 15.444593386952636, + "grad_norm": 0.040283203125, + "learning_rate": 0.004496739783492473, + "loss": 0.7945, + "num_input_tokens_seen": 60209176, + "step": 103695 + }, + { + "epoch": 15.445338099493595, + "grad_norm": 0.04345703125, + "learning_rate": 0.004495347957059177, + "loss": 0.79, + "num_input_tokens_seen": 60212056, + "step": 103700 + }, + { + "epoch": 15.446082812034554, + "grad_norm": 0.0458984375, + "learning_rate": 0.004493956308091304, + "loss": 0.8071, + "num_input_tokens_seen": 60214968, + "step": 103705 + }, + { + "epoch": 15.446827524575514, + "grad_norm": 0.0546875, + "learning_rate": 0.0044925648366123585, + "loss": 0.7832, + "num_input_tokens_seen": 60217880, + "step": 103710 + }, + { + "epoch": 15.447572237116473, + "grad_norm": 0.05419921875, + "learning_rate": 0.004491173542645847, + "loss": 0.7939, + "num_input_tokens_seen": 60221112, + "step": 103715 + }, + { + "epoch": 15.448316949657432, + "grad_norm": 0.0291748046875, + "learning_rate": 0.004489782426215283, + "loss": 0.7827, + "num_input_tokens_seen": 60223896, + "step": 103720 + }, + { + "epoch": 15.44906166219839, + "grad_norm": 0.048095703125, + "learning_rate": 0.004488391487344158, + "loss": 0.7869, + "num_input_tokens_seen": 60226616, + "step": 103725 + }, + { + "epoch": 15.449806374739351, + "grad_norm": 0.03564453125, + "learning_rate": 0.004487000726055979, + "loss": 0.7862, + "num_input_tokens_seen": 60229464, + "step": 103730 + }, + { + "epoch": 15.45055108728031, + "grad_norm": 0.03466796875, + "learning_rate": 0.004485610142374235, + "loss": 0.7826, + "num_input_tokens_seen": 60232536, + "step": 103735 + }, + { + "epoch": 15.451295799821269, + "grad_norm": 0.0322265625, + "learning_rate": 0.004484219736322425, + "loss": 0.8206, + "num_input_tokens_seen": 60235512, + "step": 103740 + }, + { + "epoch": 15.452040512362228, + "grad_norm": 0.060791015625, + "learning_rate": 0.00448282950792403, + "loss": 0.8069, + "num_input_tokens_seen": 60238328, + "step": 103745 + }, + { + "epoch": 15.452785224903188, + "grad_norm": 0.052734375, + "learning_rate": 0.004481439457202545, + "loss": 0.8115, + "num_input_tokens_seen": 60240824, + "step": 103750 + }, + { + "epoch": 15.453529937444147, + "grad_norm": 0.05615234375, + "learning_rate": 0.004480049584181452, + "loss": 0.813, + "num_input_tokens_seen": 60243352, + "step": 103755 + }, + { + "epoch": 15.454274649985106, + "grad_norm": 0.03857421875, + "learning_rate": 0.004478659888884222, + "loss": 0.799, + "num_input_tokens_seen": 60246328, + "step": 103760 + }, + { + "epoch": 15.455019362526064, + "grad_norm": 0.0361328125, + "learning_rate": 0.004477270371334347, + "loss": 0.8016, + "num_input_tokens_seen": 60249304, + "step": 103765 + }, + { + "epoch": 15.455764075067025, + "grad_norm": 0.056640625, + "learning_rate": 0.004475881031555292, + "loss": 0.8019, + "num_input_tokens_seen": 60252056, + "step": 103770 + }, + { + "epoch": 15.456508787607984, + "grad_norm": 0.038818359375, + "learning_rate": 0.004474491869570532, + "loss": 0.7974, + "num_input_tokens_seen": 60255064, + "step": 103775 + }, + { + "epoch": 15.457253500148942, + "grad_norm": 0.04052734375, + "learning_rate": 0.0044731028854035305, + "loss": 0.811, + "num_input_tokens_seen": 60257944, + "step": 103780 + }, + { + "epoch": 15.457998212689901, + "grad_norm": 0.048828125, + "learning_rate": 0.00447171407907776, + "loss": 0.8051, + "num_input_tokens_seen": 60260696, + "step": 103785 + }, + { + "epoch": 15.458742925230862, + "grad_norm": 0.0390625, + "learning_rate": 0.00447032545061668, + "loss": 0.798, + "num_input_tokens_seen": 60263800, + "step": 103790 + }, + { + "epoch": 15.45948763777182, + "grad_norm": 0.046630859375, + "learning_rate": 0.004468937000043744, + "loss": 0.7813, + "num_input_tokens_seen": 60266840, + "step": 103795 + }, + { + "epoch": 15.46023235031278, + "grad_norm": 0.0380859375, + "learning_rate": 0.00446754872738242, + "loss": 0.825, + "num_input_tokens_seen": 60270040, + "step": 103800 + }, + { + "epoch": 15.460977062853738, + "grad_norm": 0.0400390625, + "learning_rate": 0.004466160632656151, + "loss": 0.806, + "num_input_tokens_seen": 60272824, + "step": 103805 + }, + { + "epoch": 15.461721775394698, + "grad_norm": 0.04833984375, + "learning_rate": 0.004464772715888396, + "loss": 0.7987, + "num_input_tokens_seen": 60275960, + "step": 103810 + }, + { + "epoch": 15.462466487935657, + "grad_norm": 0.048095703125, + "learning_rate": 0.004463384977102593, + "loss": 0.8107, + "num_input_tokens_seen": 60278456, + "step": 103815 + }, + { + "epoch": 15.463211200476616, + "grad_norm": 0.06298828125, + "learning_rate": 0.004461997416322198, + "loss": 0.7893, + "num_input_tokens_seen": 60281464, + "step": 103820 + }, + { + "epoch": 15.463955913017575, + "grad_norm": 0.035888671875, + "learning_rate": 0.004460610033570641, + "loss": 0.7816, + "num_input_tokens_seen": 60284440, + "step": 103825 + }, + { + "epoch": 15.464700625558535, + "grad_norm": 0.0546875, + "learning_rate": 0.004459222828871371, + "loss": 0.809, + "num_input_tokens_seen": 60287256, + "step": 103830 + }, + { + "epoch": 15.465445338099494, + "grad_norm": 0.057861328125, + "learning_rate": 0.004457835802247817, + "loss": 0.7875, + "num_input_tokens_seen": 60289976, + "step": 103835 + }, + { + "epoch": 15.466190050640453, + "grad_norm": 0.034423828125, + "learning_rate": 0.004456448953723413, + "loss": 0.7946, + "num_input_tokens_seen": 60292920, + "step": 103840 + }, + { + "epoch": 15.466934763181412, + "grad_norm": 0.048583984375, + "learning_rate": 0.004455062283321589, + "loss": 0.7913, + "num_input_tokens_seen": 60296184, + "step": 103845 + }, + { + "epoch": 15.46767947572237, + "grad_norm": 0.06591796875, + "learning_rate": 0.004453675791065765, + "loss": 0.781, + "num_input_tokens_seen": 60299128, + "step": 103850 + }, + { + "epoch": 15.46842418826333, + "grad_norm": 0.037109375, + "learning_rate": 0.0044522894769793726, + "loss": 0.8069, + "num_input_tokens_seen": 60301944, + "step": 103855 + }, + { + "epoch": 15.46916890080429, + "grad_norm": 0.034912109375, + "learning_rate": 0.004450903341085826, + "loss": 0.8208, + "num_input_tokens_seen": 60304600, + "step": 103860 + }, + { + "epoch": 15.469913613345248, + "grad_norm": 0.040771484375, + "learning_rate": 0.004449517383408552, + "loss": 0.7941, + "num_input_tokens_seen": 60307320, + "step": 103865 + }, + { + "epoch": 15.470658325886207, + "grad_norm": 0.057861328125, + "learning_rate": 0.004448131603970957, + "loss": 0.7927, + "num_input_tokens_seen": 60309976, + "step": 103870 + }, + { + "epoch": 15.471403038427168, + "grad_norm": 0.05126953125, + "learning_rate": 0.00444674600279645, + "loss": 0.787, + "num_input_tokens_seen": 60312824, + "step": 103875 + }, + { + "epoch": 15.472147750968126, + "grad_norm": 0.041259765625, + "learning_rate": 0.004445360579908448, + "loss": 0.7905, + "num_input_tokens_seen": 60315480, + "step": 103880 + }, + { + "epoch": 15.472892463509085, + "grad_norm": 0.03955078125, + "learning_rate": 0.004443975335330347, + "loss": 0.7988, + "num_input_tokens_seen": 60318456, + "step": 103885 + }, + { + "epoch": 15.473637176050044, + "grad_norm": 0.03759765625, + "learning_rate": 0.004442590269085558, + "loss": 0.7898, + "num_input_tokens_seen": 60321144, + "step": 103890 + }, + { + "epoch": 15.474381888591004, + "grad_norm": 0.078125, + "learning_rate": 0.004441205381197472, + "loss": 0.8039, + "num_input_tokens_seen": 60324312, + "step": 103895 + }, + { + "epoch": 15.475126601131963, + "grad_norm": 0.05810546875, + "learning_rate": 0.004439820671689496, + "loss": 0.8461, + "num_input_tokens_seen": 60327160, + "step": 103900 + }, + { + "epoch": 15.475871313672922, + "grad_norm": 0.047607421875, + "learning_rate": 0.004438436140585016, + "loss": 0.794, + "num_input_tokens_seen": 60329848, + "step": 103905 + }, + { + "epoch": 15.47661602621388, + "grad_norm": 0.05810546875, + "learning_rate": 0.004437051787907422, + "loss": 0.8127, + "num_input_tokens_seen": 60332632, + "step": 103910 + }, + { + "epoch": 15.477360738754841, + "grad_norm": 0.06591796875, + "learning_rate": 0.004435667613680103, + "loss": 0.7747, + "num_input_tokens_seen": 60335544, + "step": 103915 + }, + { + "epoch": 15.4781054512958, + "grad_norm": 0.048095703125, + "learning_rate": 0.004434283617926439, + "loss": 0.7998, + "num_input_tokens_seen": 60338392, + "step": 103920 + }, + { + "epoch": 15.478850163836759, + "grad_norm": 0.052978515625, + "learning_rate": 0.004432899800669818, + "loss": 0.7936, + "num_input_tokens_seen": 60341048, + "step": 103925 + }, + { + "epoch": 15.479594876377718, + "grad_norm": 0.057373046875, + "learning_rate": 0.00443151616193361, + "loss": 0.7944, + "num_input_tokens_seen": 60343896, + "step": 103930 + }, + { + "epoch": 15.480339588918678, + "grad_norm": 0.0537109375, + "learning_rate": 0.0044301327017412006, + "loss": 0.8185, + "num_input_tokens_seen": 60346744, + "step": 103935 + }, + { + "epoch": 15.481084301459637, + "grad_norm": 0.044921875, + "learning_rate": 0.004428749420115951, + "loss": 0.7756, + "num_input_tokens_seen": 60349784, + "step": 103940 + }, + { + "epoch": 15.481829014000596, + "grad_norm": 0.033203125, + "learning_rate": 0.004427366317081242, + "loss": 0.7957, + "num_input_tokens_seen": 60352728, + "step": 103945 + }, + { + "epoch": 15.482573726541554, + "grad_norm": 0.03125, + "learning_rate": 0.004425983392660432, + "loss": 0.8053, + "num_input_tokens_seen": 60355672, + "step": 103950 + }, + { + "epoch": 15.483318439082515, + "grad_norm": 0.043701171875, + "learning_rate": 0.004424600646876882, + "loss": 0.7892, + "num_input_tokens_seen": 60358552, + "step": 103955 + }, + { + "epoch": 15.484063151623474, + "grad_norm": 0.0322265625, + "learning_rate": 0.004423218079753959, + "loss": 0.7854, + "num_input_tokens_seen": 60361656, + "step": 103960 + }, + { + "epoch": 15.484807864164432, + "grad_norm": 0.030517578125, + "learning_rate": 0.004421835691315015, + "loss": 0.7893, + "num_input_tokens_seen": 60364504, + "step": 103965 + }, + { + "epoch": 15.485552576705391, + "grad_norm": 0.06298828125, + "learning_rate": 0.004420453481583407, + "loss": 0.7985, + "num_input_tokens_seen": 60367288, + "step": 103970 + }, + { + "epoch": 15.486297289246352, + "grad_norm": 0.03857421875, + "learning_rate": 0.004419071450582486, + "loss": 0.8068, + "num_input_tokens_seen": 60370168, + "step": 103975 + }, + { + "epoch": 15.48704200178731, + "grad_norm": 0.05712890625, + "learning_rate": 0.0044176895983356, + "loss": 0.7847, + "num_input_tokens_seen": 60373016, + "step": 103980 + }, + { + "epoch": 15.48778671432827, + "grad_norm": 0.038818359375, + "learning_rate": 0.004416307924866089, + "loss": 0.8164, + "num_input_tokens_seen": 60376088, + "step": 103985 + }, + { + "epoch": 15.488531426869228, + "grad_norm": 0.03564453125, + "learning_rate": 0.0044149264301973, + "loss": 0.8023, + "num_input_tokens_seen": 60378968, + "step": 103990 + }, + { + "epoch": 15.489276139410187, + "grad_norm": 0.036865234375, + "learning_rate": 0.004413545114352574, + "loss": 0.8024, + "num_input_tokens_seen": 60381752, + "step": 103995 + }, + { + "epoch": 15.490020851951147, + "grad_norm": 0.04248046875, + "learning_rate": 0.004412163977355238, + "loss": 0.794, + "num_input_tokens_seen": 60384600, + "step": 104000 + }, + { + "epoch": 15.490765564492106, + "grad_norm": 0.037353515625, + "learning_rate": 0.004410783019228635, + "loss": 0.7908, + "num_input_tokens_seen": 60387416, + "step": 104005 + }, + { + "epoch": 15.491510277033065, + "grad_norm": 0.049560546875, + "learning_rate": 0.004409402239996085, + "loss": 0.8033, + "num_input_tokens_seen": 60390584, + "step": 104010 + }, + { + "epoch": 15.492254989574024, + "grad_norm": 0.05419921875, + "learning_rate": 0.004408021639680926, + "loss": 0.7966, + "num_input_tokens_seen": 60393624, + "step": 104015 + }, + { + "epoch": 15.492999702114984, + "grad_norm": 0.04931640625, + "learning_rate": 0.004406641218306471, + "loss": 0.7934, + "num_input_tokens_seen": 60396600, + "step": 104020 + }, + { + "epoch": 15.493744414655943, + "grad_norm": 0.052734375, + "learning_rate": 0.004405260975896052, + "loss": 0.8288, + "num_input_tokens_seen": 60399160, + "step": 104025 + }, + { + "epoch": 15.494489127196902, + "grad_norm": 0.031982421875, + "learning_rate": 0.00440388091247298, + "loss": 0.8089, + "num_input_tokens_seen": 60401976, + "step": 104030 + }, + { + "epoch": 15.49523383973786, + "grad_norm": 0.0341796875, + "learning_rate": 0.00440250102806057, + "loss": 0.799, + "num_input_tokens_seen": 60404760, + "step": 104035 + }, + { + "epoch": 15.495978552278821, + "grad_norm": 0.0673828125, + "learning_rate": 0.004401121322682135, + "loss": 0.7905, + "num_input_tokens_seen": 60407896, + "step": 104040 + }, + { + "epoch": 15.49672326481978, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0043997417963609784, + "loss": 0.7737, + "num_input_tokens_seen": 60410712, + "step": 104045 + }, + { + "epoch": 15.497467977360738, + "grad_norm": 0.04931640625, + "learning_rate": 0.004398362449120416, + "loss": 0.7723, + "num_input_tokens_seen": 60413752, + "step": 104050 + }, + { + "epoch": 15.498212689901697, + "grad_norm": 0.04833984375, + "learning_rate": 0.004396983280983737, + "loss": 0.7902, + "num_input_tokens_seen": 60416568, + "step": 104055 + }, + { + "epoch": 15.498957402442658, + "grad_norm": 0.03564453125, + "learning_rate": 0.004395604291974257, + "loss": 0.7802, + "num_input_tokens_seen": 60419640, + "step": 104060 + }, + { + "epoch": 15.499702114983616, + "grad_norm": 0.0380859375, + "learning_rate": 0.004394225482115258, + "loss": 0.8015, + "num_input_tokens_seen": 60422712, + "step": 104065 + }, + { + "epoch": 15.500446827524575, + "grad_norm": 0.03759765625, + "learning_rate": 0.004392846851430047, + "loss": 0.8131, + "num_input_tokens_seen": 60425688, + "step": 104070 + }, + { + "epoch": 15.501191540065534, + "grad_norm": 0.051513671875, + "learning_rate": 0.004391468399941906, + "loss": 0.7938, + "num_input_tokens_seen": 60428472, + "step": 104075 + }, + { + "epoch": 15.501936252606495, + "grad_norm": 0.044677734375, + "learning_rate": 0.004390090127674121, + "loss": 0.7871, + "num_input_tokens_seen": 60431640, + "step": 104080 + }, + { + "epoch": 15.502680965147453, + "grad_norm": 0.038330078125, + "learning_rate": 0.004388712034649984, + "loss": 0.7973, + "num_input_tokens_seen": 60434360, + "step": 104085 + }, + { + "epoch": 15.503425677688412, + "grad_norm": 0.0634765625, + "learning_rate": 0.004387334120892768, + "loss": 0.7896, + "num_input_tokens_seen": 60437208, + "step": 104090 + }, + { + "epoch": 15.50417039022937, + "grad_norm": 0.04248046875, + "learning_rate": 0.004385956386425759, + "loss": 0.7872, + "num_input_tokens_seen": 60440184, + "step": 104095 + }, + { + "epoch": 15.504915102770331, + "grad_norm": 0.05322265625, + "learning_rate": 0.00438457883127223, + "loss": 0.8136, + "num_input_tokens_seen": 60442968, + "step": 104100 + }, + { + "epoch": 15.50565981531129, + "grad_norm": 0.03515625, + "learning_rate": 0.004383201455455454, + "loss": 0.8217, + "num_input_tokens_seen": 60446200, + "step": 104105 + }, + { + "epoch": 15.506404527852249, + "grad_norm": 0.050537109375, + "learning_rate": 0.004381824258998697, + "loss": 0.7995, + "num_input_tokens_seen": 60448920, + "step": 104110 + }, + { + "epoch": 15.507149240393208, + "grad_norm": 0.04736328125, + "learning_rate": 0.004380447241925223, + "loss": 0.7857, + "num_input_tokens_seen": 60451416, + "step": 104115 + }, + { + "epoch": 15.507893952934168, + "grad_norm": 0.057861328125, + "learning_rate": 0.004379070404258305, + "loss": 0.8031, + "num_input_tokens_seen": 60454456, + "step": 104120 + }, + { + "epoch": 15.508638665475127, + "grad_norm": 0.0458984375, + "learning_rate": 0.004377693746021194, + "loss": 0.7779, + "num_input_tokens_seen": 60457368, + "step": 104125 + }, + { + "epoch": 15.509383378016086, + "grad_norm": 0.0264892578125, + "learning_rate": 0.0043763172672371545, + "loss": 0.8369, + "num_input_tokens_seen": 60459992, + "step": 104130 + }, + { + "epoch": 15.510128090557044, + "grad_norm": 0.036865234375, + "learning_rate": 0.004374940967929433, + "loss": 0.7976, + "num_input_tokens_seen": 60462840, + "step": 104135 + }, + { + "epoch": 15.510872803098005, + "grad_norm": 0.051513671875, + "learning_rate": 0.004373564848121291, + "loss": 0.7985, + "num_input_tokens_seen": 60465816, + "step": 104140 + }, + { + "epoch": 15.511617515638964, + "grad_norm": 0.03759765625, + "learning_rate": 0.0043721889078359635, + "loss": 0.8008, + "num_input_tokens_seen": 60468568, + "step": 104145 + }, + { + "epoch": 15.512362228179922, + "grad_norm": 0.052734375, + "learning_rate": 0.004370813147096708, + "loss": 0.7949, + "num_input_tokens_seen": 60471256, + "step": 104150 + }, + { + "epoch": 15.513106940720881, + "grad_norm": 0.045654296875, + "learning_rate": 0.0043694375659267615, + "loss": 0.79, + "num_input_tokens_seen": 60474456, + "step": 104155 + }, + { + "epoch": 15.513851653261842, + "grad_norm": 0.039794921875, + "learning_rate": 0.0043680621643493595, + "loss": 0.803, + "num_input_tokens_seen": 60476952, + "step": 104160 + }, + { + "epoch": 15.5145963658028, + "grad_norm": 0.03564453125, + "learning_rate": 0.004366686942387745, + "loss": 0.8027, + "num_input_tokens_seen": 60479864, + "step": 104165 + }, + { + "epoch": 15.51534107834376, + "grad_norm": 0.03466796875, + "learning_rate": 0.004365311900065149, + "loss": 0.8053, + "num_input_tokens_seen": 60482904, + "step": 104170 + }, + { + "epoch": 15.516085790884718, + "grad_norm": 0.050537109375, + "learning_rate": 0.004363937037404798, + "loss": 0.7956, + "num_input_tokens_seen": 60485496, + "step": 104175 + }, + { + "epoch": 15.516830503425677, + "grad_norm": 0.047119140625, + "learning_rate": 0.004362562354429916, + "loss": 0.7829, + "num_input_tokens_seen": 60488408, + "step": 104180 + }, + { + "epoch": 15.517575215966637, + "grad_norm": 0.03271484375, + "learning_rate": 0.004361187851163737, + "loss": 0.7981, + "num_input_tokens_seen": 60491288, + "step": 104185 + }, + { + "epoch": 15.518319928507596, + "grad_norm": 0.0289306640625, + "learning_rate": 0.004359813527629476, + "loss": 0.7785, + "num_input_tokens_seen": 60494200, + "step": 104190 + }, + { + "epoch": 15.519064641048555, + "grad_norm": 0.054931640625, + "learning_rate": 0.004358439383850348, + "loss": 0.7754, + "num_input_tokens_seen": 60496952, + "step": 104195 + }, + { + "epoch": 15.519809353589514, + "grad_norm": 0.04541015625, + "learning_rate": 0.004357065419849574, + "loss": 0.7985, + "num_input_tokens_seen": 60499928, + "step": 104200 + }, + { + "epoch": 15.520554066130474, + "grad_norm": 0.058837890625, + "learning_rate": 0.004355691635650358, + "loss": 0.8012, + "num_input_tokens_seen": 60503128, + "step": 104205 + }, + { + "epoch": 15.521298778671433, + "grad_norm": 0.0390625, + "learning_rate": 0.00435431803127592, + "loss": 0.82, + "num_input_tokens_seen": 60506264, + "step": 104210 + }, + { + "epoch": 15.522043491212392, + "grad_norm": 0.035888671875, + "learning_rate": 0.004352944606749454, + "loss": 0.8176, + "num_input_tokens_seen": 60509176, + "step": 104215 + }, + { + "epoch": 15.52278820375335, + "grad_norm": 0.041015625, + "learning_rate": 0.004351571362094171, + "loss": 0.843, + "num_input_tokens_seen": 60511960, + "step": 104220 + }, + { + "epoch": 15.523532916294311, + "grad_norm": 0.0517578125, + "learning_rate": 0.004350198297333263, + "loss": 0.7961, + "num_input_tokens_seen": 60514904, + "step": 104225 + }, + { + "epoch": 15.52427762883527, + "grad_norm": 0.0732421875, + "learning_rate": 0.004348825412489935, + "loss": 0.8434, + "num_input_tokens_seen": 60517752, + "step": 104230 + }, + { + "epoch": 15.525022341376228, + "grad_norm": 0.0458984375, + "learning_rate": 0.004347452707587377, + "loss": 0.7947, + "num_input_tokens_seen": 60520824, + "step": 104235 + }, + { + "epoch": 15.525767053917187, + "grad_norm": 0.02978515625, + "learning_rate": 0.0043460801826487785, + "loss": 0.8007, + "num_input_tokens_seen": 60523512, + "step": 104240 + }, + { + "epoch": 15.526511766458148, + "grad_norm": 0.037841796875, + "learning_rate": 0.004344707837697327, + "loss": 0.8072, + "num_input_tokens_seen": 60526456, + "step": 104245 + }, + { + "epoch": 15.527256478999107, + "grad_norm": 0.031982421875, + "learning_rate": 0.004343335672756202, + "loss": 0.7706, + "num_input_tokens_seen": 60529048, + "step": 104250 + }, + { + "epoch": 15.528001191540065, + "grad_norm": 0.04833984375, + "learning_rate": 0.004341963687848596, + "loss": 0.7908, + "num_input_tokens_seen": 60532024, + "step": 104255 + }, + { + "epoch": 15.528745904081024, + "grad_norm": 0.052978515625, + "learning_rate": 0.004340591882997675, + "loss": 0.787, + "num_input_tokens_seen": 60534744, + "step": 104260 + }, + { + "epoch": 15.529490616621985, + "grad_norm": 0.042236328125, + "learning_rate": 0.004339220258226625, + "loss": 0.8304, + "num_input_tokens_seen": 60537688, + "step": 104265 + }, + { + "epoch": 15.530235329162943, + "grad_norm": 0.038330078125, + "learning_rate": 0.0043378488135586155, + "loss": 0.7828, + "num_input_tokens_seen": 60540600, + "step": 104270 + }, + { + "epoch": 15.530980041703902, + "grad_norm": 0.05322265625, + "learning_rate": 0.004336477549016808, + "loss": 0.8373, + "num_input_tokens_seen": 60543192, + "step": 104275 + }, + { + "epoch": 15.53172475424486, + "grad_norm": 0.050537109375, + "learning_rate": 0.004335106464624378, + "loss": 0.8054, + "num_input_tokens_seen": 60545880, + "step": 104280 + }, + { + "epoch": 15.532469466785821, + "grad_norm": 0.05126953125, + "learning_rate": 0.004333735560404483, + "loss": 0.7752, + "num_input_tokens_seen": 60548792, + "step": 104285 + }, + { + "epoch": 15.53321417932678, + "grad_norm": 0.055908203125, + "learning_rate": 0.004332364836380287, + "loss": 0.7925, + "num_input_tokens_seen": 60551864, + "step": 104290 + }, + { + "epoch": 15.533958891867739, + "grad_norm": 0.03759765625, + "learning_rate": 0.004330994292574941, + "loss": 0.7934, + "num_input_tokens_seen": 60554808, + "step": 104295 + }, + { + "epoch": 15.534703604408698, + "grad_norm": 0.038330078125, + "learning_rate": 0.004329623929011608, + "loss": 0.7871, + "num_input_tokens_seen": 60557784, + "step": 104300 + }, + { + "epoch": 15.535448316949658, + "grad_norm": 0.049560546875, + "learning_rate": 0.0043282537457134335, + "loss": 0.7936, + "num_input_tokens_seen": 60560696, + "step": 104305 + }, + { + "epoch": 15.536193029490617, + "grad_norm": 0.04296875, + "learning_rate": 0.004326883742703566, + "loss": 0.7978, + "num_input_tokens_seen": 60563512, + "step": 104310 + }, + { + "epoch": 15.536937742031576, + "grad_norm": 0.035888671875, + "learning_rate": 0.00432551392000515, + "loss": 0.7963, + "num_input_tokens_seen": 60566552, + "step": 104315 + }, + { + "epoch": 15.537682454572534, + "grad_norm": 0.03564453125, + "learning_rate": 0.0043241442776413225, + "loss": 0.8117, + "num_input_tokens_seen": 60569464, + "step": 104320 + }, + { + "epoch": 15.538427167113493, + "grad_norm": 0.033447265625, + "learning_rate": 0.004322774815635232, + "loss": 0.8018, + "num_input_tokens_seen": 60572280, + "step": 104325 + }, + { + "epoch": 15.539171879654454, + "grad_norm": 0.048583984375, + "learning_rate": 0.004321405534010003, + "loss": 0.7934, + "num_input_tokens_seen": 60575192, + "step": 104330 + }, + { + "epoch": 15.539916592195413, + "grad_norm": 0.05810546875, + "learning_rate": 0.0043200364327887816, + "loss": 0.7766, + "num_input_tokens_seen": 60577912, + "step": 104335 + }, + { + "epoch": 15.540661304736371, + "grad_norm": 0.047119140625, + "learning_rate": 0.004318667511994683, + "loss": 0.7725, + "num_input_tokens_seen": 60580664, + "step": 104340 + }, + { + "epoch": 15.541406017277332, + "grad_norm": 0.02978515625, + "learning_rate": 0.0043172987716508454, + "loss": 0.775, + "num_input_tokens_seen": 60583640, + "step": 104345 + }, + { + "epoch": 15.54215072981829, + "grad_norm": 0.09423828125, + "learning_rate": 0.004315930211780384, + "loss": 0.8179, + "num_input_tokens_seen": 60586552, + "step": 104350 + }, + { + "epoch": 15.54289544235925, + "grad_norm": 0.042724609375, + "learning_rate": 0.004314561832406427, + "loss": 0.7888, + "num_input_tokens_seen": 60589272, + "step": 104355 + }, + { + "epoch": 15.543640154900208, + "grad_norm": 0.043701171875, + "learning_rate": 0.004313193633552088, + "loss": 0.7931, + "num_input_tokens_seen": 60592376, + "step": 104360 + }, + { + "epoch": 15.544384867441167, + "grad_norm": 0.08154296875, + "learning_rate": 0.004311825615240482, + "loss": 0.7776, + "num_input_tokens_seen": 60595160, + "step": 104365 + }, + { + "epoch": 15.545129579982127, + "grad_norm": 0.03271484375, + "learning_rate": 0.004310457777494717, + "loss": 0.815, + "num_input_tokens_seen": 60597912, + "step": 104370 + }, + { + "epoch": 15.545874292523086, + "grad_norm": 0.042724609375, + "learning_rate": 0.0043090901203379, + "loss": 0.781, + "num_input_tokens_seen": 60600792, + "step": 104375 + }, + { + "epoch": 15.546619005064045, + "grad_norm": 0.042724609375, + "learning_rate": 0.0043077226437931455, + "loss": 0.8121, + "num_input_tokens_seen": 60603864, + "step": 104380 + }, + { + "epoch": 15.547363717605004, + "grad_norm": 0.03466796875, + "learning_rate": 0.004306355347883544, + "loss": 0.8016, + "num_input_tokens_seen": 60606392, + "step": 104385 + }, + { + "epoch": 15.548108430145964, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0043049882326322035, + "loss": 0.7877, + "num_input_tokens_seen": 60609464, + "step": 104390 + }, + { + "epoch": 15.548853142686923, + "grad_norm": 0.035888671875, + "learning_rate": 0.00430362129806222, + "loss": 0.7908, + "num_input_tokens_seen": 60612376, + "step": 104395 + }, + { + "epoch": 15.549597855227882, + "grad_norm": 0.0308837890625, + "learning_rate": 0.004302254544196677, + "loss": 0.81, + "num_input_tokens_seen": 60615384, + "step": 104400 + }, + { + "epoch": 15.55034256776884, + "grad_norm": 0.0419921875, + "learning_rate": 0.004300887971058675, + "loss": 0.7873, + "num_input_tokens_seen": 60618424, + "step": 104405 + }, + { + "epoch": 15.551087280309801, + "grad_norm": 0.037353515625, + "learning_rate": 0.004299521578671292, + "loss": 0.7993, + "num_input_tokens_seen": 60621048, + "step": 104410 + }, + { + "epoch": 15.55183199285076, + "grad_norm": 0.04931640625, + "learning_rate": 0.004298155367057622, + "loss": 0.8175, + "num_input_tokens_seen": 60623896, + "step": 104415 + }, + { + "epoch": 15.552576705391719, + "grad_norm": 0.033935546875, + "learning_rate": 0.0042967893362407346, + "loss": 0.8055, + "num_input_tokens_seen": 60626616, + "step": 104420 + }, + { + "epoch": 15.553321417932677, + "grad_norm": 0.03515625, + "learning_rate": 0.0042954234862437185, + "loss": 0.7939, + "num_input_tokens_seen": 60629240, + "step": 104425 + }, + { + "epoch": 15.554066130473638, + "grad_norm": 0.05224609375, + "learning_rate": 0.004294057817089642, + "loss": 0.7843, + "num_input_tokens_seen": 60632248, + "step": 104430 + }, + { + "epoch": 15.554810843014597, + "grad_norm": 0.035888671875, + "learning_rate": 0.00429269232880158, + "loss": 0.8052, + "num_input_tokens_seen": 60635192, + "step": 104435 + }, + { + "epoch": 15.555555555555555, + "grad_norm": 0.038330078125, + "learning_rate": 0.004291327021402596, + "loss": 0.773, + "num_input_tokens_seen": 60637784, + "step": 104440 + }, + { + "epoch": 15.556300268096514, + "grad_norm": 0.03466796875, + "learning_rate": 0.004289961894915754, + "loss": 0.822, + "num_input_tokens_seen": 60640664, + "step": 104445 + }, + { + "epoch": 15.557044980637475, + "grad_norm": 0.1474609375, + "learning_rate": 0.004288596949364127, + "loss": 0.8611, + "num_input_tokens_seen": 60643448, + "step": 104450 + }, + { + "epoch": 15.557789693178433, + "grad_norm": 0.054931640625, + "learning_rate": 0.004287232184770762, + "loss": 0.7703, + "num_input_tokens_seen": 60646168, + "step": 104455 + }, + { + "epoch": 15.558534405719392, + "grad_norm": 0.044189453125, + "learning_rate": 0.004285867601158725, + "loss": 0.7958, + "num_input_tokens_seen": 60649208, + "step": 104460 + }, + { + "epoch": 15.559279118260351, + "grad_norm": 0.039306640625, + "learning_rate": 0.004284503198551063, + "loss": 0.8042, + "num_input_tokens_seen": 60651928, + "step": 104465 + }, + { + "epoch": 15.560023830801311, + "grad_norm": 0.032470703125, + "learning_rate": 0.004283138976970833, + "loss": 0.7843, + "num_input_tokens_seen": 60655000, + "step": 104470 + }, + { + "epoch": 15.56076854334227, + "grad_norm": 0.06396484375, + "learning_rate": 0.004281774936441077, + "loss": 0.8023, + "num_input_tokens_seen": 60657752, + "step": 104475 + }, + { + "epoch": 15.561513255883229, + "grad_norm": 0.032470703125, + "learning_rate": 0.004280411076984835, + "loss": 0.7909, + "num_input_tokens_seen": 60660408, + "step": 104480 + }, + { + "epoch": 15.562257968424188, + "grad_norm": 0.032958984375, + "learning_rate": 0.004279047398625158, + "loss": 0.8076, + "num_input_tokens_seen": 60663256, + "step": 104485 + }, + { + "epoch": 15.563002680965148, + "grad_norm": 0.033935546875, + "learning_rate": 0.004277683901385073, + "loss": 0.8065, + "num_input_tokens_seen": 60666136, + "step": 104490 + }, + { + "epoch": 15.563747393506107, + "grad_norm": 0.03857421875, + "learning_rate": 0.004276320585287628, + "loss": 0.7783, + "num_input_tokens_seen": 60668824, + "step": 104495 + }, + { + "epoch": 15.564492106047066, + "grad_norm": 0.048095703125, + "learning_rate": 0.004274957450355846, + "loss": 0.7861, + "num_input_tokens_seen": 60671480, + "step": 104500 + }, + { + "epoch": 15.565236818588025, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0042735944966127565, + "loss": 0.7961, + "num_input_tokens_seen": 60674072, + "step": 104505 + }, + { + "epoch": 15.565981531128983, + "grad_norm": 0.059814453125, + "learning_rate": 0.004272231724081384, + "loss": 0.7958, + "num_input_tokens_seen": 60677752, + "step": 104510 + }, + { + "epoch": 15.566726243669944, + "grad_norm": 0.04052734375, + "learning_rate": 0.004270869132784756, + "loss": 0.7912, + "num_input_tokens_seen": 60680824, + "step": 104515 + }, + { + "epoch": 15.567470956210903, + "grad_norm": 0.07080078125, + "learning_rate": 0.004269506722745891, + "loss": 0.7856, + "num_input_tokens_seen": 60684408, + "step": 104520 + }, + { + "epoch": 15.568215668751861, + "grad_norm": 0.035888671875, + "learning_rate": 0.0042681444939877975, + "loss": 0.8002, + "num_input_tokens_seen": 60687192, + "step": 104525 + }, + { + "epoch": 15.568960381292822, + "grad_norm": 0.054931640625, + "learning_rate": 0.004266782446533501, + "loss": 0.8015, + "num_input_tokens_seen": 60690296, + "step": 104530 + }, + { + "epoch": 15.56970509383378, + "grad_norm": 0.041259765625, + "learning_rate": 0.004265420580406002, + "loss": 0.7859, + "num_input_tokens_seen": 60693240, + "step": 104535 + }, + { + "epoch": 15.57044980637474, + "grad_norm": 0.03955078125, + "learning_rate": 0.004264058895628317, + "loss": 0.7869, + "num_input_tokens_seen": 60696056, + "step": 104540 + }, + { + "epoch": 15.571194518915698, + "grad_norm": 0.036376953125, + "learning_rate": 0.00426269739222344, + "loss": 0.7939, + "num_input_tokens_seen": 60698840, + "step": 104545 + }, + { + "epoch": 15.571939231456657, + "grad_norm": 0.043701171875, + "learning_rate": 0.004261336070214382, + "loss": 0.8272, + "num_input_tokens_seen": 60701464, + "step": 104550 + }, + { + "epoch": 15.572683943997617, + "grad_norm": 0.040771484375, + "learning_rate": 0.004259974929624139, + "loss": 0.8078, + "num_input_tokens_seen": 60704856, + "step": 104555 + }, + { + "epoch": 15.573428656538576, + "grad_norm": 0.051513671875, + "learning_rate": 0.004258613970475698, + "loss": 0.794, + "num_input_tokens_seen": 60707640, + "step": 104560 + }, + { + "epoch": 15.574173369079535, + "grad_norm": 0.03271484375, + "learning_rate": 0.00425725319279206, + "loss": 0.7905, + "num_input_tokens_seen": 60710264, + "step": 104565 + }, + { + "epoch": 15.574918081620494, + "grad_norm": 0.042236328125, + "learning_rate": 0.0042558925965962115, + "loss": 0.7823, + "num_input_tokens_seen": 60713048, + "step": 104570 + }, + { + "epoch": 15.575662794161454, + "grad_norm": 0.05029296875, + "learning_rate": 0.004254532181911138, + "loss": 0.7843, + "num_input_tokens_seen": 60716120, + "step": 104575 + }, + { + "epoch": 15.576407506702413, + "grad_norm": 0.049560546875, + "learning_rate": 0.004253171948759818, + "loss": 0.7977, + "num_input_tokens_seen": 60719032, + "step": 104580 + }, + { + "epoch": 15.577152219243372, + "grad_norm": 0.047119140625, + "learning_rate": 0.004251811897165239, + "loss": 0.8087, + "num_input_tokens_seen": 60721560, + "step": 104585 + }, + { + "epoch": 15.57789693178433, + "grad_norm": 0.0625, + "learning_rate": 0.004250452027150372, + "loss": 0.8044, + "num_input_tokens_seen": 60724408, + "step": 104590 + }, + { + "epoch": 15.578641644325291, + "grad_norm": 0.23046875, + "learning_rate": 0.004249092338738194, + "loss": 0.8233, + "num_input_tokens_seen": 60727224, + "step": 104595 + }, + { + "epoch": 15.57938635686625, + "grad_norm": 0.034423828125, + "learning_rate": 0.004247732831951677, + "loss": 0.8093, + "num_input_tokens_seen": 60731064, + "step": 104600 + }, + { + "epoch": 15.580131069407209, + "grad_norm": 0.06884765625, + "learning_rate": 0.00424637350681378, + "loss": 0.7974, + "num_input_tokens_seen": 60733944, + "step": 104605 + }, + { + "epoch": 15.580875781948167, + "grad_norm": 0.05029296875, + "learning_rate": 0.004245014363347476, + "loss": 0.7972, + "num_input_tokens_seen": 60736856, + "step": 104610 + }, + { + "epoch": 15.581620494489128, + "grad_norm": 0.044189453125, + "learning_rate": 0.0042436554015757195, + "loss": 0.7865, + "num_input_tokens_seen": 60739928, + "step": 104615 + }, + { + "epoch": 15.582365207030087, + "grad_norm": 0.02734375, + "learning_rate": 0.004242296621521479, + "loss": 0.7906, + "num_input_tokens_seen": 60742552, + "step": 104620 + }, + { + "epoch": 15.583109919571045, + "grad_norm": 0.051025390625, + "learning_rate": 0.004240938023207698, + "loss": 0.8047, + "num_input_tokens_seen": 60745400, + "step": 104625 + }, + { + "epoch": 15.583854632112004, + "grad_norm": 0.033203125, + "learning_rate": 0.004239579606657338, + "loss": 0.7947, + "num_input_tokens_seen": 60748536, + "step": 104630 + }, + { + "epoch": 15.584599344652965, + "grad_norm": 0.03759765625, + "learning_rate": 0.0042382213718933455, + "loss": 0.7857, + "num_input_tokens_seen": 60751672, + "step": 104635 + }, + { + "epoch": 15.585344057193923, + "grad_norm": 0.044677734375, + "learning_rate": 0.004236863318938664, + "loss": 0.7938, + "num_input_tokens_seen": 60754456, + "step": 104640 + }, + { + "epoch": 15.586088769734882, + "grad_norm": 0.048828125, + "learning_rate": 0.004235505447816238, + "loss": 0.7877, + "num_input_tokens_seen": 60757592, + "step": 104645 + }, + { + "epoch": 15.586833482275841, + "grad_norm": 0.07080078125, + "learning_rate": 0.004234147758549003, + "loss": 0.8211, + "num_input_tokens_seen": 60760216, + "step": 104650 + }, + { + "epoch": 15.587578194816802, + "grad_norm": 0.400390625, + "learning_rate": 0.004232790251159902, + "loss": 0.8161, + "num_input_tokens_seen": 60763128, + "step": 104655 + }, + { + "epoch": 15.58832290735776, + "grad_norm": 0.03857421875, + "learning_rate": 0.004231432925671863, + "loss": 0.8085, + "num_input_tokens_seen": 60765944, + "step": 104660 + }, + { + "epoch": 15.589067619898719, + "grad_norm": 0.0361328125, + "learning_rate": 0.004230075782107826, + "loss": 0.8177, + "num_input_tokens_seen": 60768856, + "step": 104665 + }, + { + "epoch": 15.589812332439678, + "grad_norm": 0.037841796875, + "learning_rate": 0.004228718820490706, + "loss": 0.8155, + "num_input_tokens_seen": 60771672, + "step": 104670 + }, + { + "epoch": 15.590557044980638, + "grad_norm": 0.039794921875, + "learning_rate": 0.00422736204084344, + "loss": 0.8055, + "num_input_tokens_seen": 60774648, + "step": 104675 + }, + { + "epoch": 15.591301757521597, + "grad_norm": 0.0478515625, + "learning_rate": 0.004226005443188944, + "loss": 0.7928, + "num_input_tokens_seen": 60777464, + "step": 104680 + }, + { + "epoch": 15.592046470062556, + "grad_norm": 0.053466796875, + "learning_rate": 0.004224649027550129, + "loss": 0.7948, + "num_input_tokens_seen": 60780120, + "step": 104685 + }, + { + "epoch": 15.592791182603515, + "grad_norm": 0.0537109375, + "learning_rate": 0.004223292793949925, + "loss": 0.7883, + "num_input_tokens_seen": 60782904, + "step": 104690 + }, + { + "epoch": 15.593535895144473, + "grad_norm": 0.043701171875, + "learning_rate": 0.004221936742411234, + "loss": 0.7825, + "num_input_tokens_seen": 60785976, + "step": 104695 + }, + { + "epoch": 15.594280607685434, + "grad_norm": 0.02734375, + "learning_rate": 0.004220580872956968, + "loss": 0.7722, + "num_input_tokens_seen": 60788728, + "step": 104700 + }, + { + "epoch": 15.595025320226393, + "grad_norm": 0.035400390625, + "learning_rate": 0.0042192251856100265, + "loss": 0.7879, + "num_input_tokens_seen": 60791608, + "step": 104705 + }, + { + "epoch": 15.595770032767351, + "grad_norm": 0.03759765625, + "learning_rate": 0.004217869680393324, + "loss": 0.7731, + "num_input_tokens_seen": 60794584, + "step": 104710 + }, + { + "epoch": 15.59651474530831, + "grad_norm": 0.05029296875, + "learning_rate": 0.004216514357329755, + "loss": 0.798, + "num_input_tokens_seen": 60797336, + "step": 104715 + }, + { + "epoch": 15.59725945784927, + "grad_norm": 0.07861328125, + "learning_rate": 0.004215159216442213, + "loss": 0.7873, + "num_input_tokens_seen": 60799992, + "step": 104720 + }, + { + "epoch": 15.59800417039023, + "grad_norm": 0.046630859375, + "learning_rate": 0.004213804257753597, + "loss": 0.7988, + "num_input_tokens_seen": 60803000, + "step": 104725 + }, + { + "epoch": 15.598748882931188, + "grad_norm": 0.051513671875, + "learning_rate": 0.004212449481286791, + "loss": 0.7737, + "num_input_tokens_seen": 60805752, + "step": 104730 + }, + { + "epoch": 15.599493595472147, + "grad_norm": 0.0281982421875, + "learning_rate": 0.004211094887064693, + "loss": 0.801, + "num_input_tokens_seen": 60808760, + "step": 104735 + }, + { + "epoch": 15.600238308013108, + "grad_norm": 0.03076171875, + "learning_rate": 0.004209740475110176, + "loss": 0.7946, + "num_input_tokens_seen": 60811928, + "step": 104740 + }, + { + "epoch": 15.600983020554066, + "grad_norm": 0.044189453125, + "learning_rate": 0.004208386245446131, + "loss": 0.8025, + "num_input_tokens_seen": 60814424, + "step": 104745 + }, + { + "epoch": 15.601727733095025, + "grad_norm": 0.040283203125, + "learning_rate": 0.004207032198095429, + "loss": 0.798, + "num_input_tokens_seen": 60817368, + "step": 104750 + }, + { + "epoch": 15.602472445635984, + "grad_norm": 0.072265625, + "learning_rate": 0.004205678333080952, + "loss": 0.7997, + "num_input_tokens_seen": 60820088, + "step": 104755 + }, + { + "epoch": 15.603217158176944, + "grad_norm": 0.029541015625, + "learning_rate": 0.004204324650425569, + "loss": 0.7978, + "num_input_tokens_seen": 60822776, + "step": 104760 + }, + { + "epoch": 15.603961870717903, + "grad_norm": 0.0361328125, + "learning_rate": 0.004202971150152149, + "loss": 0.8, + "num_input_tokens_seen": 60825720, + "step": 104765 + }, + { + "epoch": 15.604706583258862, + "grad_norm": 0.033447265625, + "learning_rate": 0.004201617832283559, + "loss": 0.8009, + "num_input_tokens_seen": 60828568, + "step": 104770 + }, + { + "epoch": 15.60545129579982, + "grad_norm": 0.0361328125, + "learning_rate": 0.004200264696842655, + "loss": 0.7766, + "num_input_tokens_seen": 60831640, + "step": 104775 + }, + { + "epoch": 15.606196008340781, + "grad_norm": 0.051513671875, + "learning_rate": 0.004198911743852307, + "loss": 0.8035, + "num_input_tokens_seen": 60834648, + "step": 104780 + }, + { + "epoch": 15.60694072088174, + "grad_norm": 0.034423828125, + "learning_rate": 0.0041975589733353625, + "loss": 0.799, + "num_input_tokens_seen": 60837784, + "step": 104785 + }, + { + "epoch": 15.607685433422699, + "grad_norm": 0.05078125, + "learning_rate": 0.004196206385314686, + "loss": 0.8027, + "num_input_tokens_seen": 60840632, + "step": 104790 + }, + { + "epoch": 15.608430145963657, + "grad_norm": 0.0361328125, + "learning_rate": 0.0041948539798131205, + "loss": 0.7723, + "num_input_tokens_seen": 60843608, + "step": 104795 + }, + { + "epoch": 15.609174858504618, + "grad_norm": 0.043701171875, + "learning_rate": 0.004193501756853512, + "loss": 0.8098, + "num_input_tokens_seen": 60846328, + "step": 104800 + }, + { + "epoch": 15.609919571045577, + "grad_norm": 0.0361328125, + "learning_rate": 0.004192149716458712, + "loss": 0.7809, + "num_input_tokens_seen": 60849336, + "step": 104805 + }, + { + "epoch": 15.610664283586535, + "grad_norm": 0.03857421875, + "learning_rate": 0.004190797858651553, + "loss": 0.8031, + "num_input_tokens_seen": 60852248, + "step": 104810 + }, + { + "epoch": 15.611408996127494, + "grad_norm": 0.0322265625, + "learning_rate": 0.004189446183454883, + "loss": 0.8102, + "num_input_tokens_seen": 60855512, + "step": 104815 + }, + { + "epoch": 15.612153708668455, + "grad_norm": 0.062255859375, + "learning_rate": 0.004188094690891528, + "loss": 0.8037, + "num_input_tokens_seen": 60858648, + "step": 104820 + }, + { + "epoch": 15.612898421209414, + "grad_norm": 0.035888671875, + "learning_rate": 0.004186743380984328, + "loss": 0.7831, + "num_input_tokens_seen": 60861624, + "step": 104825 + }, + { + "epoch": 15.613643133750372, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0041853922537561095, + "loss": 0.8076, + "num_input_tokens_seen": 60864568, + "step": 104830 + }, + { + "epoch": 15.614387846291331, + "grad_norm": 0.05908203125, + "learning_rate": 0.004184041309229697, + "loss": 0.7967, + "num_input_tokens_seen": 60867512, + "step": 104835 + }, + { + "epoch": 15.615132558832292, + "grad_norm": 0.036865234375, + "learning_rate": 0.004182690547427915, + "loss": 0.8152, + "num_input_tokens_seen": 60870328, + "step": 104840 + }, + { + "epoch": 15.61587727137325, + "grad_norm": 0.0634765625, + "learning_rate": 0.004181339968373577, + "loss": 0.7931, + "num_input_tokens_seen": 60872952, + "step": 104845 + }, + { + "epoch": 15.616621983914209, + "grad_norm": 0.052001953125, + "learning_rate": 0.004179989572089507, + "loss": 0.7915, + "num_input_tokens_seen": 60875928, + "step": 104850 + }, + { + "epoch": 15.617366696455168, + "grad_norm": 0.03662109375, + "learning_rate": 0.004178639358598513, + "loss": 0.8032, + "num_input_tokens_seen": 60878872, + "step": 104855 + }, + { + "epoch": 15.618111408996128, + "grad_norm": 0.03564453125, + "learning_rate": 0.004177289327923414, + "loss": 0.7839, + "num_input_tokens_seen": 60881848, + "step": 104860 + }, + { + "epoch": 15.618856121537087, + "grad_norm": 0.053955078125, + "learning_rate": 0.004175939480087008, + "loss": 0.7985, + "num_input_tokens_seen": 60884824, + "step": 104865 + }, + { + "epoch": 15.619600834078046, + "grad_norm": 0.05126953125, + "learning_rate": 0.004174589815112107, + "loss": 0.7789, + "num_input_tokens_seen": 60887416, + "step": 104870 + }, + { + "epoch": 15.620345546619005, + "grad_norm": 0.028564453125, + "learning_rate": 0.004173240333021509, + "loss": 0.7886, + "num_input_tokens_seen": 60890520, + "step": 104875 + }, + { + "epoch": 15.621090259159963, + "grad_norm": 0.058349609375, + "learning_rate": 0.004171891033838007, + "loss": 0.8021, + "num_input_tokens_seen": 60893304, + "step": 104880 + }, + { + "epoch": 15.621834971700924, + "grad_norm": 0.03271484375, + "learning_rate": 0.004170541917584405, + "loss": 0.7856, + "num_input_tokens_seen": 60896344, + "step": 104885 + }, + { + "epoch": 15.622579684241883, + "grad_norm": 0.0303955078125, + "learning_rate": 0.004169192984283486, + "loss": 0.7793, + "num_input_tokens_seen": 60899544, + "step": 104890 + }, + { + "epoch": 15.623324396782841, + "grad_norm": 0.0274658203125, + "learning_rate": 0.004167844233958049, + "loss": 0.7918, + "num_input_tokens_seen": 60902488, + "step": 104895 + }, + { + "epoch": 15.6240691093238, + "grad_norm": 0.057861328125, + "learning_rate": 0.004166495666630874, + "loss": 0.8029, + "num_input_tokens_seen": 60905400, + "step": 104900 + }, + { + "epoch": 15.62481382186476, + "grad_norm": 0.049560546875, + "learning_rate": 0.0041651472823247436, + "loss": 0.7981, + "num_input_tokens_seen": 60908024, + "step": 104905 + }, + { + "epoch": 15.62555853440572, + "grad_norm": 0.04833984375, + "learning_rate": 0.004163799081062434, + "loss": 0.8282, + "num_input_tokens_seen": 60910936, + "step": 104910 + }, + { + "epoch": 15.626303246946678, + "grad_norm": 0.04248046875, + "learning_rate": 0.0041624510628667285, + "loss": 0.8015, + "num_input_tokens_seen": 60913624, + "step": 104915 + }, + { + "epoch": 15.627047959487637, + "grad_norm": 0.052490234375, + "learning_rate": 0.004161103227760399, + "loss": 0.7915, + "num_input_tokens_seen": 60916696, + "step": 104920 + }, + { + "epoch": 15.627792672028598, + "grad_norm": 0.041015625, + "learning_rate": 0.00415975557576621, + "loss": 0.8035, + "num_input_tokens_seen": 60919384, + "step": 104925 + }, + { + "epoch": 15.628537384569556, + "grad_norm": 0.035888671875, + "learning_rate": 0.004158408106906937, + "loss": 0.7895, + "num_input_tokens_seen": 60922168, + "step": 104930 + }, + { + "epoch": 15.629282097110515, + "grad_norm": 0.04931640625, + "learning_rate": 0.004157060821205335, + "loss": 0.8099, + "num_input_tokens_seen": 60925240, + "step": 104935 + }, + { + "epoch": 15.630026809651474, + "grad_norm": 0.068359375, + "learning_rate": 0.004155713718684175, + "loss": 0.8334, + "num_input_tokens_seen": 60927832, + "step": 104940 + }, + { + "epoch": 15.630771522192434, + "grad_norm": 0.030029296875, + "learning_rate": 0.004154366799366206, + "loss": 0.8038, + "num_input_tokens_seen": 60931032, + "step": 104945 + }, + { + "epoch": 15.631516234733393, + "grad_norm": 0.037353515625, + "learning_rate": 0.004153020063274191, + "loss": 0.7892, + "num_input_tokens_seen": 60933560, + "step": 104950 + }, + { + "epoch": 15.632260947274352, + "grad_norm": 0.033935546875, + "learning_rate": 0.004151673510430874, + "loss": 0.7945, + "num_input_tokens_seen": 60936376, + "step": 104955 + }, + { + "epoch": 15.63300565981531, + "grad_norm": 0.03759765625, + "learning_rate": 0.00415032714085901, + "loss": 0.7812, + "num_input_tokens_seen": 60939192, + "step": 104960 + }, + { + "epoch": 15.633750372356271, + "grad_norm": 0.050537109375, + "learning_rate": 0.004148980954581344, + "loss": 0.7952, + "num_input_tokens_seen": 60942008, + "step": 104965 + }, + { + "epoch": 15.63449508489723, + "grad_norm": 0.050048828125, + "learning_rate": 0.004147634951620615, + "loss": 0.7825, + "num_input_tokens_seen": 60944696, + "step": 104970 + }, + { + "epoch": 15.635239797438189, + "grad_norm": 0.0419921875, + "learning_rate": 0.004146289131999563, + "loss": 0.794, + "num_input_tokens_seen": 60947416, + "step": 104975 + }, + { + "epoch": 15.635984509979147, + "grad_norm": 0.091796875, + "learning_rate": 0.004144943495740921, + "loss": 0.801, + "num_input_tokens_seen": 60950488, + "step": 104980 + }, + { + "epoch": 15.636729222520108, + "grad_norm": 0.048583984375, + "learning_rate": 0.004143598042867431, + "loss": 0.808, + "num_input_tokens_seen": 60953272, + "step": 104985 + }, + { + "epoch": 15.637473935061067, + "grad_norm": 0.047607421875, + "learning_rate": 0.004142252773401815, + "loss": 0.8049, + "num_input_tokens_seen": 60955928, + "step": 104990 + }, + { + "epoch": 15.638218647602026, + "grad_norm": 0.03955078125, + "learning_rate": 0.004140907687366806, + "loss": 0.8086, + "num_input_tokens_seen": 60958840, + "step": 104995 + }, + { + "epoch": 15.638963360142984, + "grad_norm": 0.0291748046875, + "learning_rate": 0.004139562784785124, + "loss": 0.8138, + "num_input_tokens_seen": 60961560, + "step": 105000 + }, + { + "epoch": 15.639708072683945, + "grad_norm": 0.0311279296875, + "learning_rate": 0.004138218065679488, + "loss": 0.805, + "num_input_tokens_seen": 60964312, + "step": 105005 + }, + { + "epoch": 15.640452785224904, + "grad_norm": 0.03857421875, + "learning_rate": 0.004136873530072621, + "loss": 0.7958, + "num_input_tokens_seen": 60967576, + "step": 105010 + }, + { + "epoch": 15.641197497765862, + "grad_norm": 0.0361328125, + "learning_rate": 0.004135529177987232, + "loss": 0.7952, + "num_input_tokens_seen": 60970680, + "step": 105015 + }, + { + "epoch": 15.641942210306821, + "grad_norm": 0.04931640625, + "learning_rate": 0.004134185009446039, + "loss": 0.7933, + "num_input_tokens_seen": 60973528, + "step": 105020 + }, + { + "epoch": 15.64268692284778, + "grad_norm": 0.047607421875, + "learning_rate": 0.0041328410244717475, + "loss": 0.8015, + "num_input_tokens_seen": 60976376, + "step": 105025 + }, + { + "epoch": 15.64343163538874, + "grad_norm": 0.07177734375, + "learning_rate": 0.0041314972230870614, + "loss": 0.7763, + "num_input_tokens_seen": 60979128, + "step": 105030 + }, + { + "epoch": 15.6441763479297, + "grad_norm": 0.0361328125, + "learning_rate": 0.004130153605314679, + "loss": 0.7928, + "num_input_tokens_seen": 60981848, + "step": 105035 + }, + { + "epoch": 15.644921060470658, + "grad_norm": 0.0625, + "learning_rate": 0.0041288101711773086, + "loss": 0.7911, + "num_input_tokens_seen": 60984856, + "step": 105040 + }, + { + "epoch": 15.645665773011618, + "grad_norm": 0.0556640625, + "learning_rate": 0.004127466920697641, + "loss": 0.7853, + "num_input_tokens_seen": 60987832, + "step": 105045 + }, + { + "epoch": 15.646410485552577, + "grad_norm": 0.041259765625, + "learning_rate": 0.004126123853898366, + "loss": 0.8064, + "num_input_tokens_seen": 60990584, + "step": 105050 + }, + { + "epoch": 15.647155198093536, + "grad_norm": 0.0625, + "learning_rate": 0.00412478097080218, + "loss": 0.7853, + "num_input_tokens_seen": 60993304, + "step": 105055 + }, + { + "epoch": 15.647899910634495, + "grad_norm": 0.04443359375, + "learning_rate": 0.004123438271431762, + "loss": 0.7982, + "num_input_tokens_seen": 60996504, + "step": 105060 + }, + { + "epoch": 15.648644623175453, + "grad_norm": 0.052978515625, + "learning_rate": 0.0041220957558098055, + "loss": 0.8046, + "num_input_tokens_seen": 60999544, + "step": 105065 + }, + { + "epoch": 15.649389335716414, + "grad_norm": 0.036376953125, + "learning_rate": 0.00412075342395898, + "loss": 0.7993, + "num_input_tokens_seen": 61002232, + "step": 105070 + }, + { + "epoch": 15.650134048257373, + "grad_norm": 0.058837890625, + "learning_rate": 0.004119411275901972, + "loss": 0.8049, + "num_input_tokens_seen": 61004888, + "step": 105075 + }, + { + "epoch": 15.650878760798332, + "grad_norm": 0.06494140625, + "learning_rate": 0.004118069311661455, + "loss": 0.8041, + "num_input_tokens_seen": 61007672, + "step": 105080 + }, + { + "epoch": 15.65162347333929, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0041167275312600895, + "loss": 0.8023, + "num_input_tokens_seen": 61010680, + "step": 105085 + }, + { + "epoch": 15.65236818588025, + "grad_norm": 0.052734375, + "learning_rate": 0.004115385934720556, + "loss": 0.7719, + "num_input_tokens_seen": 61014008, + "step": 105090 + }, + { + "epoch": 15.65311289842121, + "grad_norm": 0.054443359375, + "learning_rate": 0.004114044522065516, + "loss": 0.7623, + "num_input_tokens_seen": 61016920, + "step": 105095 + }, + { + "epoch": 15.653857610962168, + "grad_norm": 0.05419921875, + "learning_rate": 0.0041127032933176285, + "loss": 0.8099, + "num_input_tokens_seen": 61020120, + "step": 105100 + }, + { + "epoch": 15.654602323503127, + "grad_norm": 0.05322265625, + "learning_rate": 0.0041113622484995485, + "loss": 0.8586, + "num_input_tokens_seen": 61023064, + "step": 105105 + }, + { + "epoch": 15.655347036044088, + "grad_norm": 0.033447265625, + "learning_rate": 0.0041100213876339425, + "loss": 0.788, + "num_input_tokens_seen": 61026104, + "step": 105110 + }, + { + "epoch": 15.656091748585046, + "grad_norm": 0.054443359375, + "learning_rate": 0.004108680710743453, + "loss": 0.8038, + "num_input_tokens_seen": 61029112, + "step": 105115 + }, + { + "epoch": 15.656836461126005, + "grad_norm": 0.037841796875, + "learning_rate": 0.004107340217850738, + "loss": 0.7875, + "num_input_tokens_seen": 61031896, + "step": 105120 + }, + { + "epoch": 15.657581173666964, + "grad_norm": 0.033935546875, + "learning_rate": 0.004105999908978437, + "loss": 0.8238, + "num_input_tokens_seen": 61034872, + "step": 105125 + }, + { + "epoch": 15.658325886207924, + "grad_norm": 0.07958984375, + "learning_rate": 0.004104659784149193, + "loss": 0.818, + "num_input_tokens_seen": 61037528, + "step": 105130 + }, + { + "epoch": 15.659070598748883, + "grad_norm": 0.05712890625, + "learning_rate": 0.004103319843385651, + "loss": 0.7863, + "num_input_tokens_seen": 61040376, + "step": 105135 + }, + { + "epoch": 15.659815311289842, + "grad_norm": 0.055908203125, + "learning_rate": 0.0041019800867104425, + "loss": 0.8034, + "num_input_tokens_seen": 61043128, + "step": 105140 + }, + { + "epoch": 15.6605600238308, + "grad_norm": 0.049560546875, + "learning_rate": 0.004100640514146208, + "loss": 0.7845, + "num_input_tokens_seen": 61046424, + "step": 105145 + }, + { + "epoch": 15.661304736371761, + "grad_norm": 0.0693359375, + "learning_rate": 0.0040993011257155695, + "loss": 0.8173, + "num_input_tokens_seen": 61049688, + "step": 105150 + }, + { + "epoch": 15.66204944891272, + "grad_norm": 0.03759765625, + "learning_rate": 0.004097961921441164, + "loss": 0.7912, + "num_input_tokens_seen": 61052280, + "step": 105155 + }, + { + "epoch": 15.662794161453679, + "grad_norm": 0.09130859375, + "learning_rate": 0.004096622901345612, + "loss": 0.7953, + "num_input_tokens_seen": 61055288, + "step": 105160 + }, + { + "epoch": 15.663538873994638, + "grad_norm": 0.045654296875, + "learning_rate": 0.004095284065451535, + "loss": 0.7944, + "num_input_tokens_seen": 61058072, + "step": 105165 + }, + { + "epoch": 15.664283586535598, + "grad_norm": 0.047119140625, + "learning_rate": 0.00409394541378155, + "loss": 0.8182, + "num_input_tokens_seen": 61060920, + "step": 105170 + }, + { + "epoch": 15.665028299076557, + "grad_norm": 0.031494140625, + "learning_rate": 0.004092606946358268, + "loss": 0.7917, + "num_input_tokens_seen": 61063768, + "step": 105175 + }, + { + "epoch": 15.665773011617516, + "grad_norm": 0.052734375, + "learning_rate": 0.004091268663204311, + "loss": 0.8032, + "num_input_tokens_seen": 61066808, + "step": 105180 + }, + { + "epoch": 15.666517724158474, + "grad_norm": 0.037841796875, + "learning_rate": 0.004089930564342277, + "loss": 0.7923, + "num_input_tokens_seen": 61069976, + "step": 105185 + }, + { + "epoch": 15.667262436699435, + "grad_norm": 0.044189453125, + "learning_rate": 0.004088592649794783, + "loss": 0.8007, + "num_input_tokens_seen": 61072792, + "step": 105190 + }, + { + "epoch": 15.668007149240394, + "grad_norm": 0.0279541015625, + "learning_rate": 0.004087254919584423, + "loss": 0.792, + "num_input_tokens_seen": 61075896, + "step": 105195 + }, + { + "epoch": 15.668751861781352, + "grad_norm": 0.0498046875, + "learning_rate": 0.004085917373733804, + "loss": 0.8331, + "num_input_tokens_seen": 61078456, + "step": 105200 + }, + { + "epoch": 15.669496574322311, + "grad_norm": 0.04931640625, + "learning_rate": 0.004084580012265516, + "loss": 0.7936, + "num_input_tokens_seen": 61081272, + "step": 105205 + }, + { + "epoch": 15.67024128686327, + "grad_norm": 0.033935546875, + "learning_rate": 0.004083242835202153, + "loss": 0.8389, + "num_input_tokens_seen": 61084088, + "step": 105210 + }, + { + "epoch": 15.67098599940423, + "grad_norm": 0.0439453125, + "learning_rate": 0.00408190584256631, + "loss": 0.7985, + "num_input_tokens_seen": 61087032, + "step": 105215 + }, + { + "epoch": 15.67173071194519, + "grad_norm": 0.025390625, + "learning_rate": 0.004080569034380567, + "loss": 0.8059, + "num_input_tokens_seen": 61089752, + "step": 105220 + }, + { + "epoch": 15.672475424486148, + "grad_norm": 0.034423828125, + "learning_rate": 0.004079232410667516, + "loss": 0.7897, + "num_input_tokens_seen": 61092984, + "step": 105225 + }, + { + "epoch": 15.673220137027108, + "grad_norm": 0.034423828125, + "learning_rate": 0.004077895971449736, + "loss": 0.8023, + "num_input_tokens_seen": 61095864, + "step": 105230 + }, + { + "epoch": 15.673964849568067, + "grad_norm": 0.0299072265625, + "learning_rate": 0.004076559716749802, + "loss": 0.8032, + "num_input_tokens_seen": 61098776, + "step": 105235 + }, + { + "epoch": 15.674709562109026, + "grad_norm": 0.03662109375, + "learning_rate": 0.004075223646590289, + "loss": 0.7862, + "num_input_tokens_seen": 61101752, + "step": 105240 + }, + { + "epoch": 15.675454274649985, + "grad_norm": 0.0439453125, + "learning_rate": 0.004073887760993765, + "loss": 0.8103, + "num_input_tokens_seen": 61104792, + "step": 105245 + }, + { + "epoch": 15.676198987190944, + "grad_norm": 0.039306640625, + "learning_rate": 0.004072552059982805, + "loss": 0.8075, + "num_input_tokens_seen": 61107512, + "step": 105250 + }, + { + "epoch": 15.676943699731904, + "grad_norm": 0.035888671875, + "learning_rate": 0.00407121654357997, + "loss": 0.8098, + "num_input_tokens_seen": 61110328, + "step": 105255 + }, + { + "epoch": 15.677688412272863, + "grad_norm": 0.03466796875, + "learning_rate": 0.0040698812118078265, + "loss": 0.7993, + "num_input_tokens_seen": 61113528, + "step": 105260 + }, + { + "epoch": 15.678433124813822, + "grad_norm": 0.0439453125, + "learning_rate": 0.004068546064688928, + "loss": 0.7919, + "num_input_tokens_seen": 61116248, + "step": 105265 + }, + { + "epoch": 15.67917783735478, + "grad_norm": 0.03857421875, + "learning_rate": 0.004067211102245837, + "loss": 0.7909, + "num_input_tokens_seen": 61119032, + "step": 105270 + }, + { + "epoch": 15.67992254989574, + "grad_norm": 0.052490234375, + "learning_rate": 0.0040658763245011, + "loss": 0.8018, + "num_input_tokens_seen": 61121912, + "step": 105275 + }, + { + "epoch": 15.6806672624367, + "grad_norm": 0.05712890625, + "learning_rate": 0.0040645417314772725, + "loss": 0.7836, + "num_input_tokens_seen": 61124664, + "step": 105280 + }, + { + "epoch": 15.681411974977658, + "grad_norm": 0.039306640625, + "learning_rate": 0.004063207323196899, + "loss": 0.7993, + "num_input_tokens_seen": 61127608, + "step": 105285 + }, + { + "epoch": 15.682156687518617, + "grad_norm": 0.062255859375, + "learning_rate": 0.004061873099682516, + "loss": 0.8108, + "num_input_tokens_seen": 61130424, + "step": 105290 + }, + { + "epoch": 15.682901400059578, + "grad_norm": 0.32421875, + "learning_rate": 0.0040605390609566765, + "loss": 0.8231, + "num_input_tokens_seen": 61133304, + "step": 105295 + }, + { + "epoch": 15.683646112600536, + "grad_norm": 0.03759765625, + "learning_rate": 0.004059205207041909, + "loss": 0.7863, + "num_input_tokens_seen": 61135960, + "step": 105300 + }, + { + "epoch": 15.684390825141495, + "grad_norm": 0.033935546875, + "learning_rate": 0.004057871537960753, + "loss": 0.7839, + "num_input_tokens_seen": 61138680, + "step": 105305 + }, + { + "epoch": 15.685135537682454, + "grad_norm": 0.027587890625, + "learning_rate": 0.004056538053735729, + "loss": 0.835, + "num_input_tokens_seen": 61141336, + "step": 105310 + }, + { + "epoch": 15.685880250223414, + "grad_norm": 0.03857421875, + "learning_rate": 0.004055204754389378, + "loss": 0.8021, + "num_input_tokens_seen": 61144280, + "step": 105315 + }, + { + "epoch": 15.686624962764373, + "grad_norm": 0.059814453125, + "learning_rate": 0.004053871639944219, + "loss": 0.8101, + "num_input_tokens_seen": 61147128, + "step": 105320 + }, + { + "epoch": 15.687369675305332, + "grad_norm": 0.035888671875, + "learning_rate": 0.004052538710422767, + "loss": 0.8095, + "num_input_tokens_seen": 61150008, + "step": 105325 + }, + { + "epoch": 15.68811438784629, + "grad_norm": 0.04248046875, + "learning_rate": 0.004051205965847554, + "loss": 0.7949, + "num_input_tokens_seen": 61153112, + "step": 105330 + }, + { + "epoch": 15.688859100387251, + "grad_norm": 0.048583984375, + "learning_rate": 0.004049873406241083, + "loss": 0.8186, + "num_input_tokens_seen": 61156056, + "step": 105335 + }, + { + "epoch": 15.68960381292821, + "grad_norm": 0.051513671875, + "learning_rate": 0.004048541031625876, + "loss": 0.8092, + "num_input_tokens_seen": 61159352, + "step": 105340 + }, + { + "epoch": 15.690348525469169, + "grad_norm": 0.036865234375, + "learning_rate": 0.004047208842024433, + "loss": 0.8004, + "num_input_tokens_seen": 61162168, + "step": 105345 + }, + { + "epoch": 15.691093238010128, + "grad_norm": 0.05029296875, + "learning_rate": 0.004045876837459268, + "loss": 0.7825, + "num_input_tokens_seen": 61165112, + "step": 105350 + }, + { + "epoch": 15.691837950551088, + "grad_norm": 0.033935546875, + "learning_rate": 0.004044545017952883, + "loss": 0.7955, + "num_input_tokens_seen": 61167864, + "step": 105355 + }, + { + "epoch": 15.692582663092047, + "grad_norm": 0.043701171875, + "learning_rate": 0.004043213383527773, + "loss": 0.7849, + "num_input_tokens_seen": 61170936, + "step": 105360 + }, + { + "epoch": 15.693327375633006, + "grad_norm": 0.02490234375, + "learning_rate": 0.004041881934206434, + "loss": 0.8108, + "num_input_tokens_seen": 61173592, + "step": 105365 + }, + { + "epoch": 15.694072088173964, + "grad_norm": 0.0225830078125, + "learning_rate": 0.004040550670011366, + "loss": 0.7969, + "num_input_tokens_seen": 61176248, + "step": 105370 + }, + { + "epoch": 15.694816800714925, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0040392195909650565, + "loss": 0.7866, + "num_input_tokens_seen": 61179064, + "step": 105375 + }, + { + "epoch": 15.695561513255884, + "grad_norm": 0.033203125, + "learning_rate": 0.004037888697089986, + "loss": 0.8213, + "num_input_tokens_seen": 61181592, + "step": 105380 + }, + { + "epoch": 15.696306225796842, + "grad_norm": 0.058837890625, + "learning_rate": 0.0040365579884086514, + "loss": 0.7888, + "num_input_tokens_seen": 61184088, + "step": 105385 + }, + { + "epoch": 15.697050938337801, + "grad_norm": 0.034423828125, + "learning_rate": 0.004035227464943521, + "loss": 0.7813, + "num_input_tokens_seen": 61186872, + "step": 105390 + }, + { + "epoch": 15.69779565087876, + "grad_norm": 0.05126953125, + "learning_rate": 0.004033897126717083, + "loss": 0.804, + "num_input_tokens_seen": 61190136, + "step": 105395 + }, + { + "epoch": 15.69854036341972, + "grad_norm": 0.040283203125, + "learning_rate": 0.004032566973751806, + "loss": 0.8017, + "num_input_tokens_seen": 61193048, + "step": 105400 + }, + { + "epoch": 15.69928507596068, + "grad_norm": 0.040283203125, + "learning_rate": 0.004031237006070162, + "loss": 0.7975, + "num_input_tokens_seen": 61195992, + "step": 105405 + }, + { + "epoch": 15.700029788501638, + "grad_norm": 0.0654296875, + "learning_rate": 0.004029907223694623, + "loss": 0.7964, + "num_input_tokens_seen": 61198808, + "step": 105410 + }, + { + "epoch": 15.700774501042597, + "grad_norm": 0.03857421875, + "learning_rate": 0.004028577626647647, + "loss": 0.794, + "num_input_tokens_seen": 61201880, + "step": 105415 + }, + { + "epoch": 15.701519213583557, + "grad_norm": 0.046875, + "learning_rate": 0.004027248214951706, + "loss": 0.8024, + "num_input_tokens_seen": 61205208, + "step": 105420 + }, + { + "epoch": 15.702263926124516, + "grad_norm": 0.06494140625, + "learning_rate": 0.0040259189886292564, + "loss": 0.7769, + "num_input_tokens_seen": 61207896, + "step": 105425 + }, + { + "epoch": 15.703008638665475, + "grad_norm": 0.0400390625, + "learning_rate": 0.004024589947702751, + "loss": 0.8083, + "num_input_tokens_seen": 61210744, + "step": 105430 + }, + { + "epoch": 15.703753351206434, + "grad_norm": 0.04345703125, + "learning_rate": 0.0040232610921946375, + "loss": 0.8216, + "num_input_tokens_seen": 61213720, + "step": 105435 + }, + { + "epoch": 15.704498063747394, + "grad_norm": 0.0458984375, + "learning_rate": 0.004021932422127377, + "loss": 0.7886, + "num_input_tokens_seen": 61216408, + "step": 105440 + }, + { + "epoch": 15.705242776288353, + "grad_norm": 0.03466796875, + "learning_rate": 0.0040206039375234115, + "loss": 0.7951, + "num_input_tokens_seen": 61219192, + "step": 105445 + }, + { + "epoch": 15.705987488829312, + "grad_norm": 0.283203125, + "learning_rate": 0.004019275638405178, + "loss": 0.8176, + "num_input_tokens_seen": 61222488, + "step": 105450 + }, + { + "epoch": 15.70673220137027, + "grad_norm": 0.047607421875, + "learning_rate": 0.004017947524795126, + "loss": 0.7853, + "num_input_tokens_seen": 61225656, + "step": 105455 + }, + { + "epoch": 15.707476913911231, + "grad_norm": 0.049560546875, + "learning_rate": 0.004016619596715685, + "loss": 0.8056, + "num_input_tokens_seen": 61228664, + "step": 105460 + }, + { + "epoch": 15.70822162645219, + "grad_norm": 0.0252685546875, + "learning_rate": 0.004015291854189298, + "loss": 0.8035, + "num_input_tokens_seen": 61231384, + "step": 105465 + }, + { + "epoch": 15.708966338993148, + "grad_norm": 0.060302734375, + "learning_rate": 0.004013964297238385, + "loss": 0.7882, + "num_input_tokens_seen": 61234296, + "step": 105470 + }, + { + "epoch": 15.709711051534107, + "grad_norm": 0.04931640625, + "learning_rate": 0.004012636925885386, + "loss": 0.8002, + "num_input_tokens_seen": 61237208, + "step": 105475 + }, + { + "epoch": 15.710455764075068, + "grad_norm": 0.06201171875, + "learning_rate": 0.004011309740152717, + "loss": 0.8043, + "num_input_tokens_seen": 61240312, + "step": 105480 + }, + { + "epoch": 15.711200476616026, + "grad_norm": 0.05224609375, + "learning_rate": 0.004009982740062798, + "loss": 0.7918, + "num_input_tokens_seen": 61243288, + "step": 105485 + }, + { + "epoch": 15.711945189156985, + "grad_norm": 0.03857421875, + "learning_rate": 0.004008655925638056, + "loss": 0.7946, + "num_input_tokens_seen": 61246392, + "step": 105490 + }, + { + "epoch": 15.712689901697944, + "grad_norm": 0.056884765625, + "learning_rate": 0.0040073292969009, + "loss": 0.7836, + "num_input_tokens_seen": 61249624, + "step": 105495 + }, + { + "epoch": 15.713434614238905, + "grad_norm": 0.044677734375, + "learning_rate": 0.004006002853873744, + "loss": 0.8306, + "num_input_tokens_seen": 61252568, + "step": 105500 + }, + { + "epoch": 15.714179326779863, + "grad_norm": 0.035400390625, + "learning_rate": 0.004004676596578991, + "loss": 0.8337, + "num_input_tokens_seen": 61255672, + "step": 105505 + }, + { + "epoch": 15.714924039320822, + "grad_norm": 0.035888671875, + "learning_rate": 0.004003350525039057, + "loss": 0.8119, + "num_input_tokens_seen": 61258360, + "step": 105510 + }, + { + "epoch": 15.71566875186178, + "grad_norm": 0.03857421875, + "learning_rate": 0.004002024639276333, + "loss": 0.7984, + "num_input_tokens_seen": 61261336, + "step": 105515 + }, + { + "epoch": 15.716413464402741, + "grad_norm": 0.0252685546875, + "learning_rate": 0.00400069893931323, + "loss": 0.8008, + "num_input_tokens_seen": 61264184, + "step": 105520 + }, + { + "epoch": 15.7171581769437, + "grad_norm": 0.0498046875, + "learning_rate": 0.00399937342517214, + "loss": 0.8015, + "num_input_tokens_seen": 61267128, + "step": 105525 + }, + { + "epoch": 15.717902889484659, + "grad_norm": 0.03369140625, + "learning_rate": 0.003998048096875452, + "loss": 0.8015, + "num_input_tokens_seen": 61270008, + "step": 105530 + }, + { + "epoch": 15.718647602025618, + "grad_norm": 0.042236328125, + "learning_rate": 0.003996722954445562, + "loss": 0.7928, + "num_input_tokens_seen": 61273272, + "step": 105535 + }, + { + "epoch": 15.719392314566576, + "grad_norm": 0.055419921875, + "learning_rate": 0.003995397997904851, + "loss": 0.8042, + "num_input_tokens_seen": 61276024, + "step": 105540 + }, + { + "epoch": 15.720137027107537, + "grad_norm": 0.040283203125, + "learning_rate": 0.003994073227275709, + "loss": 0.7893, + "num_input_tokens_seen": 61279128, + "step": 105545 + }, + { + "epoch": 15.720881739648496, + "grad_norm": 0.04931640625, + "learning_rate": 0.003992748642580511, + "loss": 0.7933, + "num_input_tokens_seen": 61281912, + "step": 105550 + }, + { + "epoch": 15.721626452189454, + "grad_norm": 0.038818359375, + "learning_rate": 0.003991424243841642, + "loss": 0.7897, + "num_input_tokens_seen": 61284984, + "step": 105555 + }, + { + "epoch": 15.722371164730415, + "grad_norm": 0.023681640625, + "learning_rate": 0.0039901000310814715, + "loss": 0.8044, + "num_input_tokens_seen": 61287928, + "step": 105560 + }, + { + "epoch": 15.723115877271374, + "grad_norm": 0.03857421875, + "learning_rate": 0.003988776004322371, + "loss": 0.7886, + "num_input_tokens_seen": 61290584, + "step": 105565 + }, + { + "epoch": 15.723860589812332, + "grad_norm": 0.03369140625, + "learning_rate": 0.003987452163586708, + "loss": 0.7834, + "num_input_tokens_seen": 61293176, + "step": 105570 + }, + { + "epoch": 15.724605302353291, + "grad_norm": 0.041259765625, + "learning_rate": 0.003986128508896844, + "loss": 0.8051, + "num_input_tokens_seen": 61296280, + "step": 105575 + }, + { + "epoch": 15.72535001489425, + "grad_norm": 0.039306640625, + "learning_rate": 0.00398480504027515, + "loss": 0.7955, + "num_input_tokens_seen": 61299352, + "step": 105580 + }, + { + "epoch": 15.72609472743521, + "grad_norm": 0.034912109375, + "learning_rate": 0.003983481757743975, + "loss": 0.7843, + "num_input_tokens_seen": 61302104, + "step": 105585 + }, + { + "epoch": 15.72683943997617, + "grad_norm": 0.05126953125, + "learning_rate": 0.003982158661325682, + "loss": 0.7874, + "num_input_tokens_seen": 61305144, + "step": 105590 + }, + { + "epoch": 15.727584152517128, + "grad_norm": 0.046875, + "learning_rate": 0.003980835751042618, + "loss": 0.7981, + "num_input_tokens_seen": 61308280, + "step": 105595 + }, + { + "epoch": 15.728328865058087, + "grad_norm": 0.04345703125, + "learning_rate": 0.003979513026917138, + "loss": 0.7919, + "num_input_tokens_seen": 61311128, + "step": 105600 + }, + { + "epoch": 15.729073577599047, + "grad_norm": 0.052490234375, + "learning_rate": 0.003978190488971585, + "loss": 0.7907, + "num_input_tokens_seen": 61314104, + "step": 105605 + }, + { + "epoch": 15.729818290140006, + "grad_norm": 0.042724609375, + "learning_rate": 0.003976868137228296, + "loss": 0.7994, + "num_input_tokens_seen": 61317208, + "step": 105610 + }, + { + "epoch": 15.730563002680965, + "grad_norm": 0.040283203125, + "learning_rate": 0.00397554597170962, + "loss": 0.7909, + "num_input_tokens_seen": 61319928, + "step": 105615 + }, + { + "epoch": 15.731307715221924, + "grad_norm": 0.051513671875, + "learning_rate": 0.0039742239924378895, + "loss": 0.8051, + "num_input_tokens_seen": 61322712, + "step": 105620 + }, + { + "epoch": 15.732052427762884, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00397290219943544, + "loss": 0.8038, + "num_input_tokens_seen": 61325688, + "step": 105625 + }, + { + "epoch": 15.732797140303843, + "grad_norm": 0.037353515625, + "learning_rate": 0.003971580592724601, + "loss": 0.8022, + "num_input_tokens_seen": 61328408, + "step": 105630 + }, + { + "epoch": 15.733541852844802, + "grad_norm": 0.03466796875, + "learning_rate": 0.003970259172327699, + "loss": 0.7914, + "num_input_tokens_seen": 61330872, + "step": 105635 + }, + { + "epoch": 15.73428656538576, + "grad_norm": 0.0272216796875, + "learning_rate": 0.003968937938267055, + "loss": 0.7993, + "num_input_tokens_seen": 61333752, + "step": 105640 + }, + { + "epoch": 15.735031277926721, + "grad_norm": 0.033447265625, + "learning_rate": 0.003967616890564997, + "loss": 0.7999, + "num_input_tokens_seen": 61336600, + "step": 105645 + }, + { + "epoch": 15.73577599046768, + "grad_norm": 0.06298828125, + "learning_rate": 0.003966296029243838, + "loss": 0.7996, + "num_input_tokens_seen": 61339704, + "step": 105650 + }, + { + "epoch": 15.736520703008638, + "grad_norm": 0.0458984375, + "learning_rate": 0.003964975354325888, + "loss": 0.784, + "num_input_tokens_seen": 61342296, + "step": 105655 + }, + { + "epoch": 15.737265415549597, + "grad_norm": 0.036376953125, + "learning_rate": 0.0039636548658334705, + "loss": 0.7885, + "num_input_tokens_seen": 61345304, + "step": 105660 + }, + { + "epoch": 15.738010128090558, + "grad_norm": 0.049072265625, + "learning_rate": 0.0039623345637888815, + "loss": 0.7848, + "num_input_tokens_seen": 61348120, + "step": 105665 + }, + { + "epoch": 15.738754840631517, + "grad_norm": 0.03515625, + "learning_rate": 0.003961014448214437, + "loss": 0.7993, + "num_input_tokens_seen": 61351000, + "step": 105670 + }, + { + "epoch": 15.739499553172475, + "grad_norm": 0.041259765625, + "learning_rate": 0.003959694519132429, + "loss": 0.7916, + "num_input_tokens_seen": 61353720, + "step": 105675 + }, + { + "epoch": 15.740244265713434, + "grad_norm": 0.04248046875, + "learning_rate": 0.003958374776565165, + "loss": 0.7768, + "num_input_tokens_seen": 61356728, + "step": 105680 + }, + { + "epoch": 15.740988978254395, + "grad_norm": 0.044921875, + "learning_rate": 0.0039570552205349385, + "loss": 0.7918, + "num_input_tokens_seen": 61359768, + "step": 105685 + }, + { + "epoch": 15.741733690795353, + "grad_norm": 0.035888671875, + "learning_rate": 0.00395573585106404, + "loss": 0.8055, + "num_input_tokens_seen": 61362808, + "step": 105690 + }, + { + "epoch": 15.742478403336312, + "grad_norm": 0.03466796875, + "learning_rate": 0.003954416668174754, + "loss": 0.8121, + "num_input_tokens_seen": 61365432, + "step": 105695 + }, + { + "epoch": 15.74322311587727, + "grad_norm": 0.047607421875, + "learning_rate": 0.0039530976718893775, + "loss": 0.7831, + "num_input_tokens_seen": 61368536, + "step": 105700 + }, + { + "epoch": 15.743967828418231, + "grad_norm": 0.035888671875, + "learning_rate": 0.003951778862230189, + "loss": 0.8055, + "num_input_tokens_seen": 61371608, + "step": 105705 + }, + { + "epoch": 15.74471254095919, + "grad_norm": 0.040771484375, + "learning_rate": 0.003950460239219461, + "loss": 0.7943, + "num_input_tokens_seen": 61374520, + "step": 105710 + }, + { + "epoch": 15.745457253500149, + "grad_norm": 0.03466796875, + "learning_rate": 0.003949141802879483, + "loss": 0.8016, + "num_input_tokens_seen": 61377176, + "step": 105715 + }, + { + "epoch": 15.746201966041108, + "grad_norm": 0.051513671875, + "learning_rate": 0.003947823553232518, + "loss": 0.7977, + "num_input_tokens_seen": 61379992, + "step": 105720 + }, + { + "epoch": 15.746946678582066, + "grad_norm": 0.0439453125, + "learning_rate": 0.003946505490300845, + "loss": 0.7816, + "num_input_tokens_seen": 61382872, + "step": 105725 + }, + { + "epoch": 15.747691391123027, + "grad_norm": 0.0556640625, + "learning_rate": 0.003945187614106728, + "loss": 0.7858, + "num_input_tokens_seen": 61385752, + "step": 105730 + }, + { + "epoch": 15.748436103663986, + "grad_norm": 0.02783203125, + "learning_rate": 0.003943869924672428, + "loss": 0.7785, + "num_input_tokens_seen": 61388984, + "step": 105735 + }, + { + "epoch": 15.749180816204944, + "grad_norm": 0.04931640625, + "learning_rate": 0.00394255242202021, + "loss": 0.7936, + "num_input_tokens_seen": 61391832, + "step": 105740 + }, + { + "epoch": 15.749925528745905, + "grad_norm": 0.11181640625, + "learning_rate": 0.003941235106172327, + "loss": 0.8321, + "num_input_tokens_seen": 61394648, + "step": 105745 + }, + { + "epoch": 15.750670241286864, + "grad_norm": 0.0294189453125, + "learning_rate": 0.003939917977151043, + "loss": 0.8076, + "num_input_tokens_seen": 61397528, + "step": 105750 + }, + { + "epoch": 15.751414953827823, + "grad_norm": 0.072265625, + "learning_rate": 0.003938601034978604, + "loss": 0.7989, + "num_input_tokens_seen": 61400536, + "step": 105755 + }, + { + "epoch": 15.752159666368781, + "grad_norm": 0.0299072265625, + "learning_rate": 0.003937284279677256, + "loss": 0.8033, + "num_input_tokens_seen": 61403416, + "step": 105760 + }, + { + "epoch": 15.75290437890974, + "grad_norm": 0.033935546875, + "learning_rate": 0.003935967711269249, + "loss": 0.8187, + "num_input_tokens_seen": 61406104, + "step": 105765 + }, + { + "epoch": 15.7536490914507, + "grad_norm": 0.041748046875, + "learning_rate": 0.003934651329776816, + "loss": 0.7944, + "num_input_tokens_seen": 61408888, + "step": 105770 + }, + { + "epoch": 15.75439380399166, + "grad_norm": 0.0230712890625, + "learning_rate": 0.003933335135222208, + "loss": 0.793, + "num_input_tokens_seen": 61411896, + "step": 105775 + }, + { + "epoch": 15.755138516532618, + "grad_norm": 0.056396484375, + "learning_rate": 0.00393201912762765, + "loss": 0.7874, + "num_input_tokens_seen": 61414968, + "step": 105780 + }, + { + "epoch": 15.755883229073577, + "grad_norm": 0.031494140625, + "learning_rate": 0.003930703307015384, + "loss": 0.7889, + "num_input_tokens_seen": 61417688, + "step": 105785 + }, + { + "epoch": 15.756627941614537, + "grad_norm": 0.03125, + "learning_rate": 0.003929387673407633, + "loss": 0.7805, + "num_input_tokens_seen": 61420408, + "step": 105790 + }, + { + "epoch": 15.757372654155496, + "grad_norm": 0.040283203125, + "learning_rate": 0.003928072226826628, + "loss": 0.809, + "num_input_tokens_seen": 61423288, + "step": 105795 + }, + { + "epoch": 15.758117366696455, + "grad_norm": 0.034912109375, + "learning_rate": 0.0039267569672945864, + "loss": 0.8191, + "num_input_tokens_seen": 61426040, + "step": 105800 + }, + { + "epoch": 15.758862079237414, + "grad_norm": 0.03515625, + "learning_rate": 0.003925441894833736, + "loss": 0.7906, + "num_input_tokens_seen": 61428952, + "step": 105805 + }, + { + "epoch": 15.759606791778374, + "grad_norm": 0.051513671875, + "learning_rate": 0.003924127009466288, + "loss": 0.8065, + "num_input_tokens_seen": 61432280, + "step": 105810 + }, + { + "epoch": 15.760351504319333, + "grad_norm": 0.1181640625, + "learning_rate": 0.003922812311214452, + "loss": 0.7833, + "num_input_tokens_seen": 61434744, + "step": 105815 + }, + { + "epoch": 15.761096216860292, + "grad_norm": 0.1025390625, + "learning_rate": 0.00392149780010045, + "loss": 0.7901, + "num_input_tokens_seen": 61437496, + "step": 105820 + }, + { + "epoch": 15.76184092940125, + "grad_norm": 0.02587890625, + "learning_rate": 0.003920183476146482, + "loss": 0.7978, + "num_input_tokens_seen": 61440312, + "step": 105825 + }, + { + "epoch": 15.762585641942211, + "grad_norm": 0.051025390625, + "learning_rate": 0.003918869339374753, + "loss": 0.7708, + "num_input_tokens_seen": 61443800, + "step": 105830 + }, + { + "epoch": 15.76333035448317, + "grad_norm": 0.042724609375, + "learning_rate": 0.003917555389807462, + "loss": 0.7969, + "num_input_tokens_seen": 61446808, + "step": 105835 + }, + { + "epoch": 15.764075067024129, + "grad_norm": 0.068359375, + "learning_rate": 0.003916241627466811, + "loss": 0.8038, + "num_input_tokens_seen": 61449848, + "step": 105840 + }, + { + "epoch": 15.764819779565087, + "grad_norm": 0.06396484375, + "learning_rate": 0.003914928052374994, + "loss": 0.7866, + "num_input_tokens_seen": 61452440, + "step": 105845 + }, + { + "epoch": 15.765564492106048, + "grad_norm": 0.04150390625, + "learning_rate": 0.003913614664554195, + "loss": 0.7874, + "num_input_tokens_seen": 61455160, + "step": 105850 + }, + { + "epoch": 15.766309204647007, + "grad_norm": 0.042236328125, + "learning_rate": 0.003912301464026614, + "loss": 0.7892, + "num_input_tokens_seen": 61458072, + "step": 105855 + }, + { + "epoch": 15.767053917187965, + "grad_norm": 0.05078125, + "learning_rate": 0.0039109884508144255, + "loss": 0.7951, + "num_input_tokens_seen": 61461048, + "step": 105860 + }, + { + "epoch": 15.767798629728924, + "grad_norm": 0.0458984375, + "learning_rate": 0.0039096756249398235, + "loss": 0.7906, + "num_input_tokens_seen": 61464248, + "step": 105865 + }, + { + "epoch": 15.768543342269885, + "grad_norm": 0.06494140625, + "learning_rate": 0.003908362986424973, + "loss": 0.7942, + "num_input_tokens_seen": 61467256, + "step": 105870 + }, + { + "epoch": 15.769288054810843, + "grad_norm": 0.037109375, + "learning_rate": 0.003907050535292064, + "loss": 0.8005, + "num_input_tokens_seen": 61469720, + "step": 105875 + }, + { + "epoch": 15.770032767351802, + "grad_norm": 0.08349609375, + "learning_rate": 0.003905738271563256, + "loss": 0.8115, + "num_input_tokens_seen": 61472632, + "step": 105880 + }, + { + "epoch": 15.770777479892761, + "grad_norm": 0.0228271484375, + "learning_rate": 0.0039044261952607276, + "loss": 0.8245, + "num_input_tokens_seen": 61475256, + "step": 105885 + }, + { + "epoch": 15.771522192433721, + "grad_norm": 0.033203125, + "learning_rate": 0.003903114306406643, + "loss": 0.8157, + "num_input_tokens_seen": 61477944, + "step": 105890 + }, + { + "epoch": 15.77226690497468, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0039018026050231624, + "loss": 0.8036, + "num_input_tokens_seen": 61480952, + "step": 105895 + }, + { + "epoch": 15.773011617515639, + "grad_norm": 0.055908203125, + "learning_rate": 0.003900491091132448, + "loss": 0.7958, + "num_input_tokens_seen": 61483800, + "step": 105900 + }, + { + "epoch": 15.773756330056598, + "grad_norm": 0.040771484375, + "learning_rate": 0.003899179764756652, + "loss": 0.8117, + "num_input_tokens_seen": 61486744, + "step": 105905 + }, + { + "epoch": 15.774501042597556, + "grad_norm": 0.054931640625, + "learning_rate": 0.0038978686259179356, + "loss": 0.7942, + "num_input_tokens_seen": 61489912, + "step": 105910 + }, + { + "epoch": 15.775245755138517, + "grad_norm": 0.034912109375, + "learning_rate": 0.003896557674638439, + "loss": 0.7878, + "num_input_tokens_seen": 61492760, + "step": 105915 + }, + { + "epoch": 15.775990467679476, + "grad_norm": 0.04443359375, + "learning_rate": 0.0038952469109403208, + "loss": 0.7826, + "num_input_tokens_seen": 61495704, + "step": 105920 + }, + { + "epoch": 15.776735180220435, + "grad_norm": 0.0517578125, + "learning_rate": 0.0038939363348457203, + "loss": 0.8001, + "num_input_tokens_seen": 61498680, + "step": 105925 + }, + { + "epoch": 15.777479892761393, + "grad_norm": 0.043701171875, + "learning_rate": 0.0038926259463767736, + "loss": 0.7934, + "num_input_tokens_seen": 61501656, + "step": 105930 + }, + { + "epoch": 15.778224605302354, + "grad_norm": 0.052734375, + "learning_rate": 0.0038913157455556257, + "loss": 0.779, + "num_input_tokens_seen": 61504600, + "step": 105935 + }, + { + "epoch": 15.778969317843313, + "grad_norm": 0.03759765625, + "learning_rate": 0.003890005732404404, + "loss": 0.7747, + "num_input_tokens_seen": 61507352, + "step": 105940 + }, + { + "epoch": 15.779714030384271, + "grad_norm": 0.0296630859375, + "learning_rate": 0.0038886959069452474, + "loss": 0.793, + "num_input_tokens_seen": 61510072, + "step": 105945 + }, + { + "epoch": 15.78045874292523, + "grad_norm": 0.0419921875, + "learning_rate": 0.0038873862692002767, + "loss": 0.7921, + "num_input_tokens_seen": 61513144, + "step": 105950 + }, + { + "epoch": 15.78120345546619, + "grad_norm": 0.052490234375, + "learning_rate": 0.003886076819191626, + "loss": 0.8031, + "num_input_tokens_seen": 61516024, + "step": 105955 + }, + { + "epoch": 15.78194816800715, + "grad_norm": 0.036865234375, + "learning_rate": 0.003884767556941411, + "loss": 0.8057, + "num_input_tokens_seen": 61519320, + "step": 105960 + }, + { + "epoch": 15.782692880548108, + "grad_norm": 0.05322265625, + "learning_rate": 0.00388345848247175, + "loss": 0.8064, + "num_input_tokens_seen": 61522424, + "step": 105965 + }, + { + "epoch": 15.783437593089067, + "grad_norm": 0.05126953125, + "learning_rate": 0.0038821495958047607, + "loss": 0.7912, + "num_input_tokens_seen": 61525432, + "step": 105970 + }, + { + "epoch": 15.784182305630027, + "grad_norm": 0.03271484375, + "learning_rate": 0.0038808408969625507, + "loss": 0.7978, + "num_input_tokens_seen": 61528088, + "step": 105975 + }, + { + "epoch": 15.784927018170986, + "grad_norm": 0.035400390625, + "learning_rate": 0.0038795323859672354, + "loss": 0.7833, + "num_input_tokens_seen": 61530936, + "step": 105980 + }, + { + "epoch": 15.785671730711945, + "grad_norm": 0.046630859375, + "learning_rate": 0.003878224062840915, + "loss": 0.7998, + "num_input_tokens_seen": 61533752, + "step": 105985 + }, + { + "epoch": 15.786416443252904, + "grad_norm": 0.058837890625, + "learning_rate": 0.0038769159276056996, + "loss": 0.7776, + "num_input_tokens_seen": 61536664, + "step": 105990 + }, + { + "epoch": 15.787161155793864, + "grad_norm": 0.03564453125, + "learning_rate": 0.003875607980283679, + "loss": 0.8168, + "num_input_tokens_seen": 61539736, + "step": 105995 + }, + { + "epoch": 15.787905868334823, + "grad_norm": 0.09521484375, + "learning_rate": 0.0038743002208969607, + "loss": 0.8252, + "num_input_tokens_seen": 61542584, + "step": 106000 + }, + { + "epoch": 15.788650580875782, + "grad_norm": 0.033203125, + "learning_rate": 0.003872992649467632, + "loss": 0.7692, + "num_input_tokens_seen": 61545304, + "step": 106005 + }, + { + "epoch": 15.78939529341674, + "grad_norm": 0.042724609375, + "learning_rate": 0.00387168526601778, + "loss": 0.7868, + "num_input_tokens_seen": 61548376, + "step": 106010 + }, + { + "epoch": 15.790140005957701, + "grad_norm": 0.052490234375, + "learning_rate": 0.0038703780705694993, + "loss": 0.7821, + "num_input_tokens_seen": 61551256, + "step": 106015 + }, + { + "epoch": 15.79088471849866, + "grad_norm": 0.040283203125, + "learning_rate": 0.0038690710631448686, + "loss": 0.7929, + "num_input_tokens_seen": 61554424, + "step": 106020 + }, + { + "epoch": 15.791629431039619, + "grad_norm": 0.049560546875, + "learning_rate": 0.0038677642437659654, + "loss": 0.8174, + "num_input_tokens_seen": 61557240, + "step": 106025 + }, + { + "epoch": 15.792374143580577, + "grad_norm": 0.0625, + "learning_rate": 0.003866457612454875, + "loss": 0.7956, + "num_input_tokens_seen": 61560248, + "step": 106030 + }, + { + "epoch": 15.793118856121538, + "grad_norm": 0.040771484375, + "learning_rate": 0.003865151169233668, + "loss": 0.7963, + "num_input_tokens_seen": 61562968, + "step": 106035 + }, + { + "epoch": 15.793863568662497, + "grad_norm": 0.041748046875, + "learning_rate": 0.0038638449141244107, + "loss": 0.7986, + "num_input_tokens_seen": 61565784, + "step": 106040 + }, + { + "epoch": 15.794608281203455, + "grad_norm": 0.037109375, + "learning_rate": 0.00386253884714918, + "loss": 0.7701, + "num_input_tokens_seen": 61569016, + "step": 106045 + }, + { + "epoch": 15.795352993744414, + "grad_norm": 0.0537109375, + "learning_rate": 0.0038612329683300364, + "loss": 0.7872, + "num_input_tokens_seen": 61571704, + "step": 106050 + }, + { + "epoch": 15.796097706285373, + "grad_norm": 0.03125, + "learning_rate": 0.0038599272776890345, + "loss": 0.804, + "num_input_tokens_seen": 61574744, + "step": 106055 + }, + { + "epoch": 15.796842418826333, + "grad_norm": 0.072265625, + "learning_rate": 0.003858621775248245, + "loss": 0.7979, + "num_input_tokens_seen": 61577720, + "step": 106060 + }, + { + "epoch": 15.797587131367292, + "grad_norm": 0.04296875, + "learning_rate": 0.0038573164610297126, + "loss": 0.7973, + "num_input_tokens_seen": 61581080, + "step": 106065 + }, + { + "epoch": 15.798331843908251, + "grad_norm": 0.044921875, + "learning_rate": 0.0038560113350554976, + "loss": 0.8024, + "num_input_tokens_seen": 61583768, + "step": 106070 + }, + { + "epoch": 15.799076556449211, + "grad_norm": 0.04931640625, + "learning_rate": 0.0038547063973476394, + "loss": 0.7801, + "num_input_tokens_seen": 61586296, + "step": 106075 + }, + { + "epoch": 15.79982126899017, + "grad_norm": 0.048583984375, + "learning_rate": 0.003853401647928195, + "loss": 0.7689, + "num_input_tokens_seen": 61589144, + "step": 106080 + }, + { + "epoch": 15.800565981531129, + "grad_norm": 0.0751953125, + "learning_rate": 0.003852097086819201, + "loss": 0.8083, + "num_input_tokens_seen": 61592088, + "step": 106085 + }, + { + "epoch": 15.801310694072088, + "grad_norm": 0.056396484375, + "learning_rate": 0.0038507927140426948, + "loss": 0.8139, + "num_input_tokens_seen": 61595032, + "step": 106090 + }, + { + "epoch": 15.802055406613047, + "grad_norm": 0.0732421875, + "learning_rate": 0.0038494885296207156, + "loss": 0.7979, + "num_input_tokens_seen": 61597848, + "step": 106095 + }, + { + "epoch": 15.802800119154007, + "grad_norm": 0.055908203125, + "learning_rate": 0.0038481845335752896, + "loss": 0.7919, + "num_input_tokens_seen": 61600824, + "step": 106100 + }, + { + "epoch": 15.803544831694966, + "grad_norm": 0.0390625, + "learning_rate": 0.0038468807259284562, + "loss": 0.803, + "num_input_tokens_seen": 61603576, + "step": 106105 + }, + { + "epoch": 15.804289544235925, + "grad_norm": 0.039794921875, + "learning_rate": 0.003845577106702234, + "loss": 0.8059, + "num_input_tokens_seen": 61606616, + "step": 106110 + }, + { + "epoch": 15.805034256776883, + "grad_norm": 0.0546875, + "learning_rate": 0.0038442736759186536, + "loss": 0.8102, + "num_input_tokens_seen": 61609400, + "step": 106115 + }, + { + "epoch": 15.805778969317844, + "grad_norm": 0.0517578125, + "learning_rate": 0.0038429704335997267, + "loss": 0.7823, + "num_input_tokens_seen": 61612184, + "step": 106120 + }, + { + "epoch": 15.806523681858803, + "grad_norm": 0.03466796875, + "learning_rate": 0.003841667379767479, + "loss": 0.7858, + "num_input_tokens_seen": 61614872, + "step": 106125 + }, + { + "epoch": 15.807268394399761, + "grad_norm": 0.0546875, + "learning_rate": 0.003840364514443921, + "loss": 0.8057, + "num_input_tokens_seen": 61617880, + "step": 106130 + }, + { + "epoch": 15.80801310694072, + "grad_norm": 0.03759765625, + "learning_rate": 0.0038390618376510565, + "loss": 0.8, + "num_input_tokens_seen": 61621176, + "step": 106135 + }, + { + "epoch": 15.80875781948168, + "grad_norm": 0.044677734375, + "learning_rate": 0.003837759349410904, + "loss": 0.7778, + "num_input_tokens_seen": 61624120, + "step": 106140 + }, + { + "epoch": 15.80950253202264, + "grad_norm": 0.0247802734375, + "learning_rate": 0.003836457049745459, + "loss": 0.7772, + "num_input_tokens_seen": 61627064, + "step": 106145 + }, + { + "epoch": 15.810247244563598, + "grad_norm": 0.0279541015625, + "learning_rate": 0.003835154938676729, + "loss": 0.7964, + "num_input_tokens_seen": 61629688, + "step": 106150 + }, + { + "epoch": 15.810991957104557, + "grad_norm": 0.04052734375, + "learning_rate": 0.0038338530162267096, + "loss": 0.7943, + "num_input_tokens_seen": 61632600, + "step": 106155 + }, + { + "epoch": 15.811736669645517, + "grad_norm": 0.04833984375, + "learning_rate": 0.003832551282417395, + "loss": 0.791, + "num_input_tokens_seen": 61635576, + "step": 106160 + }, + { + "epoch": 15.812481382186476, + "grad_norm": 0.047119140625, + "learning_rate": 0.003831249737270771, + "loss": 0.7906, + "num_input_tokens_seen": 61638456, + "step": 106165 + }, + { + "epoch": 15.813226094727435, + "grad_norm": 0.037353515625, + "learning_rate": 0.0038299483808088364, + "loss": 0.7982, + "num_input_tokens_seen": 61641304, + "step": 106170 + }, + { + "epoch": 15.813970807268394, + "grad_norm": 0.0546875, + "learning_rate": 0.00382864721305357, + "loss": 0.7978, + "num_input_tokens_seen": 61644120, + "step": 106175 + }, + { + "epoch": 15.814715519809354, + "grad_norm": 0.056640625, + "learning_rate": 0.003827346234026952, + "loss": 0.8029, + "num_input_tokens_seen": 61647128, + "step": 106180 + }, + { + "epoch": 15.815460232350313, + "grad_norm": 0.0478515625, + "learning_rate": 0.003826045443750969, + "loss": 0.8004, + "num_input_tokens_seen": 61650008, + "step": 106185 + }, + { + "epoch": 15.816204944891272, + "grad_norm": 0.040283203125, + "learning_rate": 0.0038247448422475848, + "loss": 0.8154, + "num_input_tokens_seen": 61653080, + "step": 106190 + }, + { + "epoch": 15.81694965743223, + "grad_norm": 0.06298828125, + "learning_rate": 0.003823444429538784, + "loss": 0.7992, + "num_input_tokens_seen": 61655736, + "step": 106195 + }, + { + "epoch": 15.817694369973191, + "grad_norm": 0.0341796875, + "learning_rate": 0.003822144205646526, + "loss": 0.7966, + "num_input_tokens_seen": 61658552, + "step": 106200 + }, + { + "epoch": 15.81843908251415, + "grad_norm": 0.03564453125, + "learning_rate": 0.003820844170592786, + "loss": 0.7735, + "num_input_tokens_seen": 61661592, + "step": 106205 + }, + { + "epoch": 15.819183795055109, + "grad_norm": 0.03515625, + "learning_rate": 0.0038195443243995214, + "loss": 0.7964, + "num_input_tokens_seen": 61664440, + "step": 106210 + }, + { + "epoch": 15.819928507596067, + "grad_norm": 0.03662109375, + "learning_rate": 0.0038182446670886865, + "loss": 0.7884, + "num_input_tokens_seen": 61667192, + "step": 106215 + }, + { + "epoch": 15.820673220137028, + "grad_norm": 0.037353515625, + "learning_rate": 0.0038169451986822486, + "loss": 0.8034, + "num_input_tokens_seen": 61670008, + "step": 106220 + }, + { + "epoch": 15.821417932677987, + "grad_norm": 0.04833984375, + "learning_rate": 0.0038156459192021557, + "loss": 0.7909, + "num_input_tokens_seen": 61672984, + "step": 106225 + }, + { + "epoch": 15.822162645218945, + "grad_norm": 0.03515625, + "learning_rate": 0.0038143468286703577, + "loss": 0.7851, + "num_input_tokens_seen": 61675704, + "step": 106230 + }, + { + "epoch": 15.822907357759904, + "grad_norm": 0.058837890625, + "learning_rate": 0.0038130479271087964, + "loss": 0.8103, + "num_input_tokens_seen": 61678712, + "step": 106235 + }, + { + "epoch": 15.823652070300863, + "grad_norm": 0.05322265625, + "learning_rate": 0.0038117492145394263, + "loss": 0.8053, + "num_input_tokens_seen": 61681816, + "step": 106240 + }, + { + "epoch": 15.824396782841823, + "grad_norm": 0.07080078125, + "learning_rate": 0.003810450690984176, + "loss": 0.8121, + "num_input_tokens_seen": 61684600, + "step": 106245 + }, + { + "epoch": 15.825141495382782, + "grad_norm": 0.048828125, + "learning_rate": 0.0038091523564649944, + "loss": 0.7808, + "num_input_tokens_seen": 61687288, + "step": 106250 + }, + { + "epoch": 15.825886207923741, + "grad_norm": 0.04248046875, + "learning_rate": 0.003807854211003808, + "loss": 0.8054, + "num_input_tokens_seen": 61690072, + "step": 106255 + }, + { + "epoch": 15.826630920464702, + "grad_norm": 0.052490234375, + "learning_rate": 0.003806556254622547, + "loss": 0.8053, + "num_input_tokens_seen": 61692792, + "step": 106260 + }, + { + "epoch": 15.82737563300566, + "grad_norm": 0.046875, + "learning_rate": 0.0038052584873431456, + "loss": 0.7876, + "num_input_tokens_seen": 61696056, + "step": 106265 + }, + { + "epoch": 15.828120345546619, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0038039609091875203, + "loss": 0.79, + "num_input_tokens_seen": 61699128, + "step": 106270 + }, + { + "epoch": 15.828865058087578, + "grad_norm": 0.036376953125, + "learning_rate": 0.0038026635201775993, + "loss": 0.8018, + "num_input_tokens_seen": 61702008, + "step": 106275 + }, + { + "epoch": 15.829609770628537, + "grad_norm": 0.03515625, + "learning_rate": 0.003801366320335293, + "loss": 0.7764, + "num_input_tokens_seen": 61705048, + "step": 106280 + }, + { + "epoch": 15.830354483169497, + "grad_norm": 0.0390625, + "learning_rate": 0.0038000693096825276, + "loss": 0.7911, + "num_input_tokens_seen": 61708216, + "step": 106285 + }, + { + "epoch": 15.831099195710456, + "grad_norm": 0.05908203125, + "learning_rate": 0.003798772488241207, + "loss": 0.7814, + "num_input_tokens_seen": 61710904, + "step": 106290 + }, + { + "epoch": 15.831843908251415, + "grad_norm": 0.038818359375, + "learning_rate": 0.0037974758560332406, + "loss": 0.7974, + "num_input_tokens_seen": 61713720, + "step": 106295 + }, + { + "epoch": 15.832588620792373, + "grad_norm": 0.045654296875, + "learning_rate": 0.0037961794130805335, + "loss": 0.8058, + "num_input_tokens_seen": 61716888, + "step": 106300 + }, + { + "epoch": 15.833333333333334, + "grad_norm": 0.05224609375, + "learning_rate": 0.003794883159404984, + "loss": 0.8134, + "num_input_tokens_seen": 61720024, + "step": 106305 + }, + { + "epoch": 15.834078045874293, + "grad_norm": 0.0634765625, + "learning_rate": 0.003793587095028499, + "loss": 0.779, + "num_input_tokens_seen": 61723288, + "step": 106310 + }, + { + "epoch": 15.834822758415251, + "grad_norm": 0.045166015625, + "learning_rate": 0.0037922912199729666, + "loss": 0.7965, + "num_input_tokens_seen": 61726168, + "step": 106315 + }, + { + "epoch": 15.83556747095621, + "grad_norm": 0.037841796875, + "learning_rate": 0.003790995534260287, + "loss": 0.7938, + "num_input_tokens_seen": 61728824, + "step": 106320 + }, + { + "epoch": 15.83631218349717, + "grad_norm": 0.05859375, + "learning_rate": 0.0037897000379123434, + "loss": 0.8032, + "num_input_tokens_seen": 61731544, + "step": 106325 + }, + { + "epoch": 15.83705689603813, + "grad_norm": 0.0302734375, + "learning_rate": 0.003788404730951026, + "loss": 0.7962, + "num_input_tokens_seen": 61734840, + "step": 106330 + }, + { + "epoch": 15.837801608579088, + "grad_norm": 0.07958984375, + "learning_rate": 0.0037871096133982158, + "loss": 0.7968, + "num_input_tokens_seen": 61737656, + "step": 106335 + }, + { + "epoch": 15.838546321120047, + "grad_norm": 0.03564453125, + "learning_rate": 0.003785814685275788, + "loss": 0.8057, + "num_input_tokens_seen": 61740792, + "step": 106340 + }, + { + "epoch": 15.839291033661008, + "grad_norm": 0.02783203125, + "learning_rate": 0.00378451994660563, + "loss": 0.7982, + "num_input_tokens_seen": 61743896, + "step": 106345 + }, + { + "epoch": 15.840035746201966, + "grad_norm": 0.033935546875, + "learning_rate": 0.0037832253974096064, + "loss": 0.7713, + "num_input_tokens_seen": 61746488, + "step": 106350 + }, + { + "epoch": 15.840780458742925, + "grad_norm": 0.04248046875, + "learning_rate": 0.0037819310377095864, + "loss": 0.7895, + "num_input_tokens_seen": 61749304, + "step": 106355 + }, + { + "epoch": 15.841525171283884, + "grad_norm": 0.052490234375, + "learning_rate": 0.003780636867527443, + "loss": 0.8264, + "num_input_tokens_seen": 61752536, + "step": 106360 + }, + { + "epoch": 15.842269883824844, + "grad_norm": 0.052001953125, + "learning_rate": 0.003779342886885037, + "loss": 0.8077, + "num_input_tokens_seen": 61755448, + "step": 106365 + }, + { + "epoch": 15.843014596365803, + "grad_norm": 0.036865234375, + "learning_rate": 0.0037780490958042303, + "loss": 0.7998, + "num_input_tokens_seen": 61758200, + "step": 106370 + }, + { + "epoch": 15.843759308906762, + "grad_norm": 0.028076171875, + "learning_rate": 0.003776755494306873, + "loss": 0.7925, + "num_input_tokens_seen": 61760920, + "step": 106375 + }, + { + "epoch": 15.84450402144772, + "grad_norm": 0.0625, + "learning_rate": 0.0037754620824148287, + "loss": 0.7794, + "num_input_tokens_seen": 61763800, + "step": 106380 + }, + { + "epoch": 15.845248733988681, + "grad_norm": 0.030029296875, + "learning_rate": 0.0037741688601499418, + "loss": 0.7912, + "num_input_tokens_seen": 61766488, + "step": 106385 + }, + { + "epoch": 15.84599344652964, + "grad_norm": 0.05517578125, + "learning_rate": 0.003772875827534066, + "loss": 0.7997, + "num_input_tokens_seen": 61769240, + "step": 106390 + }, + { + "epoch": 15.846738159070599, + "grad_norm": 0.04736328125, + "learning_rate": 0.003771582984589038, + "loss": 0.7827, + "num_input_tokens_seen": 61772056, + "step": 106395 + }, + { + "epoch": 15.847482871611557, + "grad_norm": 0.07275390625, + "learning_rate": 0.003770290331336708, + "loss": 0.803, + "num_input_tokens_seen": 61775000, + "step": 106400 + }, + { + "epoch": 15.848227584152518, + "grad_norm": 0.05126953125, + "learning_rate": 0.0037689978677989035, + "loss": 0.7744, + "num_input_tokens_seen": 61778008, + "step": 106405 + }, + { + "epoch": 15.848972296693477, + "grad_norm": 0.055419921875, + "learning_rate": 0.003767705593997471, + "loss": 0.7945, + "num_input_tokens_seen": 61781080, + "step": 106410 + }, + { + "epoch": 15.849717009234435, + "grad_norm": 0.0238037109375, + "learning_rate": 0.003766413509954238, + "loss": 0.792, + "num_input_tokens_seen": 61784184, + "step": 106415 + }, + { + "epoch": 15.850461721775394, + "grad_norm": 0.03466796875, + "learning_rate": 0.003765121615691029, + "loss": 0.8035, + "num_input_tokens_seen": 61787128, + "step": 106420 + }, + { + "epoch": 15.851206434316353, + "grad_norm": 0.08154296875, + "learning_rate": 0.0037638299112296735, + "loss": 0.8151, + "num_input_tokens_seen": 61789848, + "step": 106425 + }, + { + "epoch": 15.851951146857314, + "grad_norm": 0.042236328125, + "learning_rate": 0.003762538396591986, + "loss": 0.7929, + "num_input_tokens_seen": 61792728, + "step": 106430 + }, + { + "epoch": 15.852695859398272, + "grad_norm": 0.037353515625, + "learning_rate": 0.0037612470717997963, + "loss": 0.8093, + "num_input_tokens_seen": 61795736, + "step": 106435 + }, + { + "epoch": 15.853440571939231, + "grad_norm": 0.03369140625, + "learning_rate": 0.0037599559368749103, + "loss": 0.7952, + "num_input_tokens_seen": 61798584, + "step": 106440 + }, + { + "epoch": 15.85418528448019, + "grad_norm": 0.09228515625, + "learning_rate": 0.0037586649918391482, + "loss": 0.7881, + "num_input_tokens_seen": 61801720, + "step": 106445 + }, + { + "epoch": 15.85492999702115, + "grad_norm": 0.036376953125, + "learning_rate": 0.003757374236714317, + "loss": 0.7946, + "num_input_tokens_seen": 61804696, + "step": 106450 + }, + { + "epoch": 15.85567470956211, + "grad_norm": 0.04345703125, + "learning_rate": 0.0037560836715222166, + "loss": 0.775, + "num_input_tokens_seen": 61807736, + "step": 106455 + }, + { + "epoch": 15.856419422103068, + "grad_norm": 0.0286865234375, + "learning_rate": 0.00375479329628466, + "loss": 0.7898, + "num_input_tokens_seen": 61810776, + "step": 106460 + }, + { + "epoch": 15.857164134644027, + "grad_norm": 0.033203125, + "learning_rate": 0.0037535031110234366, + "loss": 0.7936, + "num_input_tokens_seen": 61813656, + "step": 106465 + }, + { + "epoch": 15.857908847184987, + "grad_norm": 0.052978515625, + "learning_rate": 0.003752213115760352, + "loss": 0.8021, + "num_input_tokens_seen": 61816920, + "step": 106470 + }, + { + "epoch": 15.858653559725946, + "grad_norm": 0.0478515625, + "learning_rate": 0.0037509233105171907, + "loss": 0.8039, + "num_input_tokens_seen": 61819608, + "step": 106475 + }, + { + "epoch": 15.859398272266905, + "grad_norm": 0.0322265625, + "learning_rate": 0.0037496336953157516, + "loss": 0.7958, + "num_input_tokens_seen": 61822552, + "step": 106480 + }, + { + "epoch": 15.860142984807863, + "grad_norm": 0.031982421875, + "learning_rate": 0.0037483442701778174, + "loss": 0.7886, + "num_input_tokens_seen": 61825752, + "step": 106485 + }, + { + "epoch": 15.860887697348824, + "grad_norm": 0.0341796875, + "learning_rate": 0.003747055035125169, + "loss": 0.8144, + "num_input_tokens_seen": 61828504, + "step": 106490 + }, + { + "epoch": 15.861632409889783, + "grad_norm": 0.037841796875, + "learning_rate": 0.003745765990179591, + "loss": 0.7797, + "num_input_tokens_seen": 61831480, + "step": 106495 + }, + { + "epoch": 15.862377122430741, + "grad_norm": 0.03857421875, + "learning_rate": 0.0037444771353628537, + "loss": 0.8082, + "num_input_tokens_seen": 61834456, + "step": 106500 + }, + { + "epoch": 15.8631218349717, + "grad_norm": 0.03271484375, + "learning_rate": 0.0037431884706967383, + "loss": 0.7966, + "num_input_tokens_seen": 61837208, + "step": 106505 + }, + { + "epoch": 15.86386654751266, + "grad_norm": 0.0537109375, + "learning_rate": 0.003741899996203009, + "loss": 0.7965, + "num_input_tokens_seen": 61840024, + "step": 106510 + }, + { + "epoch": 15.86461126005362, + "grad_norm": 0.08447265625, + "learning_rate": 0.003740611711903442, + "loss": 0.7933, + "num_input_tokens_seen": 61842616, + "step": 106515 + }, + { + "epoch": 15.865355972594578, + "grad_norm": 0.0810546875, + "learning_rate": 0.003739323617819793, + "loss": 0.8087, + "num_input_tokens_seen": 61845464, + "step": 106520 + }, + { + "epoch": 15.866100685135537, + "grad_norm": 0.0400390625, + "learning_rate": 0.0037380357139738295, + "loss": 0.789, + "num_input_tokens_seen": 61848248, + "step": 106525 + }, + { + "epoch": 15.866845397676498, + "grad_norm": 0.030517578125, + "learning_rate": 0.0037367480003873076, + "loss": 0.8022, + "num_input_tokens_seen": 61850968, + "step": 106530 + }, + { + "epoch": 15.867590110217456, + "grad_norm": 0.059814453125, + "learning_rate": 0.0037354604770819763, + "loss": 0.7932, + "num_input_tokens_seen": 61853752, + "step": 106535 + }, + { + "epoch": 15.868334822758415, + "grad_norm": 0.048095703125, + "learning_rate": 0.0037341731440795954, + "loss": 0.7913, + "num_input_tokens_seen": 61856952, + "step": 106540 + }, + { + "epoch": 15.869079535299374, + "grad_norm": 0.0380859375, + "learning_rate": 0.0037328860014019054, + "loss": 0.8036, + "num_input_tokens_seen": 61859992, + "step": 106545 + }, + { + "epoch": 15.869824247840334, + "grad_norm": 0.03125, + "learning_rate": 0.003731599049070657, + "loss": 0.7869, + "num_input_tokens_seen": 61862936, + "step": 106550 + }, + { + "epoch": 15.870568960381293, + "grad_norm": 0.0849609375, + "learning_rate": 0.003730312287107592, + "loss": 0.8261, + "num_input_tokens_seen": 61865912, + "step": 106555 + }, + { + "epoch": 15.871313672922252, + "grad_norm": 0.05029296875, + "learning_rate": 0.003729025715534447, + "loss": 0.8078, + "num_input_tokens_seen": 61868440, + "step": 106560 + }, + { + "epoch": 15.87205838546321, + "grad_norm": 0.0576171875, + "learning_rate": 0.0037277393343729524, + "loss": 0.7915, + "num_input_tokens_seen": 61871448, + "step": 106565 + }, + { + "epoch": 15.872803098004171, + "grad_norm": 0.0556640625, + "learning_rate": 0.0037264531436448497, + "loss": 0.8256, + "num_input_tokens_seen": 61874232, + "step": 106570 + }, + { + "epoch": 15.87354781054513, + "grad_norm": 0.04931640625, + "learning_rate": 0.003725167143371864, + "loss": 0.802, + "num_input_tokens_seen": 61877144, + "step": 106575 + }, + { + "epoch": 15.874292523086089, + "grad_norm": 0.052490234375, + "learning_rate": 0.0037238813335757157, + "loss": 0.7958, + "num_input_tokens_seen": 61880056, + "step": 106580 + }, + { + "epoch": 15.875037235627047, + "grad_norm": 0.046630859375, + "learning_rate": 0.0037225957142781345, + "loss": 0.7912, + "num_input_tokens_seen": 61882840, + "step": 106585 + }, + { + "epoch": 15.875781948168008, + "grad_norm": 0.061279296875, + "learning_rate": 0.0037213102855008346, + "loss": 0.7997, + "num_input_tokens_seen": 61885880, + "step": 106590 + }, + { + "epoch": 15.876526660708967, + "grad_norm": 0.04638671875, + "learning_rate": 0.0037200250472655366, + "loss": 0.8009, + "num_input_tokens_seen": 61888664, + "step": 106595 + }, + { + "epoch": 15.877271373249926, + "grad_norm": 0.0654296875, + "learning_rate": 0.0037187399995939467, + "loss": 0.8105, + "num_input_tokens_seen": 61891320, + "step": 106600 + }, + { + "epoch": 15.878016085790884, + "grad_norm": 0.038330078125, + "learning_rate": 0.0037174551425077844, + "loss": 0.785, + "num_input_tokens_seen": 61894296, + "step": 106605 + }, + { + "epoch": 15.878760798331843, + "grad_norm": 0.0341796875, + "learning_rate": 0.0037161704760287507, + "loss": 0.7982, + "num_input_tokens_seen": 61897208, + "step": 106610 + }, + { + "epoch": 15.879505510872804, + "grad_norm": 0.04931640625, + "learning_rate": 0.0037148860001785427, + "loss": 0.8035, + "num_input_tokens_seen": 61900184, + "step": 106615 + }, + { + "epoch": 15.880250223413762, + "grad_norm": 0.06787109375, + "learning_rate": 0.00371360171497887, + "loss": 0.7767, + "num_input_tokens_seen": 61902840, + "step": 106620 + }, + { + "epoch": 15.880994935954721, + "grad_norm": 0.03515625, + "learning_rate": 0.0037123176204514273, + "loss": 0.8201, + "num_input_tokens_seen": 61906104, + "step": 106625 + }, + { + "epoch": 15.88173964849568, + "grad_norm": 0.033935546875, + "learning_rate": 0.003711033716617905, + "loss": 0.8004, + "num_input_tokens_seen": 61908888, + "step": 106630 + }, + { + "epoch": 15.88248436103664, + "grad_norm": 0.049560546875, + "learning_rate": 0.0037097500034999897, + "loss": 0.7926, + "num_input_tokens_seen": 61911832, + "step": 106635 + }, + { + "epoch": 15.8832290735776, + "grad_norm": 0.05029296875, + "learning_rate": 0.0037084664811193784, + "loss": 0.791, + "num_input_tokens_seen": 61914776, + "step": 106640 + }, + { + "epoch": 15.883973786118558, + "grad_norm": 0.0732421875, + "learning_rate": 0.0037071831494977455, + "loss": 0.7953, + "num_input_tokens_seen": 61917432, + "step": 106645 + }, + { + "epoch": 15.884718498659517, + "grad_norm": 0.08447265625, + "learning_rate": 0.0037059000086567795, + "loss": 0.797, + "num_input_tokens_seen": 61920344, + "step": 106650 + }, + { + "epoch": 15.885463211200477, + "grad_norm": 0.046630859375, + "learning_rate": 0.0037046170586181536, + "loss": 0.7972, + "num_input_tokens_seen": 61922904, + "step": 106655 + }, + { + "epoch": 15.886207923741436, + "grad_norm": 0.049560546875, + "learning_rate": 0.0037033342994035393, + "loss": 0.7855, + "num_input_tokens_seen": 61926008, + "step": 106660 + }, + { + "epoch": 15.886952636282395, + "grad_norm": 0.053955078125, + "learning_rate": 0.0037020517310346144, + "loss": 0.8154, + "num_input_tokens_seen": 61928792, + "step": 106665 + }, + { + "epoch": 15.887697348823353, + "grad_norm": 0.06982421875, + "learning_rate": 0.003700769353533038, + "loss": 0.8029, + "num_input_tokens_seen": 61931864, + "step": 106670 + }, + { + "epoch": 15.888442061364314, + "grad_norm": 0.03857421875, + "learning_rate": 0.003699487166920485, + "loss": 0.8017, + "num_input_tokens_seen": 61934648, + "step": 106675 + }, + { + "epoch": 15.889186773905273, + "grad_norm": 0.040771484375, + "learning_rate": 0.0036982051712186087, + "loss": 0.804, + "num_input_tokens_seen": 61937528, + "step": 106680 + }, + { + "epoch": 15.889931486446232, + "grad_norm": 0.04638671875, + "learning_rate": 0.0036969233664490672, + "loss": 0.8259, + "num_input_tokens_seen": 61940760, + "step": 106685 + }, + { + "epoch": 15.89067619898719, + "grad_norm": 0.050048828125, + "learning_rate": 0.0036956417526335206, + "loss": 0.7785, + "num_input_tokens_seen": 61943800, + "step": 106690 + }, + { + "epoch": 15.89142091152815, + "grad_norm": 0.02197265625, + "learning_rate": 0.003694360329793617, + "loss": 0.815, + "num_input_tokens_seen": 61946680, + "step": 106695 + }, + { + "epoch": 15.89216562406911, + "grad_norm": 0.03662109375, + "learning_rate": 0.003693079097951007, + "loss": 0.7882, + "num_input_tokens_seen": 61949784, + "step": 106700 + }, + { + "epoch": 15.892910336610068, + "grad_norm": 0.040283203125, + "learning_rate": 0.003691798057127327, + "loss": 0.7923, + "num_input_tokens_seen": 61953048, + "step": 106705 + }, + { + "epoch": 15.893655049151027, + "grad_norm": 0.09716796875, + "learning_rate": 0.0036905172073442324, + "loss": 0.79, + "num_input_tokens_seen": 61955736, + "step": 106710 + }, + { + "epoch": 15.894399761691988, + "grad_norm": 0.038818359375, + "learning_rate": 0.003689236548623349, + "loss": 0.7896, + "num_input_tokens_seen": 61958904, + "step": 106715 + }, + { + "epoch": 15.895144474232946, + "grad_norm": 0.050537109375, + "learning_rate": 0.003687956080986324, + "loss": 0.8031, + "num_input_tokens_seen": 61962072, + "step": 106720 + }, + { + "epoch": 15.895889186773905, + "grad_norm": 0.03759765625, + "learning_rate": 0.003686675804454779, + "loss": 0.8117, + "num_input_tokens_seen": 61965144, + "step": 106725 + }, + { + "epoch": 15.896633899314864, + "grad_norm": 0.05224609375, + "learning_rate": 0.0036853957190503515, + "loss": 0.7974, + "num_input_tokens_seen": 61968248, + "step": 106730 + }, + { + "epoch": 15.897378611855824, + "grad_norm": 0.047607421875, + "learning_rate": 0.0036841158247946657, + "loss": 0.7981, + "num_input_tokens_seen": 61971160, + "step": 106735 + }, + { + "epoch": 15.898123324396783, + "grad_norm": 0.0439453125, + "learning_rate": 0.0036828361217093363, + "loss": 0.8199, + "num_input_tokens_seen": 61974008, + "step": 106740 + }, + { + "epoch": 15.898868036937742, + "grad_norm": 0.039306640625, + "learning_rate": 0.0036815566098159923, + "loss": 0.7924, + "num_input_tokens_seen": 61976824, + "step": 106745 + }, + { + "epoch": 15.8996127494787, + "grad_norm": 0.037353515625, + "learning_rate": 0.0036802772891362467, + "loss": 0.7975, + "num_input_tokens_seen": 61979608, + "step": 106750 + }, + { + "epoch": 15.90035746201966, + "grad_norm": 0.048828125, + "learning_rate": 0.003678998159691712, + "loss": 0.8099, + "num_input_tokens_seen": 61982488, + "step": 106755 + }, + { + "epoch": 15.90110217456062, + "grad_norm": 0.036865234375, + "learning_rate": 0.003677719221503991, + "loss": 0.7855, + "num_input_tokens_seen": 61985400, + "step": 106760 + }, + { + "epoch": 15.901846887101579, + "grad_norm": 0.0654296875, + "learning_rate": 0.0036764404745947016, + "loss": 0.817, + "num_input_tokens_seen": 61988280, + "step": 106765 + }, + { + "epoch": 15.902591599642538, + "grad_norm": 0.042724609375, + "learning_rate": 0.003675161918985438, + "loss": 0.7945, + "num_input_tokens_seen": 61991096, + "step": 106770 + }, + { + "epoch": 15.903336312183498, + "grad_norm": 0.047119140625, + "learning_rate": 0.0036738835546978074, + "loss": 0.812, + "num_input_tokens_seen": 61994008, + "step": 106775 + }, + { + "epoch": 15.904081024724457, + "grad_norm": 0.042724609375, + "learning_rate": 0.003672605381753404, + "loss": 0.8113, + "num_input_tokens_seen": 61997240, + "step": 106780 + }, + { + "epoch": 15.904825737265416, + "grad_norm": 0.09033203125, + "learning_rate": 0.0036713274001738136, + "loss": 0.7834, + "num_input_tokens_seen": 62000184, + "step": 106785 + }, + { + "epoch": 15.905570449806374, + "grad_norm": 0.0380859375, + "learning_rate": 0.003670049609980639, + "loss": 0.7898, + "num_input_tokens_seen": 62003096, + "step": 106790 + }, + { + "epoch": 15.906315162347333, + "grad_norm": 0.051025390625, + "learning_rate": 0.003668772011195455, + "loss": 0.7909, + "num_input_tokens_seen": 62006296, + "step": 106795 + }, + { + "epoch": 15.907059874888294, + "grad_norm": 0.042236328125, + "learning_rate": 0.003667494603839857, + "loss": 0.8001, + "num_input_tokens_seen": 62009336, + "step": 106800 + }, + { + "epoch": 15.907804587429252, + "grad_norm": 0.037841796875, + "learning_rate": 0.0036662173879354154, + "loss": 0.8047, + "num_input_tokens_seen": 62012152, + "step": 106805 + }, + { + "epoch": 15.908549299970211, + "grad_norm": 0.03173828125, + "learning_rate": 0.0036649403635037148, + "loss": 0.7905, + "num_input_tokens_seen": 62014776, + "step": 106810 + }, + { + "epoch": 15.90929401251117, + "grad_norm": 0.045654296875, + "learning_rate": 0.0036636635305663276, + "loss": 0.7866, + "num_input_tokens_seen": 62017752, + "step": 106815 + }, + { + "epoch": 15.91003872505213, + "grad_norm": 0.050048828125, + "learning_rate": 0.003662386889144822, + "loss": 0.7927, + "num_input_tokens_seen": 62020600, + "step": 106820 + }, + { + "epoch": 15.91078343759309, + "grad_norm": 0.03125, + "learning_rate": 0.0036611104392607697, + "loss": 0.7896, + "num_input_tokens_seen": 62023384, + "step": 106825 + }, + { + "epoch": 15.911528150134048, + "grad_norm": 0.043212890625, + "learning_rate": 0.0036598341809357254, + "loss": 0.7919, + "num_input_tokens_seen": 62026488, + "step": 106830 + }, + { + "epoch": 15.912272862675007, + "grad_norm": 0.037841796875, + "learning_rate": 0.0036585581141912607, + "loss": 0.8063, + "num_input_tokens_seen": 62029080, + "step": 106835 + }, + { + "epoch": 15.913017575215967, + "grad_norm": 0.0576171875, + "learning_rate": 0.0036572822390489273, + "loss": 0.8048, + "num_input_tokens_seen": 62032344, + "step": 106840 + }, + { + "epoch": 15.913762287756926, + "grad_norm": 0.049072265625, + "learning_rate": 0.0036560065555302857, + "loss": 0.7882, + "num_input_tokens_seen": 62035224, + "step": 106845 + }, + { + "epoch": 15.914507000297885, + "grad_norm": 0.07275390625, + "learning_rate": 0.00365473106365688, + "loss": 0.8011, + "num_input_tokens_seen": 62037976, + "step": 106850 + }, + { + "epoch": 15.915251712838844, + "grad_norm": 0.03955078125, + "learning_rate": 0.0036534557634502673, + "loss": 0.7935, + "num_input_tokens_seen": 62040888, + "step": 106855 + }, + { + "epoch": 15.915996425379804, + "grad_norm": 0.04345703125, + "learning_rate": 0.0036521806549319863, + "loss": 0.8052, + "num_input_tokens_seen": 62043576, + "step": 106860 + }, + { + "epoch": 15.916741137920763, + "grad_norm": 0.026123046875, + "learning_rate": 0.0036509057381235747, + "loss": 0.8032, + "num_input_tokens_seen": 62046776, + "step": 106865 + }, + { + "epoch": 15.917485850461722, + "grad_norm": 0.051513671875, + "learning_rate": 0.0036496310130465807, + "loss": 0.7965, + "num_input_tokens_seen": 62049720, + "step": 106870 + }, + { + "epoch": 15.91823056300268, + "grad_norm": 0.11181640625, + "learning_rate": 0.00364835647972253, + "loss": 0.7907, + "num_input_tokens_seen": 62052760, + "step": 106875 + }, + { + "epoch": 15.918975275543641, + "grad_norm": 0.036376953125, + "learning_rate": 0.0036470821381729642, + "loss": 0.8081, + "num_input_tokens_seen": 62055768, + "step": 106880 + }, + { + "epoch": 15.9197199880846, + "grad_norm": 0.03515625, + "learning_rate": 0.0036458079884194076, + "loss": 0.7893, + "num_input_tokens_seen": 62058424, + "step": 106885 + }, + { + "epoch": 15.920464700625558, + "grad_norm": 0.0213623046875, + "learning_rate": 0.0036445340304833863, + "loss": 0.8034, + "num_input_tokens_seen": 62061272, + "step": 106890 + }, + { + "epoch": 15.921209413166517, + "grad_norm": 0.044921875, + "learning_rate": 0.003643260264386419, + "loss": 0.8104, + "num_input_tokens_seen": 62063960, + "step": 106895 + }, + { + "epoch": 15.921954125707478, + "grad_norm": 0.045166015625, + "learning_rate": 0.003641986690150023, + "loss": 0.7945, + "num_input_tokens_seen": 62067128, + "step": 106900 + }, + { + "epoch": 15.922698838248436, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0036407133077957237, + "loss": 0.8007, + "num_input_tokens_seen": 62069912, + "step": 106905 + }, + { + "epoch": 15.923443550789395, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0036394401173450225, + "loss": 0.8137, + "num_input_tokens_seen": 62072760, + "step": 106910 + }, + { + "epoch": 15.924188263330354, + "grad_norm": 0.0732421875, + "learning_rate": 0.0036381671188194385, + "loss": 0.8046, + "num_input_tokens_seen": 62075608, + "step": 106915 + }, + { + "epoch": 15.924932975871315, + "grad_norm": 0.030517578125, + "learning_rate": 0.00363689431224047, + "loss": 0.7885, + "num_input_tokens_seen": 62079032, + "step": 106920 + }, + { + "epoch": 15.925677688412273, + "grad_norm": 0.032958984375, + "learning_rate": 0.0036356216976296263, + "loss": 0.8018, + "num_input_tokens_seen": 62081848, + "step": 106925 + }, + { + "epoch": 15.926422400953232, + "grad_norm": 0.040771484375, + "learning_rate": 0.0036343492750084017, + "loss": 0.7846, + "num_input_tokens_seen": 62084568, + "step": 106930 + }, + { + "epoch": 15.92716711349419, + "grad_norm": 0.038330078125, + "learning_rate": 0.0036330770443982956, + "loss": 0.7965, + "num_input_tokens_seen": 62087544, + "step": 106935 + }, + { + "epoch": 15.92791182603515, + "grad_norm": 0.05078125, + "learning_rate": 0.0036318050058208027, + "loss": 0.7852, + "num_input_tokens_seen": 62090488, + "step": 106940 + }, + { + "epoch": 15.92865653857611, + "grad_norm": 0.026123046875, + "learning_rate": 0.0036305331592974043, + "loss": 0.803, + "num_input_tokens_seen": 62093336, + "step": 106945 + }, + { + "epoch": 15.929401251117069, + "grad_norm": 0.042236328125, + "learning_rate": 0.003629261504849598, + "loss": 0.785, + "num_input_tokens_seen": 62096088, + "step": 106950 + }, + { + "epoch": 15.930145963658028, + "grad_norm": 0.047119140625, + "learning_rate": 0.0036279900424988614, + "loss": 0.8049, + "num_input_tokens_seen": 62098968, + "step": 106955 + }, + { + "epoch": 15.930890676198988, + "grad_norm": 0.039306640625, + "learning_rate": 0.0036267187722666743, + "loss": 0.7826, + "num_input_tokens_seen": 62101880, + "step": 106960 + }, + { + "epoch": 15.931635388739947, + "grad_norm": 0.047119140625, + "learning_rate": 0.0036254476941745112, + "loss": 0.8077, + "num_input_tokens_seen": 62104824, + "step": 106965 + }, + { + "epoch": 15.932380101280906, + "grad_norm": 0.037353515625, + "learning_rate": 0.0036241768082438527, + "loss": 0.7911, + "num_input_tokens_seen": 62107768, + "step": 106970 + }, + { + "epoch": 15.933124813821864, + "grad_norm": 0.028564453125, + "learning_rate": 0.003622906114496166, + "loss": 0.8156, + "num_input_tokens_seen": 62110520, + "step": 106975 + }, + { + "epoch": 15.933869526362823, + "grad_norm": 0.031494140625, + "learning_rate": 0.0036216356129529113, + "loss": 0.7925, + "num_input_tokens_seen": 62113272, + "step": 106980 + }, + { + "epoch": 15.934614238903784, + "grad_norm": 0.0322265625, + "learning_rate": 0.0036203653036355637, + "loss": 0.7902, + "num_input_tokens_seen": 62116152, + "step": 106985 + }, + { + "epoch": 15.935358951444742, + "grad_norm": 0.0390625, + "learning_rate": 0.003619095186565574, + "loss": 0.8196, + "num_input_tokens_seen": 62119192, + "step": 106990 + }, + { + "epoch": 15.936103663985701, + "grad_norm": 0.034912109375, + "learning_rate": 0.0036178252617644086, + "loss": 0.7808, + "num_input_tokens_seen": 62122232, + "step": 106995 + }, + { + "epoch": 15.93684837652666, + "grad_norm": 0.037353515625, + "learning_rate": 0.0036165555292535117, + "loss": 0.7802, + "num_input_tokens_seen": 62124952, + "step": 107000 + }, + { + "epoch": 15.93759308906762, + "grad_norm": 0.0498046875, + "learning_rate": 0.0036152859890543447, + "loss": 0.8169, + "num_input_tokens_seen": 62127832, + "step": 107005 + }, + { + "epoch": 15.93833780160858, + "grad_norm": 0.043212890625, + "learning_rate": 0.0036140166411883505, + "loss": 0.8013, + "num_input_tokens_seen": 62131000, + "step": 107010 + }, + { + "epoch": 15.939082514149538, + "grad_norm": 0.0380859375, + "learning_rate": 0.0036127474856769686, + "loss": 0.7878, + "num_input_tokens_seen": 62133816, + "step": 107015 + }, + { + "epoch": 15.939827226690497, + "grad_norm": 0.053955078125, + "learning_rate": 0.0036114785225416484, + "loss": 0.7969, + "num_input_tokens_seen": 62136792, + "step": 107020 + }, + { + "epoch": 15.940571939231457, + "grad_norm": 0.051513671875, + "learning_rate": 0.003610209751803823, + "loss": 0.7985, + "num_input_tokens_seen": 62139576, + "step": 107025 + }, + { + "epoch": 15.941316651772416, + "grad_norm": 0.04638671875, + "learning_rate": 0.0036089411734849296, + "loss": 0.8247, + "num_input_tokens_seen": 62142424, + "step": 107030 + }, + { + "epoch": 15.942061364313375, + "grad_norm": 0.0439453125, + "learning_rate": 0.0036076727876063917, + "loss": 0.8055, + "num_input_tokens_seen": 62145272, + "step": 107035 + }, + { + "epoch": 15.942806076854334, + "grad_norm": 0.03271484375, + "learning_rate": 0.0036064045941896486, + "loss": 0.8059, + "num_input_tokens_seen": 62148248, + "step": 107040 + }, + { + "epoch": 15.943550789395294, + "grad_norm": 0.1298828125, + "learning_rate": 0.003605136593256115, + "loss": 0.8356, + "num_input_tokens_seen": 62150968, + "step": 107045 + }, + { + "epoch": 15.944295501936253, + "grad_norm": 0.05078125, + "learning_rate": 0.003603868784827222, + "loss": 0.7981, + "num_input_tokens_seen": 62153912, + "step": 107050 + }, + { + "epoch": 15.945040214477212, + "grad_norm": 0.05078125, + "learning_rate": 0.003602601168924384, + "loss": 0.7857, + "num_input_tokens_seen": 62157272, + "step": 107055 + }, + { + "epoch": 15.94578492701817, + "grad_norm": 0.050537109375, + "learning_rate": 0.0036013337455690106, + "loss": 0.7833, + "num_input_tokens_seen": 62160024, + "step": 107060 + }, + { + "epoch": 15.946529639559131, + "grad_norm": 0.042236328125, + "learning_rate": 0.0036000665147825233, + "loss": 0.7944, + "num_input_tokens_seen": 62162936, + "step": 107065 + }, + { + "epoch": 15.94727435210009, + "grad_norm": 0.040771484375, + "learning_rate": 0.003598799476586321, + "loss": 0.8137, + "num_input_tokens_seen": 62165880, + "step": 107070 + }, + { + "epoch": 15.948019064641048, + "grad_norm": 0.050537109375, + "learning_rate": 0.0035975326310018173, + "loss": 0.7867, + "num_input_tokens_seen": 62168632, + "step": 107075 + }, + { + "epoch": 15.948763777182007, + "grad_norm": 0.033203125, + "learning_rate": 0.0035962659780504128, + "loss": 0.8048, + "num_input_tokens_seen": 62171576, + "step": 107080 + }, + { + "epoch": 15.949508489722968, + "grad_norm": 0.083984375, + "learning_rate": 0.0035949995177535026, + "loss": 0.814, + "num_input_tokens_seen": 62174488, + "step": 107085 + }, + { + "epoch": 15.950253202263927, + "grad_norm": 0.06884765625, + "learning_rate": 0.0035937332501324807, + "loss": 0.7705, + "num_input_tokens_seen": 62177112, + "step": 107090 + }, + { + "epoch": 15.950997914804885, + "grad_norm": 0.0869140625, + "learning_rate": 0.003592467175208747, + "loss": 0.8093, + "num_input_tokens_seen": 62179928, + "step": 107095 + }, + { + "epoch": 15.951742627345844, + "grad_norm": 0.04833984375, + "learning_rate": 0.003591201293003686, + "loss": 0.8095, + "num_input_tokens_seen": 62182872, + "step": 107100 + }, + { + "epoch": 15.952487339886805, + "grad_norm": 0.03369140625, + "learning_rate": 0.003589935603538679, + "loss": 0.7887, + "num_input_tokens_seen": 62185848, + "step": 107105 + }, + { + "epoch": 15.953232052427763, + "grad_norm": 0.044921875, + "learning_rate": 0.003588670106835119, + "loss": 0.804, + "num_input_tokens_seen": 62188632, + "step": 107110 + }, + { + "epoch": 15.953976764968722, + "grad_norm": 0.0277099609375, + "learning_rate": 0.003587404802914375, + "loss": 0.7769, + "num_input_tokens_seen": 62191480, + "step": 107115 + }, + { + "epoch": 15.95472147750968, + "grad_norm": 0.05517578125, + "learning_rate": 0.0035861396917978303, + "loss": 0.7737, + "num_input_tokens_seen": 62194488, + "step": 107120 + }, + { + "epoch": 15.95546619005064, + "grad_norm": 0.0245361328125, + "learning_rate": 0.003584874773506853, + "loss": 0.7981, + "num_input_tokens_seen": 62197304, + "step": 107125 + }, + { + "epoch": 15.9562109025916, + "grad_norm": 0.038330078125, + "learning_rate": 0.0035836100480628173, + "loss": 0.8007, + "num_input_tokens_seen": 62200344, + "step": 107130 + }, + { + "epoch": 15.956955615132559, + "grad_norm": 0.038818359375, + "learning_rate": 0.0035823455154870864, + "loss": 0.8051, + "num_input_tokens_seen": 62203224, + "step": 107135 + }, + { + "epoch": 15.957700327673518, + "grad_norm": 0.036865234375, + "learning_rate": 0.0035810811758010194, + "loss": 0.8, + "num_input_tokens_seen": 62206072, + "step": 107140 + }, + { + "epoch": 15.958445040214476, + "grad_norm": 0.0546875, + "learning_rate": 0.003579817029025987, + "loss": 0.7886, + "num_input_tokens_seen": 62209080, + "step": 107145 + }, + { + "epoch": 15.959189752755437, + "grad_norm": 0.0361328125, + "learning_rate": 0.003578553075183335, + "loss": 0.7858, + "num_input_tokens_seen": 62211832, + "step": 107150 + }, + { + "epoch": 15.959934465296396, + "grad_norm": 0.03466796875, + "learning_rate": 0.003577289314294424, + "loss": 0.7875, + "num_input_tokens_seen": 62214680, + "step": 107155 + }, + { + "epoch": 15.960679177837354, + "grad_norm": 0.04052734375, + "learning_rate": 0.003576025746380595, + "loss": 0.813, + "num_input_tokens_seen": 62217432, + "step": 107160 + }, + { + "epoch": 15.961423890378313, + "grad_norm": 0.05078125, + "learning_rate": 0.003574762371463205, + "loss": 0.7955, + "num_input_tokens_seen": 62220280, + "step": 107165 + }, + { + "epoch": 15.962168602919274, + "grad_norm": 0.039306640625, + "learning_rate": 0.003573499189563586, + "loss": 0.8032, + "num_input_tokens_seen": 62223160, + "step": 107170 + }, + { + "epoch": 15.962913315460233, + "grad_norm": 0.03125, + "learning_rate": 0.0035722362007030908, + "loss": 0.7992, + "num_input_tokens_seen": 62226104, + "step": 107175 + }, + { + "epoch": 15.963658028001191, + "grad_norm": 0.064453125, + "learning_rate": 0.0035709734049030515, + "loss": 0.7998, + "num_input_tokens_seen": 62229048, + "step": 107180 + }, + { + "epoch": 15.96440274054215, + "grad_norm": 0.037109375, + "learning_rate": 0.0035697108021847945, + "loss": 0.8102, + "num_input_tokens_seen": 62231800, + "step": 107185 + }, + { + "epoch": 15.96514745308311, + "grad_norm": 0.04931640625, + "learning_rate": 0.003568448392569662, + "loss": 0.777, + "num_input_tokens_seen": 62234584, + "step": 107190 + }, + { + "epoch": 15.96589216562407, + "grad_norm": 0.03515625, + "learning_rate": 0.00356718617607897, + "loss": 0.8022, + "num_input_tokens_seen": 62237432, + "step": 107195 + }, + { + "epoch": 15.966636878165028, + "grad_norm": 0.046142578125, + "learning_rate": 0.0035659241527340522, + "loss": 0.8118, + "num_input_tokens_seen": 62240344, + "step": 107200 + }, + { + "epoch": 15.967381590705987, + "grad_norm": 0.042236328125, + "learning_rate": 0.003564662322556222, + "loss": 0.8001, + "num_input_tokens_seen": 62243320, + "step": 107205 + }, + { + "epoch": 15.968126303246947, + "grad_norm": 0.04736328125, + "learning_rate": 0.0035634006855668012, + "loss": 0.776, + "num_input_tokens_seen": 62246392, + "step": 107210 + }, + { + "epoch": 15.968871015787906, + "grad_norm": 0.041015625, + "learning_rate": 0.0035621392417871044, + "loss": 0.7873, + "num_input_tokens_seen": 62249336, + "step": 107215 + }, + { + "epoch": 15.969615728328865, + "grad_norm": 0.036865234375, + "learning_rate": 0.0035608779912384386, + "loss": 0.7993, + "num_input_tokens_seen": 62252152, + "step": 107220 + }, + { + "epoch": 15.970360440869824, + "grad_norm": 0.0537109375, + "learning_rate": 0.0035596169339421136, + "loss": 0.8024, + "num_input_tokens_seen": 62254840, + "step": 107225 + }, + { + "epoch": 15.971105153410784, + "grad_norm": 0.06787109375, + "learning_rate": 0.0035583560699194287, + "loss": 0.8006, + "num_input_tokens_seen": 62257816, + "step": 107230 + }, + { + "epoch": 15.971849865951743, + "grad_norm": 0.0810546875, + "learning_rate": 0.0035570953991916924, + "loss": 0.7968, + "num_input_tokens_seen": 62260728, + "step": 107235 + }, + { + "epoch": 15.972594578492702, + "grad_norm": 0.0458984375, + "learning_rate": 0.0035558349217801976, + "loss": 0.8106, + "num_input_tokens_seen": 62263416, + "step": 107240 + }, + { + "epoch": 15.97333929103366, + "grad_norm": 0.0537109375, + "learning_rate": 0.003554574637706242, + "loss": 0.798, + "num_input_tokens_seen": 62266552, + "step": 107245 + }, + { + "epoch": 15.974084003574621, + "grad_norm": 0.04541015625, + "learning_rate": 0.0035533145469911136, + "loss": 0.7914, + "num_input_tokens_seen": 62269464, + "step": 107250 + }, + { + "epoch": 15.97482871611558, + "grad_norm": 0.0693359375, + "learning_rate": 0.003552054649656104, + "loss": 0.794, + "num_input_tokens_seen": 62272408, + "step": 107255 + }, + { + "epoch": 15.975573428656539, + "grad_norm": 0.04345703125, + "learning_rate": 0.003550794945722497, + "loss": 0.8023, + "num_input_tokens_seen": 62275256, + "step": 107260 + }, + { + "epoch": 15.976318141197497, + "grad_norm": 0.047607421875, + "learning_rate": 0.0035495354352115685, + "loss": 0.8038, + "num_input_tokens_seen": 62278264, + "step": 107265 + }, + { + "epoch": 15.977062853738456, + "grad_norm": 0.03271484375, + "learning_rate": 0.003548276118144606, + "loss": 0.7883, + "num_input_tokens_seen": 62281016, + "step": 107270 + }, + { + "epoch": 15.977807566279417, + "grad_norm": 0.06103515625, + "learning_rate": 0.0035470169945428735, + "loss": 0.791, + "num_input_tokens_seen": 62283608, + "step": 107275 + }, + { + "epoch": 15.978552278820375, + "grad_norm": 0.052001953125, + "learning_rate": 0.0035457580644276546, + "loss": 0.7951, + "num_input_tokens_seen": 62286584, + "step": 107280 + }, + { + "epoch": 15.979296991361334, + "grad_norm": 0.03271484375, + "learning_rate": 0.0035444993278202106, + "loss": 0.8049, + "num_input_tokens_seen": 62289496, + "step": 107285 + }, + { + "epoch": 15.980041703902295, + "grad_norm": 0.03662109375, + "learning_rate": 0.0035432407847418076, + "loss": 0.7997, + "num_input_tokens_seen": 62292536, + "step": 107290 + }, + { + "epoch": 15.980786416443253, + "grad_norm": 0.08056640625, + "learning_rate": 0.003541982435213704, + "loss": 0.8166, + "num_input_tokens_seen": 62295608, + "step": 107295 + }, + { + "epoch": 15.981531128984212, + "grad_norm": 0.03515625, + "learning_rate": 0.003540724279257166, + "loss": 0.8038, + "num_input_tokens_seen": 62299256, + "step": 107300 + }, + { + "epoch": 15.982275841525171, + "grad_norm": 0.034912109375, + "learning_rate": 0.003539466316893444, + "loss": 0.8097, + "num_input_tokens_seen": 62302328, + "step": 107305 + }, + { + "epoch": 15.98302055406613, + "grad_norm": 0.03955078125, + "learning_rate": 0.0035382085481437875, + "loss": 0.798, + "num_input_tokens_seen": 62305464, + "step": 107310 + }, + { + "epoch": 15.98376526660709, + "grad_norm": 0.046142578125, + "learning_rate": 0.00353695097302945, + "loss": 0.8084, + "num_input_tokens_seen": 62308376, + "step": 107315 + }, + { + "epoch": 15.984509979148049, + "grad_norm": 0.048583984375, + "learning_rate": 0.0035356935915716735, + "loss": 0.8237, + "num_input_tokens_seen": 62311096, + "step": 107320 + }, + { + "epoch": 15.985254691689008, + "grad_norm": 0.1279296875, + "learning_rate": 0.0035344364037917055, + "loss": 0.8274, + "num_input_tokens_seen": 62314008, + "step": 107325 + }, + { + "epoch": 15.985999404229966, + "grad_norm": 0.068359375, + "learning_rate": 0.0035331794097107766, + "loss": 0.8056, + "num_input_tokens_seen": 62317112, + "step": 107330 + }, + { + "epoch": 15.986744116770927, + "grad_norm": 0.039306640625, + "learning_rate": 0.003531922609350132, + "loss": 0.7974, + "num_input_tokens_seen": 62319928, + "step": 107335 + }, + { + "epoch": 15.987488829311886, + "grad_norm": 0.041259765625, + "learning_rate": 0.0035306660027309986, + "loss": 0.7751, + "num_input_tokens_seen": 62322872, + "step": 107340 + }, + { + "epoch": 15.988233541852845, + "grad_norm": 0.07275390625, + "learning_rate": 0.003529409589874603, + "loss": 0.8207, + "num_input_tokens_seen": 62325912, + "step": 107345 + }, + { + "epoch": 15.988978254393803, + "grad_norm": 0.050537109375, + "learning_rate": 0.0035281533708021776, + "loss": 0.7922, + "num_input_tokens_seen": 62328952, + "step": 107350 + }, + { + "epoch": 15.989722966934764, + "grad_norm": 0.04248046875, + "learning_rate": 0.003526897345534943, + "loss": 0.7641, + "num_input_tokens_seen": 62331960, + "step": 107355 + }, + { + "epoch": 15.990467679475723, + "grad_norm": 0.080078125, + "learning_rate": 0.0035256415140941157, + "loss": 0.7872, + "num_input_tokens_seen": 62334648, + "step": 107360 + }, + { + "epoch": 15.991212392016681, + "grad_norm": 0.05224609375, + "learning_rate": 0.003524385876500909, + "loss": 0.7899, + "num_input_tokens_seen": 62337464, + "step": 107365 + }, + { + "epoch": 15.99195710455764, + "grad_norm": 0.034912109375, + "learning_rate": 0.0035231304327765427, + "loss": 0.8094, + "num_input_tokens_seen": 62340408, + "step": 107370 + }, + { + "epoch": 15.9927018170986, + "grad_norm": 0.025146484375, + "learning_rate": 0.0035218751829422207, + "loss": 0.7914, + "num_input_tokens_seen": 62343288, + "step": 107375 + }, + { + "epoch": 15.99344652963956, + "grad_norm": 0.0439453125, + "learning_rate": 0.0035206201270191555, + "loss": 0.8041, + "num_input_tokens_seen": 62346360, + "step": 107380 + }, + { + "epoch": 15.994191242180518, + "grad_norm": 0.057373046875, + "learning_rate": 0.0035193652650285455, + "loss": 0.7904, + "num_input_tokens_seen": 62349432, + "step": 107385 + }, + { + "epoch": 15.994935954721477, + "grad_norm": 0.03466796875, + "learning_rate": 0.003518110596991587, + "loss": 0.7984, + "num_input_tokens_seen": 62352504, + "step": 107390 + }, + { + "epoch": 15.995680667262437, + "grad_norm": 0.042236328125, + "learning_rate": 0.003516856122929485, + "loss": 0.7709, + "num_input_tokens_seen": 62355448, + "step": 107395 + }, + { + "epoch": 15.996425379803396, + "grad_norm": 0.06396484375, + "learning_rate": 0.0035156018428634233, + "loss": 0.8093, + "num_input_tokens_seen": 62358392, + "step": 107400 + }, + { + "epoch": 15.997170092344355, + "grad_norm": 0.0576171875, + "learning_rate": 0.003514347756814599, + "loss": 0.7902, + "num_input_tokens_seen": 62361144, + "step": 107405 + }, + { + "epoch": 15.997914804885314, + "grad_norm": 0.04541015625, + "learning_rate": 0.0035130938648041984, + "loss": 0.8067, + "num_input_tokens_seen": 62363768, + "step": 107410 + }, + { + "epoch": 15.998659517426274, + "grad_norm": 0.049560546875, + "learning_rate": 0.0035118401668534, + "loss": 0.7987, + "num_input_tokens_seen": 62366360, + "step": 107415 + }, + { + "epoch": 15.999404229967233, + "grad_norm": 0.05224609375, + "learning_rate": 0.0035105866629833826, + "loss": 0.7927, + "num_input_tokens_seen": 62369464, + "step": 107420 + }, + { + "epoch": 16.0, + "eval_loss": 0.7989339828491211, + "eval_runtime": 70.497, + "eval_samples_per_second": 42.328, + "eval_steps_per_second": 10.582, + "num_input_tokens_seen": 62371472, + "step": 107424 + }, + { + "epoch": 16.000148942508194, + "grad_norm": 0.0888671875, + "learning_rate": 0.0035093333532153314, + "loss": 0.8017, + "num_input_tokens_seen": 62372016, + "step": 107425 + }, + { + "epoch": 16.00089365504915, + "grad_norm": 0.04443359375, + "learning_rate": 0.0035080802375704145, + "loss": 0.8031, + "num_input_tokens_seen": 62374608, + "step": 107430 + }, + { + "epoch": 16.00163836759011, + "grad_norm": 0.03369140625, + "learning_rate": 0.0035068273160697963, + "loss": 0.7848, + "num_input_tokens_seen": 62377584, + "step": 107435 + }, + { + "epoch": 16.002383080131068, + "grad_norm": 0.035400390625, + "learning_rate": 0.003505574588734654, + "loss": 0.7901, + "num_input_tokens_seen": 62380464, + "step": 107440 + }, + { + "epoch": 16.00312779267203, + "grad_norm": 0.035888671875, + "learning_rate": 0.003504322055586143, + "loss": 0.7923, + "num_input_tokens_seen": 62383312, + "step": 107445 + }, + { + "epoch": 16.00387250521299, + "grad_norm": 0.03759765625, + "learning_rate": 0.003503069716645432, + "loss": 0.8158, + "num_input_tokens_seen": 62386320, + "step": 107450 + }, + { + "epoch": 16.004617217753946, + "grad_norm": 0.02685546875, + "learning_rate": 0.0035018175719336675, + "loss": 0.8035, + "num_input_tokens_seen": 62389168, + "step": 107455 + }, + { + "epoch": 16.005361930294907, + "grad_norm": 0.037109375, + "learning_rate": 0.003500565621472014, + "loss": 0.7946, + "num_input_tokens_seen": 62392144, + "step": 107460 + }, + { + "epoch": 16.006106642835864, + "grad_norm": 0.0269775390625, + "learning_rate": 0.003499313865281617, + "loss": 0.8186, + "num_input_tokens_seen": 62395568, + "step": 107465 + }, + { + "epoch": 16.006851355376824, + "grad_norm": 0.047119140625, + "learning_rate": 0.00349806230338362, + "loss": 0.7907, + "num_input_tokens_seen": 62398448, + "step": 107470 + }, + { + "epoch": 16.007596067917785, + "grad_norm": 0.032470703125, + "learning_rate": 0.0034968109357991725, + "loss": 0.7926, + "num_input_tokens_seen": 62401232, + "step": 107475 + }, + { + "epoch": 16.00834078045874, + "grad_norm": 0.05078125, + "learning_rate": 0.003495559762549414, + "loss": 0.7749, + "num_input_tokens_seen": 62404016, + "step": 107480 + }, + { + "epoch": 16.009085492999702, + "grad_norm": 0.03662109375, + "learning_rate": 0.0034943087836554814, + "loss": 0.7966, + "num_input_tokens_seen": 62406800, + "step": 107485 + }, + { + "epoch": 16.009830205540663, + "grad_norm": 0.0277099609375, + "learning_rate": 0.003493057999138502, + "loss": 0.7899, + "num_input_tokens_seen": 62409648, + "step": 107490 + }, + { + "epoch": 16.01057491808162, + "grad_norm": 0.032958984375, + "learning_rate": 0.003491807409019618, + "loss": 0.8006, + "num_input_tokens_seen": 62412624, + "step": 107495 + }, + { + "epoch": 16.01131963062258, + "grad_norm": 0.034423828125, + "learning_rate": 0.003490557013319952, + "loss": 0.7741, + "num_input_tokens_seen": 62415536, + "step": 107500 + }, + { + "epoch": 16.012064343163537, + "grad_norm": 0.050537109375, + "learning_rate": 0.003489306812060623, + "loss": 0.7899, + "num_input_tokens_seen": 62418448, + "step": 107505 + }, + { + "epoch": 16.012809055704498, + "grad_norm": 0.03857421875, + "learning_rate": 0.003488056805262759, + "loss": 0.8137, + "num_input_tokens_seen": 62421136, + "step": 107510 + }, + { + "epoch": 16.01355376824546, + "grad_norm": 0.0380859375, + "learning_rate": 0.003486806992947472, + "loss": 0.8119, + "num_input_tokens_seen": 62424400, + "step": 107515 + }, + { + "epoch": 16.014298480786415, + "grad_norm": 0.0380859375, + "learning_rate": 0.003485557375135885, + "loss": 0.8104, + "num_input_tokens_seen": 62427152, + "step": 107520 + }, + { + "epoch": 16.015043193327376, + "grad_norm": 0.050048828125, + "learning_rate": 0.0034843079518490972, + "loss": 0.7886, + "num_input_tokens_seen": 62429936, + "step": 107525 + }, + { + "epoch": 16.015787905868336, + "grad_norm": 0.04052734375, + "learning_rate": 0.003483058723108227, + "loss": 0.7963, + "num_input_tokens_seen": 62432976, + "step": 107530 + }, + { + "epoch": 16.016532618409293, + "grad_norm": 0.03662109375, + "learning_rate": 0.003481809688934371, + "loss": 0.7901, + "num_input_tokens_seen": 62435568, + "step": 107535 + }, + { + "epoch": 16.017277330950254, + "grad_norm": 0.042724609375, + "learning_rate": 0.0034805608493486376, + "loss": 0.7897, + "num_input_tokens_seen": 62438288, + "step": 107540 + }, + { + "epoch": 16.01802204349121, + "grad_norm": 0.048095703125, + "learning_rate": 0.0034793122043721213, + "loss": 0.766, + "num_input_tokens_seen": 62441264, + "step": 107545 + }, + { + "epoch": 16.01876675603217, + "grad_norm": 0.0478515625, + "learning_rate": 0.003478063754025915, + "loss": 0.8006, + "num_input_tokens_seen": 62444432, + "step": 107550 + }, + { + "epoch": 16.019511468573132, + "grad_norm": 0.035400390625, + "learning_rate": 0.0034768154983311123, + "loss": 0.7993, + "num_input_tokens_seen": 62447440, + "step": 107555 + }, + { + "epoch": 16.02025618111409, + "grad_norm": 0.046875, + "learning_rate": 0.0034755674373087957, + "loss": 0.8018, + "num_input_tokens_seen": 62450224, + "step": 107560 + }, + { + "epoch": 16.02100089365505, + "grad_norm": 0.04345703125, + "learning_rate": 0.003474319570980059, + "loss": 0.7874, + "num_input_tokens_seen": 62452912, + "step": 107565 + }, + { + "epoch": 16.02174560619601, + "grad_norm": 0.04443359375, + "learning_rate": 0.003473071899365973, + "loss": 0.7937, + "num_input_tokens_seen": 62456208, + "step": 107570 + }, + { + "epoch": 16.022490318736967, + "grad_norm": 0.044921875, + "learning_rate": 0.0034718244224876283, + "loss": 0.7967, + "num_input_tokens_seen": 62459344, + "step": 107575 + }, + { + "epoch": 16.023235031277927, + "grad_norm": 0.038330078125, + "learning_rate": 0.0034705771403660923, + "loss": 0.7848, + "num_input_tokens_seen": 62462448, + "step": 107580 + }, + { + "epoch": 16.023979743818884, + "grad_norm": 0.049560546875, + "learning_rate": 0.003469330053022432, + "loss": 0.7992, + "num_input_tokens_seen": 62465264, + "step": 107585 + }, + { + "epoch": 16.024724456359845, + "grad_norm": 0.039306640625, + "learning_rate": 0.003468083160477727, + "loss": 0.793, + "num_input_tokens_seen": 62468496, + "step": 107590 + }, + { + "epoch": 16.025469168900806, + "grad_norm": 0.04833984375, + "learning_rate": 0.0034668364627530317, + "loss": 0.7856, + "num_input_tokens_seen": 62471696, + "step": 107595 + }, + { + "epoch": 16.026213881441763, + "grad_norm": 0.0537109375, + "learning_rate": 0.003465589959869417, + "loss": 0.8187, + "num_input_tokens_seen": 62474512, + "step": 107600 + }, + { + "epoch": 16.026958593982723, + "grad_norm": 0.030029296875, + "learning_rate": 0.003464343651847933, + "loss": 0.7887, + "num_input_tokens_seen": 62477424, + "step": 107605 + }, + { + "epoch": 16.027703306523684, + "grad_norm": 0.0361328125, + "learning_rate": 0.0034630975387096425, + "loss": 0.7915, + "num_input_tokens_seen": 62480240, + "step": 107610 + }, + { + "epoch": 16.02844801906464, + "grad_norm": 0.060302734375, + "learning_rate": 0.0034618516204755925, + "loss": 0.8206, + "num_input_tokens_seen": 62483056, + "step": 107615 + }, + { + "epoch": 16.0291927316056, + "grad_norm": 0.05029296875, + "learning_rate": 0.003460605897166834, + "loss": 0.785, + "num_input_tokens_seen": 62486032, + "step": 107620 + }, + { + "epoch": 16.029937444146558, + "grad_norm": 0.042236328125, + "learning_rate": 0.003459360368804411, + "loss": 0.8172, + "num_input_tokens_seen": 62488880, + "step": 107625 + }, + { + "epoch": 16.03068215668752, + "grad_norm": 0.034912109375, + "learning_rate": 0.0034581150354093602, + "loss": 0.7767, + "num_input_tokens_seen": 62491952, + "step": 107630 + }, + { + "epoch": 16.03142686922848, + "grad_norm": 0.057373046875, + "learning_rate": 0.003456869897002728, + "loss": 0.8117, + "num_input_tokens_seen": 62494800, + "step": 107635 + }, + { + "epoch": 16.032171581769436, + "grad_norm": 0.02734375, + "learning_rate": 0.003455624953605545, + "loss": 0.8032, + "num_input_tokens_seen": 62497584, + "step": 107640 + }, + { + "epoch": 16.032916294310397, + "grad_norm": 0.05224609375, + "learning_rate": 0.0034543802052388484, + "loss": 0.7914, + "num_input_tokens_seen": 62501136, + "step": 107645 + }, + { + "epoch": 16.033661006851354, + "grad_norm": 0.02685546875, + "learning_rate": 0.00345313565192366, + "loss": 0.804, + "num_input_tokens_seen": 62504112, + "step": 107650 + }, + { + "epoch": 16.034405719392314, + "grad_norm": 0.040283203125, + "learning_rate": 0.003451891293681013, + "loss": 0.8069, + "num_input_tokens_seen": 62506640, + "step": 107655 + }, + { + "epoch": 16.035150431933275, + "grad_norm": 0.043701171875, + "learning_rate": 0.003450647130531926, + "loss": 0.7897, + "num_input_tokens_seen": 62509392, + "step": 107660 + }, + { + "epoch": 16.03589514447423, + "grad_norm": 0.07861328125, + "learning_rate": 0.003449403162497413, + "loss": 0.7867, + "num_input_tokens_seen": 62512464, + "step": 107665 + }, + { + "epoch": 16.036639857015192, + "grad_norm": 0.026611328125, + "learning_rate": 0.0034481593895984977, + "loss": 0.8327, + "num_input_tokens_seen": 62515312, + "step": 107670 + }, + { + "epoch": 16.037384569556153, + "grad_norm": 0.0849609375, + "learning_rate": 0.003446915811856185, + "loss": 0.8053, + "num_input_tokens_seen": 62518192, + "step": 107675 + }, + { + "epoch": 16.03812928209711, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0034456724292914937, + "loss": 0.7906, + "num_input_tokens_seen": 62521232, + "step": 107680 + }, + { + "epoch": 16.03887399463807, + "grad_norm": 0.033203125, + "learning_rate": 0.003444429241925421, + "loss": 0.8085, + "num_input_tokens_seen": 62524176, + "step": 107685 + }, + { + "epoch": 16.039618707179027, + "grad_norm": 0.03076171875, + "learning_rate": 0.003443186249778973, + "loss": 0.8095, + "num_input_tokens_seen": 62527568, + "step": 107690 + }, + { + "epoch": 16.040363419719988, + "grad_norm": 0.053466796875, + "learning_rate": 0.003441943452873143, + "loss": 0.7997, + "num_input_tokens_seen": 62530288, + "step": 107695 + }, + { + "epoch": 16.04110813226095, + "grad_norm": 0.06884765625, + "learning_rate": 0.0034407008512289342, + "loss": 0.8056, + "num_input_tokens_seen": 62533040, + "step": 107700 + }, + { + "epoch": 16.041852844801905, + "grad_norm": 0.0419921875, + "learning_rate": 0.0034394584448673377, + "loss": 0.795, + "num_input_tokens_seen": 62536272, + "step": 107705 + }, + { + "epoch": 16.042597557342866, + "grad_norm": 0.041748046875, + "learning_rate": 0.0034382162338093356, + "loss": 0.7881, + "num_input_tokens_seen": 62539056, + "step": 107710 + }, + { + "epoch": 16.043342269883826, + "grad_norm": 0.039306640625, + "learning_rate": 0.0034369742180759256, + "loss": 0.8298, + "num_input_tokens_seen": 62541776, + "step": 107715 + }, + { + "epoch": 16.044086982424783, + "grad_norm": 0.039794921875, + "learning_rate": 0.003435732397688078, + "loss": 0.7769, + "num_input_tokens_seen": 62545008, + "step": 107720 + }, + { + "epoch": 16.044831694965744, + "grad_norm": 0.05908203125, + "learning_rate": 0.0034344907726667822, + "loss": 0.7932, + "num_input_tokens_seen": 62548080, + "step": 107725 + }, + { + "epoch": 16.0455764075067, + "grad_norm": 0.047607421875, + "learning_rate": 0.003433249343033007, + "loss": 0.8292, + "num_input_tokens_seen": 62550704, + "step": 107730 + }, + { + "epoch": 16.04632112004766, + "grad_norm": 0.04296875, + "learning_rate": 0.0034320081088077318, + "loss": 0.7847, + "num_input_tokens_seen": 62553456, + "step": 107735 + }, + { + "epoch": 16.047065832588622, + "grad_norm": 0.048583984375, + "learning_rate": 0.003430767070011924, + "loss": 0.784, + "num_input_tokens_seen": 62556464, + "step": 107740 + }, + { + "epoch": 16.04781054512958, + "grad_norm": 0.04248046875, + "learning_rate": 0.0034295262266665454, + "loss": 0.7791, + "num_input_tokens_seen": 62559408, + "step": 107745 + }, + { + "epoch": 16.04855525767054, + "grad_norm": 0.050537109375, + "learning_rate": 0.003428285578792559, + "loss": 0.7785, + "num_input_tokens_seen": 62562128, + "step": 107750 + }, + { + "epoch": 16.0492999702115, + "grad_norm": 0.034423828125, + "learning_rate": 0.0034270451264109317, + "loss": 0.81, + "num_input_tokens_seen": 62565008, + "step": 107755 + }, + { + "epoch": 16.050044682752457, + "grad_norm": 0.0546875, + "learning_rate": 0.0034258048695426134, + "loss": 0.8074, + "num_input_tokens_seen": 62567792, + "step": 107760 + }, + { + "epoch": 16.050789395293418, + "grad_norm": 0.034423828125, + "learning_rate": 0.0034245648082085536, + "loss": 0.787, + "num_input_tokens_seen": 62570576, + "step": 107765 + }, + { + "epoch": 16.051534107834375, + "grad_norm": 0.048583984375, + "learning_rate": 0.003423324942429713, + "loss": 0.7976, + "num_input_tokens_seen": 62573200, + "step": 107770 + }, + { + "epoch": 16.052278820375335, + "grad_norm": 0.05322265625, + "learning_rate": 0.0034220852722270257, + "loss": 0.8014, + "num_input_tokens_seen": 62576720, + "step": 107775 + }, + { + "epoch": 16.053023532916296, + "grad_norm": 0.054443359375, + "learning_rate": 0.003420845797621445, + "loss": 0.8156, + "num_input_tokens_seen": 62579440, + "step": 107780 + }, + { + "epoch": 16.053768245457253, + "grad_norm": 0.03955078125, + "learning_rate": 0.003419606518633906, + "loss": 0.8107, + "num_input_tokens_seen": 62582384, + "step": 107785 + }, + { + "epoch": 16.054512957998213, + "grad_norm": 0.0546875, + "learning_rate": 0.003418367435285341, + "loss": 0.7955, + "num_input_tokens_seen": 62585328, + "step": 107790 + }, + { + "epoch": 16.055257670539174, + "grad_norm": 0.06591796875, + "learning_rate": 0.0034171285475966904, + "loss": 0.7966, + "num_input_tokens_seen": 62588176, + "step": 107795 + }, + { + "epoch": 16.05600238308013, + "grad_norm": 0.041015625, + "learning_rate": 0.003415889855588878, + "loss": 0.7794, + "num_input_tokens_seen": 62591120, + "step": 107800 + }, + { + "epoch": 16.05674709562109, + "grad_norm": 0.054931640625, + "learning_rate": 0.003414651359282836, + "loss": 0.787, + "num_input_tokens_seen": 62594160, + "step": 107805 + }, + { + "epoch": 16.057491808162048, + "grad_norm": 0.041748046875, + "learning_rate": 0.0034134130586994846, + "loss": 0.8003, + "num_input_tokens_seen": 62596784, + "step": 107810 + }, + { + "epoch": 16.05823652070301, + "grad_norm": 0.040283203125, + "learning_rate": 0.003412174953859745, + "loss": 0.7889, + "num_input_tokens_seen": 62599664, + "step": 107815 + }, + { + "epoch": 16.05898123324397, + "grad_norm": 0.05029296875, + "learning_rate": 0.00341093704478453, + "loss": 0.7858, + "num_input_tokens_seen": 62602512, + "step": 107820 + }, + { + "epoch": 16.059725945784926, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0034096993314947526, + "loss": 0.7737, + "num_input_tokens_seen": 62605488, + "step": 107825 + }, + { + "epoch": 16.060470658325887, + "grad_norm": 0.03515625, + "learning_rate": 0.003408461814011328, + "loss": 0.7926, + "num_input_tokens_seen": 62608080, + "step": 107830 + }, + { + "epoch": 16.061215370866844, + "grad_norm": 0.0272216796875, + "learning_rate": 0.003407224492355156, + "loss": 0.8025, + "num_input_tokens_seen": 62610768, + "step": 107835 + }, + { + "epoch": 16.061960083407804, + "grad_norm": 0.04736328125, + "learning_rate": 0.003405987366547149, + "loss": 0.7953, + "num_input_tokens_seen": 62613936, + "step": 107840 + }, + { + "epoch": 16.062704795948765, + "grad_norm": 0.0703125, + "learning_rate": 0.0034047504366081977, + "loss": 0.7865, + "num_input_tokens_seen": 62616976, + "step": 107845 + }, + { + "epoch": 16.06344950848972, + "grad_norm": 0.05224609375, + "learning_rate": 0.003403513702559205, + "loss": 0.8043, + "num_input_tokens_seen": 62620080, + "step": 107850 + }, + { + "epoch": 16.064194221030682, + "grad_norm": 0.0308837890625, + "learning_rate": 0.003402277164421061, + "loss": 0.7904, + "num_input_tokens_seen": 62623024, + "step": 107855 + }, + { + "epoch": 16.064938933571643, + "grad_norm": 0.04248046875, + "learning_rate": 0.0034010408222146585, + "loss": 0.7888, + "num_input_tokens_seen": 62626064, + "step": 107860 + }, + { + "epoch": 16.0656836461126, + "grad_norm": 0.04443359375, + "learning_rate": 0.003399804675960884, + "loss": 0.7895, + "num_input_tokens_seen": 62629104, + "step": 107865 + }, + { + "epoch": 16.06642835865356, + "grad_norm": 0.05224609375, + "learning_rate": 0.0033985687256806152, + "loss": 0.7974, + "num_input_tokens_seen": 62631920, + "step": 107870 + }, + { + "epoch": 16.067173071194517, + "grad_norm": 0.035400390625, + "learning_rate": 0.00339733297139474, + "loss": 0.7932, + "num_input_tokens_seen": 62634864, + "step": 107875 + }, + { + "epoch": 16.067917783735478, + "grad_norm": 0.039794921875, + "learning_rate": 0.003396097413124133, + "loss": 0.8268, + "num_input_tokens_seen": 62637552, + "step": 107880 + }, + { + "epoch": 16.06866249627644, + "grad_norm": 0.028076171875, + "learning_rate": 0.003394862050889666, + "loss": 0.7933, + "num_input_tokens_seen": 62640560, + "step": 107885 + }, + { + "epoch": 16.069407208817395, + "grad_norm": 0.07177734375, + "learning_rate": 0.003393626884712205, + "loss": 0.7848, + "num_input_tokens_seen": 62643504, + "step": 107890 + }, + { + "epoch": 16.070151921358356, + "grad_norm": 0.0546875, + "learning_rate": 0.0033923919146126255, + "loss": 0.8084, + "num_input_tokens_seen": 62646640, + "step": 107895 + }, + { + "epoch": 16.070896633899316, + "grad_norm": 0.0252685546875, + "learning_rate": 0.003391157140611784, + "loss": 0.792, + "num_input_tokens_seen": 62649456, + "step": 107900 + }, + { + "epoch": 16.071641346440273, + "grad_norm": 0.054443359375, + "learning_rate": 0.003389922562730548, + "loss": 0.8078, + "num_input_tokens_seen": 62652560, + "step": 107905 + }, + { + "epoch": 16.072386058981234, + "grad_norm": 0.0458984375, + "learning_rate": 0.0033886881809897704, + "loss": 0.8059, + "num_input_tokens_seen": 62655440, + "step": 107910 + }, + { + "epoch": 16.07313077152219, + "grad_norm": 0.034912109375, + "learning_rate": 0.0033874539954103005, + "loss": 0.8094, + "num_input_tokens_seen": 62658192, + "step": 107915 + }, + { + "epoch": 16.07387548406315, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0033862200060129958, + "loss": 0.7956, + "num_input_tokens_seen": 62660880, + "step": 107920 + }, + { + "epoch": 16.074620196604112, + "grad_norm": 0.033203125, + "learning_rate": 0.0033849862128186967, + "loss": 0.8027, + "num_input_tokens_seen": 62663664, + "step": 107925 + }, + { + "epoch": 16.07536490914507, + "grad_norm": 0.078125, + "learning_rate": 0.003383752615848257, + "loss": 0.786, + "num_input_tokens_seen": 62666672, + "step": 107930 + }, + { + "epoch": 16.07610962168603, + "grad_norm": 0.04345703125, + "learning_rate": 0.0033825192151225036, + "loss": 0.7936, + "num_input_tokens_seen": 62669456, + "step": 107935 + }, + { + "epoch": 16.07685433422699, + "grad_norm": 0.039794921875, + "learning_rate": 0.003381286010662286, + "loss": 0.8019, + "num_input_tokens_seen": 62672336, + "step": 107940 + }, + { + "epoch": 16.077599046767947, + "grad_norm": 0.04736328125, + "learning_rate": 0.0033800530024884323, + "loss": 0.8082, + "num_input_tokens_seen": 62675248, + "step": 107945 + }, + { + "epoch": 16.078343759308908, + "grad_norm": 0.045166015625, + "learning_rate": 0.003378820190621774, + "loss": 0.7935, + "num_input_tokens_seen": 62678064, + "step": 107950 + }, + { + "epoch": 16.079088471849865, + "grad_norm": 0.05029296875, + "learning_rate": 0.003377587575083136, + "loss": 0.7932, + "num_input_tokens_seen": 62681168, + "step": 107955 + }, + { + "epoch": 16.079833184390825, + "grad_norm": 0.0263671875, + "learning_rate": 0.0033763551558933405, + "loss": 0.7846, + "num_input_tokens_seen": 62683664, + "step": 107960 + }, + { + "epoch": 16.080577896931786, + "grad_norm": 0.036865234375, + "learning_rate": 0.003375122933073214, + "loss": 0.8, + "num_input_tokens_seen": 62686672, + "step": 107965 + }, + { + "epoch": 16.081322609472743, + "grad_norm": 0.037109375, + "learning_rate": 0.003373890906643568, + "loss": 0.7901, + "num_input_tokens_seen": 62689680, + "step": 107970 + }, + { + "epoch": 16.082067322013703, + "grad_norm": 0.04931640625, + "learning_rate": 0.0033726590766252206, + "loss": 0.7951, + "num_input_tokens_seen": 62692368, + "step": 107975 + }, + { + "epoch": 16.082812034554664, + "grad_norm": 0.04248046875, + "learning_rate": 0.003371427443038979, + "loss": 0.8108, + "num_input_tokens_seen": 62695472, + "step": 107980 + }, + { + "epoch": 16.08355674709562, + "grad_norm": 0.047119140625, + "learning_rate": 0.0033701960059056547, + "loss": 0.7975, + "num_input_tokens_seen": 62698288, + "step": 107985 + }, + { + "epoch": 16.08430145963658, + "grad_norm": 0.056640625, + "learning_rate": 0.003368964765246049, + "loss": 0.8107, + "num_input_tokens_seen": 62701040, + "step": 107990 + }, + { + "epoch": 16.085046172177538, + "grad_norm": 0.0908203125, + "learning_rate": 0.0033677337210809575, + "loss": 0.8335, + "num_input_tokens_seen": 62703984, + "step": 107995 + }, + { + "epoch": 16.0857908847185, + "grad_norm": 0.038330078125, + "learning_rate": 0.0033665028734311867, + "loss": 0.7953, + "num_input_tokens_seen": 62706736, + "step": 108000 + }, + { + "epoch": 16.08653559725946, + "grad_norm": 0.07177734375, + "learning_rate": 0.003365272222317523, + "loss": 0.811, + "num_input_tokens_seen": 62709296, + "step": 108005 + }, + { + "epoch": 16.087280309800416, + "grad_norm": 0.037353515625, + "learning_rate": 0.003364041767760764, + "loss": 0.8042, + "num_input_tokens_seen": 62712272, + "step": 108010 + }, + { + "epoch": 16.088025022341377, + "grad_norm": 0.038818359375, + "learning_rate": 0.0033628115097816924, + "loss": 0.7952, + "num_input_tokens_seen": 62715184, + "step": 108015 + }, + { + "epoch": 16.088769734882334, + "grad_norm": 0.0341796875, + "learning_rate": 0.003361581448401093, + "loss": 0.7976, + "num_input_tokens_seen": 62718288, + "step": 108020 + }, + { + "epoch": 16.089514447423294, + "grad_norm": 0.045166015625, + "learning_rate": 0.003360351583639746, + "loss": 0.7811, + "num_input_tokens_seen": 62721168, + "step": 108025 + }, + { + "epoch": 16.090259159964255, + "grad_norm": 0.03857421875, + "learning_rate": 0.0033591219155184255, + "loss": 0.7734, + "num_input_tokens_seen": 62723728, + "step": 108030 + }, + { + "epoch": 16.091003872505212, + "grad_norm": 0.052490234375, + "learning_rate": 0.0033578924440579136, + "loss": 0.793, + "num_input_tokens_seen": 62726768, + "step": 108035 + }, + { + "epoch": 16.091748585046172, + "grad_norm": 0.0576171875, + "learning_rate": 0.0033566631692789717, + "loss": 0.7855, + "num_input_tokens_seen": 62729616, + "step": 108040 + }, + { + "epoch": 16.092493297587133, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0033554340912023755, + "loss": 0.794, + "num_input_tokens_seen": 62732432, + "step": 108045 + }, + { + "epoch": 16.09323801012809, + "grad_norm": 0.035400390625, + "learning_rate": 0.0033542052098488803, + "loss": 0.8142, + "num_input_tokens_seen": 62735440, + "step": 108050 + }, + { + "epoch": 16.09398272266905, + "grad_norm": 0.058837890625, + "learning_rate": 0.0033529765252392585, + "loss": 0.8062, + "num_input_tokens_seen": 62738288, + "step": 108055 + }, + { + "epoch": 16.094727435210007, + "grad_norm": 0.05419921875, + "learning_rate": 0.0033517480373942542, + "loss": 0.7981, + "num_input_tokens_seen": 62741008, + "step": 108060 + }, + { + "epoch": 16.095472147750968, + "grad_norm": 0.04248046875, + "learning_rate": 0.003350519746334633, + "loss": 0.8055, + "num_input_tokens_seen": 62744048, + "step": 108065 + }, + { + "epoch": 16.09621686029193, + "grad_norm": 0.036376953125, + "learning_rate": 0.0033492916520811416, + "loss": 0.8165, + "num_input_tokens_seen": 62747376, + "step": 108070 + }, + { + "epoch": 16.096961572832885, + "grad_norm": 0.033935546875, + "learning_rate": 0.0033480637546545247, + "loss": 0.7839, + "num_input_tokens_seen": 62750160, + "step": 108075 + }, + { + "epoch": 16.097706285373846, + "grad_norm": 0.059326171875, + "learning_rate": 0.0033468360540755263, + "loss": 0.8018, + "num_input_tokens_seen": 62753104, + "step": 108080 + }, + { + "epoch": 16.098450997914806, + "grad_norm": 0.05908203125, + "learning_rate": 0.003345608550364891, + "loss": 0.8147, + "num_input_tokens_seen": 62755984, + "step": 108085 + }, + { + "epoch": 16.099195710455763, + "grad_norm": 0.10888671875, + "learning_rate": 0.003344381243543356, + "loss": 0.8228, + "num_input_tokens_seen": 62759024, + "step": 108090 + }, + { + "epoch": 16.099940422996724, + "grad_norm": 0.0380859375, + "learning_rate": 0.0033431541336316492, + "loss": 0.7724, + "num_input_tokens_seen": 62762000, + "step": 108095 + }, + { + "epoch": 16.10068513553768, + "grad_norm": 0.033935546875, + "learning_rate": 0.0033419272206505093, + "loss": 0.8037, + "num_input_tokens_seen": 62764688, + "step": 108100 + }, + { + "epoch": 16.10142984807864, + "grad_norm": 0.045166015625, + "learning_rate": 0.0033407005046206598, + "loss": 0.7849, + "num_input_tokens_seen": 62767440, + "step": 108105 + }, + { + "epoch": 16.102174560619602, + "grad_norm": 0.05712890625, + "learning_rate": 0.0033394739855628222, + "loss": 0.8086, + "num_input_tokens_seen": 62770224, + "step": 108110 + }, + { + "epoch": 16.10291927316056, + "grad_norm": 0.050048828125, + "learning_rate": 0.0033382476634977237, + "loss": 0.7985, + "num_input_tokens_seen": 62773168, + "step": 108115 + }, + { + "epoch": 16.10366398570152, + "grad_norm": 0.045166015625, + "learning_rate": 0.0033370215384460754, + "loss": 0.7794, + "num_input_tokens_seen": 62775888, + "step": 108120 + }, + { + "epoch": 16.10440869824248, + "grad_norm": 0.05224609375, + "learning_rate": 0.0033357956104285978, + "loss": 0.795, + "num_input_tokens_seen": 62778672, + "step": 108125 + }, + { + "epoch": 16.105153410783437, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0033345698794659943, + "loss": 0.8014, + "num_input_tokens_seen": 62781552, + "step": 108130 + }, + { + "epoch": 16.105898123324398, + "grad_norm": 0.044189453125, + "learning_rate": 0.00333334434557898, + "loss": 0.7779, + "num_input_tokens_seen": 62784368, + "step": 108135 + }, + { + "epoch": 16.106642835865355, + "grad_norm": 0.06201171875, + "learning_rate": 0.0033321190087882567, + "loss": 0.7933, + "num_input_tokens_seen": 62787120, + "step": 108140 + }, + { + "epoch": 16.107387548406315, + "grad_norm": 0.049072265625, + "learning_rate": 0.0033308938691145244, + "loss": 0.7901, + "num_input_tokens_seen": 62789712, + "step": 108145 + }, + { + "epoch": 16.108132260947276, + "grad_norm": 0.0400390625, + "learning_rate": 0.0033296689265784785, + "loss": 0.7812, + "num_input_tokens_seen": 62792624, + "step": 108150 + }, + { + "epoch": 16.108876973488233, + "grad_norm": 0.04931640625, + "learning_rate": 0.003328444181200812, + "loss": 0.7863, + "num_input_tokens_seen": 62795440, + "step": 108155 + }, + { + "epoch": 16.109621686029193, + "grad_norm": 0.039306640625, + "learning_rate": 0.0033272196330022233, + "loss": 0.8039, + "num_input_tokens_seen": 62798256, + "step": 108160 + }, + { + "epoch": 16.11036639857015, + "grad_norm": 0.060791015625, + "learning_rate": 0.00332599528200339, + "loss": 0.7866, + "num_input_tokens_seen": 62801232, + "step": 108165 + }, + { + "epoch": 16.11111111111111, + "grad_norm": 0.044921875, + "learning_rate": 0.0033247711282250046, + "loss": 0.8036, + "num_input_tokens_seen": 62804112, + "step": 108170 + }, + { + "epoch": 16.11185582365207, + "grad_norm": 0.03564453125, + "learning_rate": 0.0033235471716877426, + "loss": 0.8039, + "num_input_tokens_seen": 62806928, + "step": 108175 + }, + { + "epoch": 16.11260053619303, + "grad_norm": 0.033935546875, + "learning_rate": 0.0033223234124122878, + "loss": 0.8099, + "num_input_tokens_seen": 62810032, + "step": 108180 + }, + { + "epoch": 16.11334524873399, + "grad_norm": 0.2353515625, + "learning_rate": 0.00332109985041931, + "loss": 0.8279, + "num_input_tokens_seen": 62813136, + "step": 108185 + }, + { + "epoch": 16.11408996127495, + "grad_norm": 0.042236328125, + "learning_rate": 0.0033198764857294743, + "loss": 0.7845, + "num_input_tokens_seen": 62816176, + "step": 108190 + }, + { + "epoch": 16.114834673815906, + "grad_norm": 0.02392578125, + "learning_rate": 0.0033186533183634606, + "loss": 0.8057, + "num_input_tokens_seen": 62818960, + "step": 108195 + }, + { + "epoch": 16.115579386356867, + "grad_norm": 0.07470703125, + "learning_rate": 0.0033174303483419214, + "loss": 0.8078, + "num_input_tokens_seen": 62821712, + "step": 108200 + }, + { + "epoch": 16.116324098897824, + "grad_norm": 0.052001953125, + "learning_rate": 0.0033162075756855277, + "loss": 0.784, + "num_input_tokens_seen": 62824496, + "step": 108205 + }, + { + "epoch": 16.117068811438784, + "grad_norm": 0.062255859375, + "learning_rate": 0.0033149850004149323, + "loss": 0.8022, + "num_input_tokens_seen": 62827408, + "step": 108210 + }, + { + "epoch": 16.117813523979745, + "grad_norm": 0.051025390625, + "learning_rate": 0.003313762622550789, + "loss": 0.7879, + "num_input_tokens_seen": 62830000, + "step": 108215 + }, + { + "epoch": 16.118558236520702, + "grad_norm": 0.0390625, + "learning_rate": 0.003312540442113744, + "loss": 0.779, + "num_input_tokens_seen": 62833040, + "step": 108220 + }, + { + "epoch": 16.119302949061662, + "grad_norm": 0.032470703125, + "learning_rate": 0.0033113184591244543, + "loss": 0.8147, + "num_input_tokens_seen": 62836240, + "step": 108225 + }, + { + "epoch": 16.120047661602623, + "grad_norm": 0.042236328125, + "learning_rate": 0.003310096673603559, + "loss": 0.8573, + "num_input_tokens_seen": 62838896, + "step": 108230 + }, + { + "epoch": 16.12079237414358, + "grad_norm": 0.046630859375, + "learning_rate": 0.0033088750855716957, + "loss": 0.7897, + "num_input_tokens_seen": 62842000, + "step": 108235 + }, + { + "epoch": 16.12153708668454, + "grad_norm": 0.08935546875, + "learning_rate": 0.0033076536950495095, + "loss": 0.817, + "num_input_tokens_seen": 62845072, + "step": 108240 + }, + { + "epoch": 16.122281799225497, + "grad_norm": 0.053955078125, + "learning_rate": 0.0033064325020576266, + "loss": 0.794, + "num_input_tokens_seen": 62847920, + "step": 108245 + }, + { + "epoch": 16.123026511766458, + "grad_norm": 0.027099609375, + "learning_rate": 0.0033052115066166845, + "loss": 0.8003, + "num_input_tokens_seen": 62850768, + "step": 108250 + }, + { + "epoch": 16.12377122430742, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0033039907087473053, + "loss": 0.8202, + "num_input_tokens_seen": 62853456, + "step": 108255 + }, + { + "epoch": 16.124515936848375, + "grad_norm": 0.04248046875, + "learning_rate": 0.00330277010847012, + "loss": 0.7958, + "num_input_tokens_seen": 62856208, + "step": 108260 + }, + { + "epoch": 16.125260649389336, + "grad_norm": 0.037109375, + "learning_rate": 0.003301549705805746, + "loss": 0.7855, + "num_input_tokens_seen": 62858928, + "step": 108265 + }, + { + "epoch": 16.126005361930297, + "grad_norm": 0.0439453125, + "learning_rate": 0.003300329500774793, + "loss": 0.7988, + "num_input_tokens_seen": 62861744, + "step": 108270 + }, + { + "epoch": 16.126750074471254, + "grad_norm": 0.03955078125, + "learning_rate": 0.0032991094933978882, + "loss": 0.8182, + "num_input_tokens_seen": 62864784, + "step": 108275 + }, + { + "epoch": 16.127494787012214, + "grad_norm": 0.040771484375, + "learning_rate": 0.0032978896836956366, + "loss": 0.7918, + "num_input_tokens_seen": 62867408, + "step": 108280 + }, + { + "epoch": 16.12823949955317, + "grad_norm": 0.06298828125, + "learning_rate": 0.0032966700716886435, + "loss": 0.7799, + "num_input_tokens_seen": 62870256, + "step": 108285 + }, + { + "epoch": 16.12898421209413, + "grad_norm": 0.059814453125, + "learning_rate": 0.003295450657397513, + "loss": 0.791, + "num_input_tokens_seen": 62873424, + "step": 108290 + }, + { + "epoch": 16.129728924635092, + "grad_norm": 0.054931640625, + "learning_rate": 0.003294231440842849, + "loss": 0.8151, + "num_input_tokens_seen": 62876368, + "step": 108295 + }, + { + "epoch": 16.13047363717605, + "grad_norm": 0.04052734375, + "learning_rate": 0.003293012422045246, + "loss": 0.7973, + "num_input_tokens_seen": 62879312, + "step": 108300 + }, + { + "epoch": 16.13121834971701, + "grad_norm": 0.061279296875, + "learning_rate": 0.0032917936010253023, + "loss": 0.8065, + "num_input_tokens_seen": 62882000, + "step": 108305 + }, + { + "epoch": 16.13196306225797, + "grad_norm": 0.039306640625, + "learning_rate": 0.0032905749778036064, + "loss": 0.8058, + "num_input_tokens_seen": 62884848, + "step": 108310 + }, + { + "epoch": 16.132707774798927, + "grad_norm": 0.03955078125, + "learning_rate": 0.0032893565524007417, + "loss": 0.8048, + "num_input_tokens_seen": 62887664, + "step": 108315 + }, + { + "epoch": 16.133452487339888, + "grad_norm": 0.04345703125, + "learning_rate": 0.0032881383248373, + "loss": 0.778, + "num_input_tokens_seen": 62890704, + "step": 108320 + }, + { + "epoch": 16.134197199880845, + "grad_norm": 0.043701171875, + "learning_rate": 0.0032869202951338523, + "loss": 0.7728, + "num_input_tokens_seen": 62893584, + "step": 108325 + }, + { + "epoch": 16.134941912421805, + "grad_norm": 0.047119140625, + "learning_rate": 0.0032857024633109874, + "loss": 0.7905, + "num_input_tokens_seen": 62896432, + "step": 108330 + }, + { + "epoch": 16.135686624962766, + "grad_norm": 0.039794921875, + "learning_rate": 0.0032844848293892685, + "loss": 0.7839, + "num_input_tokens_seen": 62899824, + "step": 108335 + }, + { + "epoch": 16.136431337503723, + "grad_norm": 0.039306640625, + "learning_rate": 0.003283267393389275, + "loss": 0.7929, + "num_input_tokens_seen": 62902512, + "step": 108340 + }, + { + "epoch": 16.137176050044683, + "grad_norm": 0.039306640625, + "learning_rate": 0.0032820501553315726, + "loss": 0.798, + "num_input_tokens_seen": 62905776, + "step": 108345 + }, + { + "epoch": 16.13792076258564, + "grad_norm": 0.04931640625, + "learning_rate": 0.0032808331152367216, + "loss": 0.7805, + "num_input_tokens_seen": 62908560, + "step": 108350 + }, + { + "epoch": 16.1386654751266, + "grad_norm": 0.055908203125, + "learning_rate": 0.0032796162731252845, + "loss": 0.795, + "num_input_tokens_seen": 62911504, + "step": 108355 + }, + { + "epoch": 16.13941018766756, + "grad_norm": 0.0478515625, + "learning_rate": 0.0032783996290178144, + "loss": 0.8073, + "num_input_tokens_seen": 62914320, + "step": 108360 + }, + { + "epoch": 16.14015490020852, + "grad_norm": 0.058349609375, + "learning_rate": 0.003277183182934873, + "loss": 0.804, + "num_input_tokens_seen": 62917168, + "step": 108365 + }, + { + "epoch": 16.14089961274948, + "grad_norm": 0.039306640625, + "learning_rate": 0.003275966934897003, + "loss": 0.7964, + "num_input_tokens_seen": 62919952, + "step": 108370 + }, + { + "epoch": 16.14164432529044, + "grad_norm": 0.03955078125, + "learning_rate": 0.0032747508849247605, + "loss": 0.8041, + "num_input_tokens_seen": 62922736, + "step": 108375 + }, + { + "epoch": 16.142389037831396, + "grad_norm": 0.0625, + "learning_rate": 0.003273535033038681, + "loss": 0.7852, + "num_input_tokens_seen": 62925552, + "step": 108380 + }, + { + "epoch": 16.143133750372357, + "grad_norm": 0.051025390625, + "learning_rate": 0.003272319379259312, + "loss": 0.7931, + "num_input_tokens_seen": 62928208, + "step": 108385 + }, + { + "epoch": 16.143878462913314, + "grad_norm": 0.045166015625, + "learning_rate": 0.0032711039236071873, + "loss": 0.7859, + "num_input_tokens_seen": 62931184, + "step": 108390 + }, + { + "epoch": 16.144623175454274, + "grad_norm": 0.03857421875, + "learning_rate": 0.003269888666102838, + "loss": 0.7854, + "num_input_tokens_seen": 62933936, + "step": 108395 + }, + { + "epoch": 16.145367887995235, + "grad_norm": 0.0263671875, + "learning_rate": 0.003268673606766802, + "loss": 0.8046, + "num_input_tokens_seen": 62937264, + "step": 108400 + }, + { + "epoch": 16.146112600536192, + "grad_norm": 0.033935546875, + "learning_rate": 0.003267458745619603, + "loss": 0.7911, + "num_input_tokens_seen": 62940240, + "step": 108405 + }, + { + "epoch": 16.146857313077152, + "grad_norm": 0.0615234375, + "learning_rate": 0.003266244082681758, + "loss": 0.7943, + "num_input_tokens_seen": 62943088, + "step": 108410 + }, + { + "epoch": 16.147602025618113, + "grad_norm": 0.047607421875, + "learning_rate": 0.0032650296179737985, + "loss": 0.8012, + "num_input_tokens_seen": 62945968, + "step": 108415 + }, + { + "epoch": 16.14834673815907, + "grad_norm": 0.0400390625, + "learning_rate": 0.003263815351516237, + "loss": 0.794, + "num_input_tokens_seen": 62948624, + "step": 108420 + }, + { + "epoch": 16.14909145070003, + "grad_norm": 0.028076171875, + "learning_rate": 0.0032626012833295867, + "loss": 0.8064, + "num_input_tokens_seen": 62951728, + "step": 108425 + }, + { + "epoch": 16.149836163240987, + "grad_norm": 0.06787109375, + "learning_rate": 0.0032613874134343533, + "loss": 0.8068, + "num_input_tokens_seen": 62954704, + "step": 108430 + }, + { + "epoch": 16.150580875781948, + "grad_norm": 0.0625, + "learning_rate": 0.003260173741851055, + "loss": 0.8066, + "num_input_tokens_seen": 62957392, + "step": 108435 + }, + { + "epoch": 16.15132558832291, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0032589602686001837, + "loss": 0.8011, + "num_input_tokens_seen": 62960144, + "step": 108440 + }, + { + "epoch": 16.152070300863866, + "grad_norm": 0.042236328125, + "learning_rate": 0.00325774699370225, + "loss": 0.798, + "num_input_tokens_seen": 62963120, + "step": 108445 + }, + { + "epoch": 16.152815013404826, + "grad_norm": 0.043701171875, + "learning_rate": 0.003256533917177744, + "loss": 0.7965, + "num_input_tokens_seen": 62966064, + "step": 108450 + }, + { + "epoch": 16.153559725945787, + "grad_norm": 0.091796875, + "learning_rate": 0.003255321039047164, + "loss": 0.8214, + "num_input_tokens_seen": 62969168, + "step": 108455 + }, + { + "epoch": 16.154304438486744, + "grad_norm": 0.049072265625, + "learning_rate": 0.003254108359330996, + "loss": 0.7907, + "num_input_tokens_seen": 62971952, + "step": 108460 + }, + { + "epoch": 16.155049151027704, + "grad_norm": 0.060302734375, + "learning_rate": 0.0032528958780497328, + "loss": 0.8034, + "num_input_tokens_seen": 62974704, + "step": 108465 + }, + { + "epoch": 16.15579386356866, + "grad_norm": 0.039306640625, + "learning_rate": 0.0032516835952238537, + "loss": 0.7904, + "num_input_tokens_seen": 62977680, + "step": 108470 + }, + { + "epoch": 16.15653857610962, + "grad_norm": 0.03515625, + "learning_rate": 0.00325047151087384, + "loss": 0.8011, + "num_input_tokens_seen": 62980368, + "step": 108475 + }, + { + "epoch": 16.157283288650582, + "grad_norm": 0.034912109375, + "learning_rate": 0.0032492596250201693, + "loss": 0.8097, + "num_input_tokens_seen": 62983696, + "step": 108480 + }, + { + "epoch": 16.15802800119154, + "grad_norm": 0.05419921875, + "learning_rate": 0.00324804793768331, + "loss": 0.7891, + "num_input_tokens_seen": 62986544, + "step": 108485 + }, + { + "epoch": 16.1587727137325, + "grad_norm": 0.048095703125, + "learning_rate": 0.0032468364488837404, + "loss": 0.7966, + "num_input_tokens_seen": 62989328, + "step": 108490 + }, + { + "epoch": 16.15951742627346, + "grad_norm": 0.0625, + "learning_rate": 0.0032456251586419193, + "loss": 0.7979, + "num_input_tokens_seen": 62992464, + "step": 108495 + }, + { + "epoch": 16.160262138814417, + "grad_norm": 0.040283203125, + "learning_rate": 0.003244414066978319, + "loss": 0.7899, + "num_input_tokens_seen": 62995632, + "step": 108500 + }, + { + "epoch": 16.161006851355378, + "grad_norm": 0.064453125, + "learning_rate": 0.0032432031739133907, + "loss": 0.7992, + "num_input_tokens_seen": 62998544, + "step": 108505 + }, + { + "epoch": 16.161751563896335, + "grad_norm": 0.0322265625, + "learning_rate": 0.003241992479467599, + "loss": 0.8083, + "num_input_tokens_seen": 63001584, + "step": 108510 + }, + { + "epoch": 16.162496276437295, + "grad_norm": 0.08203125, + "learning_rate": 0.0032407819836613953, + "loss": 0.804, + "num_input_tokens_seen": 63004208, + "step": 108515 + }, + { + "epoch": 16.163240988978256, + "grad_norm": 0.0458984375, + "learning_rate": 0.003239571686515225, + "loss": 0.7981, + "num_input_tokens_seen": 63007056, + "step": 108520 + }, + { + "epoch": 16.163985701519213, + "grad_norm": 0.0208740234375, + "learning_rate": 0.0032383615880495417, + "loss": 0.815, + "num_input_tokens_seen": 63010192, + "step": 108525 + }, + { + "epoch": 16.164730414060173, + "grad_norm": 0.04296875, + "learning_rate": 0.003237151688284781, + "loss": 0.8179, + "num_input_tokens_seen": 63013008, + "step": 108530 + }, + { + "epoch": 16.16547512660113, + "grad_norm": 0.03564453125, + "learning_rate": 0.003235941987241391, + "loss": 0.7986, + "num_input_tokens_seen": 63016112, + "step": 108535 + }, + { + "epoch": 16.16621983914209, + "grad_norm": 0.037353515625, + "learning_rate": 0.0032347324849398046, + "loss": 0.8112, + "num_input_tokens_seen": 63019088, + "step": 108540 + }, + { + "epoch": 16.16696455168305, + "grad_norm": 0.02880859375, + "learning_rate": 0.003233523181400456, + "loss": 0.7891, + "num_input_tokens_seen": 63022160, + "step": 108545 + }, + { + "epoch": 16.16770926422401, + "grad_norm": 0.07666015625, + "learning_rate": 0.0032323140766437747, + "loss": 0.7749, + "num_input_tokens_seen": 63024976, + "step": 108550 + }, + { + "epoch": 16.16845397676497, + "grad_norm": 0.02490234375, + "learning_rate": 0.003231105170690182, + "loss": 0.7974, + "num_input_tokens_seen": 63027696, + "step": 108555 + }, + { + "epoch": 16.16919868930593, + "grad_norm": 0.058837890625, + "learning_rate": 0.0032298964635601108, + "loss": 0.8024, + "num_input_tokens_seen": 63030480, + "step": 108560 + }, + { + "epoch": 16.169943401846886, + "grad_norm": 0.0294189453125, + "learning_rate": 0.003228687955273972, + "loss": 0.818, + "num_input_tokens_seen": 63033392, + "step": 108565 + }, + { + "epoch": 16.170688114387847, + "grad_norm": 0.043212890625, + "learning_rate": 0.0032274796458521904, + "loss": 0.7901, + "num_input_tokens_seen": 63036496, + "step": 108570 + }, + { + "epoch": 16.171432826928804, + "grad_norm": 0.049072265625, + "learning_rate": 0.0032262715353151704, + "loss": 0.7935, + "num_input_tokens_seen": 63039472, + "step": 108575 + }, + { + "epoch": 16.172177539469764, + "grad_norm": 0.05908203125, + "learning_rate": 0.0032250636236833313, + "loss": 0.7932, + "num_input_tokens_seen": 63042544, + "step": 108580 + }, + { + "epoch": 16.172922252010725, + "grad_norm": 0.07177734375, + "learning_rate": 0.0032238559109770697, + "loss": 0.7952, + "num_input_tokens_seen": 63045520, + "step": 108585 + }, + { + "epoch": 16.173666964551682, + "grad_norm": 0.037109375, + "learning_rate": 0.0032226483972167996, + "loss": 0.8098, + "num_input_tokens_seen": 63048464, + "step": 108590 + }, + { + "epoch": 16.174411677092642, + "grad_norm": 0.058837890625, + "learning_rate": 0.003221441082422913, + "loss": 0.8116, + "num_input_tokens_seen": 63051440, + "step": 108595 + }, + { + "epoch": 16.175156389633603, + "grad_norm": 0.078125, + "learning_rate": 0.0032202339666158046, + "loss": 0.8018, + "num_input_tokens_seen": 63054192, + "step": 108600 + }, + { + "epoch": 16.17590110217456, + "grad_norm": 0.049072265625, + "learning_rate": 0.0032190270498158756, + "loss": 0.7972, + "num_input_tokens_seen": 63057296, + "step": 108605 + }, + { + "epoch": 16.17664581471552, + "grad_norm": 0.0576171875, + "learning_rate": 0.003217820332043511, + "loss": 0.7902, + "num_input_tokens_seen": 63059728, + "step": 108610 + }, + { + "epoch": 16.177390527256478, + "grad_norm": 0.0439453125, + "learning_rate": 0.003216613813319096, + "loss": 0.7921, + "num_input_tokens_seen": 63062640, + "step": 108615 + }, + { + "epoch": 16.178135239797438, + "grad_norm": 0.056884765625, + "learning_rate": 0.0032154074936630115, + "loss": 0.7985, + "num_input_tokens_seen": 63065392, + "step": 108620 + }, + { + "epoch": 16.1788799523384, + "grad_norm": 0.04833984375, + "learning_rate": 0.003214201373095644, + "loss": 0.8091, + "num_input_tokens_seen": 63068272, + "step": 108625 + }, + { + "epoch": 16.179624664879356, + "grad_norm": 0.05224609375, + "learning_rate": 0.0032129954516373657, + "loss": 0.7999, + "num_input_tokens_seen": 63070928, + "step": 108630 + }, + { + "epoch": 16.180369377420316, + "grad_norm": 0.049072265625, + "learning_rate": 0.0032117897293085437, + "loss": 0.799, + "num_input_tokens_seen": 63073904, + "step": 108635 + }, + { + "epoch": 16.181114089961277, + "grad_norm": 0.04638671875, + "learning_rate": 0.0032105842061295603, + "loss": 0.7817, + "num_input_tokens_seen": 63077136, + "step": 108640 + }, + { + "epoch": 16.181858802502234, + "grad_norm": 0.022216796875, + "learning_rate": 0.0032093788821207685, + "loss": 0.7905, + "num_input_tokens_seen": 63079952, + "step": 108645 + }, + { + "epoch": 16.182603515043194, + "grad_norm": 0.036865234375, + "learning_rate": 0.003208173757302541, + "loss": 0.8059, + "num_input_tokens_seen": 63082768, + "step": 108650 + }, + { + "epoch": 16.18334822758415, + "grad_norm": 0.0439453125, + "learning_rate": 0.0032069688316952293, + "loss": 0.8129, + "num_input_tokens_seen": 63085776, + "step": 108655 + }, + { + "epoch": 16.18409294012511, + "grad_norm": 0.06689453125, + "learning_rate": 0.0032057641053191994, + "loss": 0.7834, + "num_input_tokens_seen": 63088432, + "step": 108660 + }, + { + "epoch": 16.184837652666072, + "grad_norm": 0.041259765625, + "learning_rate": 0.003204559578194791, + "loss": 0.8042, + "num_input_tokens_seen": 63091440, + "step": 108665 + }, + { + "epoch": 16.18558236520703, + "grad_norm": 0.049560546875, + "learning_rate": 0.003203355250342365, + "loss": 0.8094, + "num_input_tokens_seen": 63094480, + "step": 108670 + }, + { + "epoch": 16.18632707774799, + "grad_norm": 0.04248046875, + "learning_rate": 0.0032021511217822634, + "loss": 0.7943, + "num_input_tokens_seen": 63097488, + "step": 108675 + }, + { + "epoch": 16.187071790288947, + "grad_norm": 0.048828125, + "learning_rate": 0.003200947192534826, + "loss": 0.8038, + "num_input_tokens_seen": 63100528, + "step": 108680 + }, + { + "epoch": 16.187816502829907, + "grad_norm": 0.043701171875, + "learning_rate": 0.003199743462620396, + "loss": 0.7923, + "num_input_tokens_seen": 63103760, + "step": 108685 + }, + { + "epoch": 16.188561215370868, + "grad_norm": 0.0458984375, + "learning_rate": 0.0031985399320593015, + "loss": 0.8031, + "num_input_tokens_seen": 63106544, + "step": 108690 + }, + { + "epoch": 16.189305927911825, + "grad_norm": 0.058349609375, + "learning_rate": 0.003197336600871884, + "loss": 0.7934, + "num_input_tokens_seen": 63109296, + "step": 108695 + }, + { + "epoch": 16.190050640452785, + "grad_norm": 0.09619140625, + "learning_rate": 0.003196133469078467, + "loss": 0.8033, + "num_input_tokens_seen": 63112016, + "step": 108700 + }, + { + "epoch": 16.190795352993746, + "grad_norm": 0.059326171875, + "learning_rate": 0.003194930536699381, + "loss": 0.7796, + "num_input_tokens_seen": 63114768, + "step": 108705 + }, + { + "epoch": 16.191540065534703, + "grad_norm": 0.041259765625, + "learning_rate": 0.003193727803754944, + "loss": 0.7855, + "num_input_tokens_seen": 63117424, + "step": 108710 + }, + { + "epoch": 16.192284778075663, + "grad_norm": 0.040283203125, + "learning_rate": 0.003192525270265475, + "loss": 0.7917, + "num_input_tokens_seen": 63120432, + "step": 108715 + }, + { + "epoch": 16.19302949061662, + "grad_norm": 0.056884765625, + "learning_rate": 0.0031913229362512934, + "loss": 0.8166, + "num_input_tokens_seen": 63123248, + "step": 108720 + }, + { + "epoch": 16.19377420315758, + "grad_norm": 0.038818359375, + "learning_rate": 0.0031901208017327052, + "loss": 0.799, + "num_input_tokens_seen": 63126192, + "step": 108725 + }, + { + "epoch": 16.19451891569854, + "grad_norm": 0.1982421875, + "learning_rate": 0.003188918866730026, + "loss": 0.8248, + "num_input_tokens_seen": 63128912, + "step": 108730 + }, + { + "epoch": 16.1952636282395, + "grad_norm": 0.0439453125, + "learning_rate": 0.003187717131263559, + "loss": 0.7964, + "num_input_tokens_seen": 63131696, + "step": 108735 + }, + { + "epoch": 16.19600834078046, + "grad_norm": 0.046142578125, + "learning_rate": 0.0031865155953536024, + "loss": 0.8162, + "num_input_tokens_seen": 63134704, + "step": 108740 + }, + { + "epoch": 16.19675305332142, + "grad_norm": 0.052734375, + "learning_rate": 0.0031853142590204616, + "loss": 0.79, + "num_input_tokens_seen": 63137648, + "step": 108745 + }, + { + "epoch": 16.197497765862376, + "grad_norm": 0.042724609375, + "learning_rate": 0.003184113122284429, + "loss": 0.7797, + "num_input_tokens_seen": 63140880, + "step": 108750 + }, + { + "epoch": 16.198242478403337, + "grad_norm": 0.03662109375, + "learning_rate": 0.0031829121851657963, + "loss": 0.7889, + "num_input_tokens_seen": 63143568, + "step": 108755 + }, + { + "epoch": 16.198987190944294, + "grad_norm": 0.0595703125, + "learning_rate": 0.0031817114476848474, + "loss": 0.7763, + "num_input_tokens_seen": 63146544, + "step": 108760 + }, + { + "epoch": 16.199731903485254, + "grad_norm": 0.03955078125, + "learning_rate": 0.0031805109098618748, + "loss": 0.7837, + "num_input_tokens_seen": 63149648, + "step": 108765 + }, + { + "epoch": 16.200476616026215, + "grad_norm": 0.031005859375, + "learning_rate": 0.003179310571717155, + "loss": 0.7911, + "num_input_tokens_seen": 63152560, + "step": 108770 + }, + { + "epoch": 16.201221328567172, + "grad_norm": 0.045166015625, + "learning_rate": 0.0031781104332709725, + "loss": 0.8134, + "num_input_tokens_seen": 63155344, + "step": 108775 + }, + { + "epoch": 16.201966041108133, + "grad_norm": 0.05810546875, + "learning_rate": 0.0031769104945435947, + "loss": 0.824, + "num_input_tokens_seen": 63158352, + "step": 108780 + }, + { + "epoch": 16.202710753649093, + "grad_norm": 0.05517578125, + "learning_rate": 0.0031757107555553028, + "loss": 0.7922, + "num_input_tokens_seen": 63161008, + "step": 108785 + }, + { + "epoch": 16.20345546619005, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0031745112163263593, + "loss": 0.8089, + "num_input_tokens_seen": 63163888, + "step": 108790 + }, + { + "epoch": 16.20420017873101, + "grad_norm": 0.045654296875, + "learning_rate": 0.0031733118768770257, + "loss": 0.7916, + "num_input_tokens_seen": 63166704, + "step": 108795 + }, + { + "epoch": 16.204944891271968, + "grad_norm": 0.044189453125, + "learning_rate": 0.003172112737227572, + "loss": 0.793, + "num_input_tokens_seen": 63169680, + "step": 108800 + }, + { + "epoch": 16.205689603812928, + "grad_norm": 0.0419921875, + "learning_rate": 0.003170913797398252, + "loss": 0.8311, + "num_input_tokens_seen": 63172592, + "step": 108805 + }, + { + "epoch": 16.20643431635389, + "grad_norm": 0.06884765625, + "learning_rate": 0.0031697150574093213, + "loss": 0.7917, + "num_input_tokens_seen": 63175344, + "step": 108810 + }, + { + "epoch": 16.207179028894846, + "grad_norm": 0.057373046875, + "learning_rate": 0.0031685165172810267, + "loss": 0.7878, + "num_input_tokens_seen": 63178320, + "step": 108815 + }, + { + "epoch": 16.207923741435806, + "grad_norm": 0.031982421875, + "learning_rate": 0.003167318177033626, + "loss": 0.7982, + "num_input_tokens_seen": 63181200, + "step": 108820 + }, + { + "epoch": 16.208668453976767, + "grad_norm": 0.044677734375, + "learning_rate": 0.0031661200366873525, + "loss": 0.8064, + "num_input_tokens_seen": 63184176, + "step": 108825 + }, + { + "epoch": 16.209413166517724, + "grad_norm": 0.054931640625, + "learning_rate": 0.0031649220962624586, + "loss": 0.7864, + "num_input_tokens_seen": 63187248, + "step": 108830 + }, + { + "epoch": 16.210157879058684, + "grad_norm": 0.055419921875, + "learning_rate": 0.003163724355779176, + "loss": 0.7646, + "num_input_tokens_seen": 63190256, + "step": 108835 + }, + { + "epoch": 16.21090259159964, + "grad_norm": 0.0223388671875, + "learning_rate": 0.003162526815257739, + "loss": 0.8021, + "num_input_tokens_seen": 63192944, + "step": 108840 + }, + { + "epoch": 16.2116473041406, + "grad_norm": 0.0390625, + "learning_rate": 0.0031613294747183827, + "loss": 0.8016, + "num_input_tokens_seen": 63195664, + "step": 108845 + }, + { + "epoch": 16.212392016681562, + "grad_norm": 0.054931640625, + "learning_rate": 0.003160132334181328, + "loss": 0.7848, + "num_input_tokens_seen": 63198608, + "step": 108850 + }, + { + "epoch": 16.21313672922252, + "grad_norm": 0.046630859375, + "learning_rate": 0.0031589353936668082, + "loss": 0.8066, + "num_input_tokens_seen": 63201520, + "step": 108855 + }, + { + "epoch": 16.21388144176348, + "grad_norm": 0.0439453125, + "learning_rate": 0.0031577386531950378, + "loss": 0.8017, + "num_input_tokens_seen": 63204464, + "step": 108860 + }, + { + "epoch": 16.214626154304437, + "grad_norm": 0.03662109375, + "learning_rate": 0.00315654211278624, + "loss": 0.7818, + "num_input_tokens_seen": 63207696, + "step": 108865 + }, + { + "epoch": 16.215370866845397, + "grad_norm": 0.032470703125, + "learning_rate": 0.003155345772460626, + "loss": 0.8066, + "num_input_tokens_seen": 63210768, + "step": 108870 + }, + { + "epoch": 16.216115579386358, + "grad_norm": 0.031982421875, + "learning_rate": 0.003154149632238407, + "loss": 0.821, + "num_input_tokens_seen": 63213968, + "step": 108875 + }, + { + "epoch": 16.216860291927315, + "grad_norm": 0.032958984375, + "learning_rate": 0.003152953692139789, + "loss": 0.7807, + "num_input_tokens_seen": 63216656, + "step": 108880 + }, + { + "epoch": 16.217605004468275, + "grad_norm": 0.035400390625, + "learning_rate": 0.003151757952184975, + "loss": 0.7927, + "num_input_tokens_seen": 63219344, + "step": 108885 + }, + { + "epoch": 16.218349717009236, + "grad_norm": 0.043212890625, + "learning_rate": 0.003150562412394172, + "loss": 0.7886, + "num_input_tokens_seen": 63222448, + "step": 108890 + }, + { + "epoch": 16.219094429550193, + "grad_norm": 0.051513671875, + "learning_rate": 0.0031493670727875704, + "loss": 0.8222, + "num_input_tokens_seen": 63225584, + "step": 108895 + }, + { + "epoch": 16.219839142091153, + "grad_norm": 0.039794921875, + "learning_rate": 0.003148171933385372, + "loss": 0.8073, + "num_input_tokens_seen": 63228656, + "step": 108900 + }, + { + "epoch": 16.22058385463211, + "grad_norm": 0.06298828125, + "learning_rate": 0.0031469769942077566, + "loss": 0.7992, + "num_input_tokens_seen": 63231472, + "step": 108905 + }, + { + "epoch": 16.22132856717307, + "grad_norm": 0.04638671875, + "learning_rate": 0.003145782255274924, + "loss": 0.8082, + "num_input_tokens_seen": 63234416, + "step": 108910 + }, + { + "epoch": 16.22207327971403, + "grad_norm": 0.04541015625, + "learning_rate": 0.0031445877166070527, + "loss": 0.7801, + "num_input_tokens_seen": 63237168, + "step": 108915 + }, + { + "epoch": 16.22281799225499, + "grad_norm": 0.07763671875, + "learning_rate": 0.0031433933782243164, + "loss": 0.7696, + "num_input_tokens_seen": 63239920, + "step": 108920 + }, + { + "epoch": 16.22356270479595, + "grad_norm": 0.045166015625, + "learning_rate": 0.0031421992401469046, + "loss": 0.8097, + "num_input_tokens_seen": 63242800, + "step": 108925 + }, + { + "epoch": 16.22430741733691, + "grad_norm": 0.03466796875, + "learning_rate": 0.0031410053023949802, + "loss": 0.8009, + "num_input_tokens_seen": 63246288, + "step": 108930 + }, + { + "epoch": 16.225052129877866, + "grad_norm": 0.036376953125, + "learning_rate": 0.0031398115649887226, + "loss": 0.7934, + "num_input_tokens_seen": 63249200, + "step": 108935 + }, + { + "epoch": 16.225796842418827, + "grad_norm": 0.037841796875, + "learning_rate": 0.003138618027948294, + "loss": 0.8011, + "num_input_tokens_seen": 63251984, + "step": 108940 + }, + { + "epoch": 16.226541554959784, + "grad_norm": 0.078125, + "learning_rate": 0.003137424691293858, + "loss": 0.8048, + "num_input_tokens_seen": 63254864, + "step": 108945 + }, + { + "epoch": 16.227286267500745, + "grad_norm": 0.036865234375, + "learning_rate": 0.003136231555045575, + "loss": 0.7876, + "num_input_tokens_seen": 63257616, + "step": 108950 + }, + { + "epoch": 16.228030980041705, + "grad_norm": 0.055419921875, + "learning_rate": 0.0031350386192236, + "loss": 0.8115, + "num_input_tokens_seen": 63260336, + "step": 108955 + }, + { + "epoch": 16.228775692582662, + "grad_norm": 0.033935546875, + "learning_rate": 0.0031338458838480908, + "loss": 0.7999, + "num_input_tokens_seen": 63263120, + "step": 108960 + }, + { + "epoch": 16.229520405123623, + "grad_norm": 0.068359375, + "learning_rate": 0.0031326533489391913, + "loss": 0.7947, + "num_input_tokens_seen": 63265840, + "step": 108965 + }, + { + "epoch": 16.230265117664583, + "grad_norm": 0.04931640625, + "learning_rate": 0.003131461014517058, + "loss": 0.806, + "num_input_tokens_seen": 63268400, + "step": 108970 + }, + { + "epoch": 16.23100983020554, + "grad_norm": 0.036376953125, + "learning_rate": 0.003130268880601822, + "loss": 0.7937, + "num_input_tokens_seen": 63271280, + "step": 108975 + }, + { + "epoch": 16.2317545427465, + "grad_norm": 0.05224609375, + "learning_rate": 0.003129076947213634, + "loss": 0.8124, + "num_input_tokens_seen": 63273936, + "step": 108980 + }, + { + "epoch": 16.232499255287458, + "grad_norm": 0.029052734375, + "learning_rate": 0.003127885214372622, + "loss": 0.8055, + "num_input_tokens_seen": 63276656, + "step": 108985 + }, + { + "epoch": 16.233243967828418, + "grad_norm": 0.05908203125, + "learning_rate": 0.0031266936820989275, + "loss": 0.8, + "num_input_tokens_seen": 63279536, + "step": 108990 + }, + { + "epoch": 16.23398868036938, + "grad_norm": 0.05810546875, + "learning_rate": 0.003125502350412675, + "loss": 0.8375, + "num_input_tokens_seen": 63282320, + "step": 108995 + }, + { + "epoch": 16.234733392910336, + "grad_norm": 0.038818359375, + "learning_rate": 0.003124311219333988, + "loss": 0.7958, + "num_input_tokens_seen": 63285264, + "step": 109000 + }, + { + "epoch": 16.235478105451296, + "grad_norm": 0.0966796875, + "learning_rate": 0.0031231202888829966, + "loss": 0.8037, + "num_input_tokens_seen": 63287856, + "step": 109005 + }, + { + "epoch": 16.236222817992257, + "grad_norm": 0.061767578125, + "learning_rate": 0.0031219295590798174, + "loss": 0.7907, + "num_input_tokens_seen": 63290320, + "step": 109010 + }, + { + "epoch": 16.236967530533214, + "grad_norm": 0.046142578125, + "learning_rate": 0.003120739029944565, + "loss": 0.7898, + "num_input_tokens_seen": 63293136, + "step": 109015 + }, + { + "epoch": 16.237712243074174, + "grad_norm": 0.04931640625, + "learning_rate": 0.003119548701497349, + "loss": 0.787, + "num_input_tokens_seen": 63296304, + "step": 109020 + }, + { + "epoch": 16.23845695561513, + "grad_norm": 0.043701171875, + "learning_rate": 0.0031183585737582886, + "loss": 0.786, + "num_input_tokens_seen": 63299344, + "step": 109025 + }, + { + "epoch": 16.239201668156092, + "grad_norm": 0.0595703125, + "learning_rate": 0.0031171686467474833, + "loss": 0.784, + "num_input_tokens_seen": 63302384, + "step": 109030 + }, + { + "epoch": 16.239946380697052, + "grad_norm": 0.05126953125, + "learning_rate": 0.003115978920485033, + "loss": 0.7871, + "num_input_tokens_seen": 63305264, + "step": 109035 + }, + { + "epoch": 16.24069109323801, + "grad_norm": 0.054443359375, + "learning_rate": 0.0031147893949910444, + "loss": 0.7995, + "num_input_tokens_seen": 63308240, + "step": 109040 + }, + { + "epoch": 16.24143580577897, + "grad_norm": 0.024658203125, + "learning_rate": 0.003113600070285605, + "loss": 0.8143, + "num_input_tokens_seen": 63311280, + "step": 109045 + }, + { + "epoch": 16.242180518319927, + "grad_norm": 0.043701171875, + "learning_rate": 0.003112410946388815, + "loss": 0.8015, + "num_input_tokens_seen": 63314128, + "step": 109050 + }, + { + "epoch": 16.242925230860887, + "grad_norm": 0.06005859375, + "learning_rate": 0.003111222023320757, + "loss": 0.7954, + "num_input_tokens_seen": 63316848, + "step": 109055 + }, + { + "epoch": 16.243669943401848, + "grad_norm": 0.02783203125, + "learning_rate": 0.0031100333011015234, + "loss": 0.786, + "num_input_tokens_seen": 63319696, + "step": 109060 + }, + { + "epoch": 16.244414655942805, + "grad_norm": 0.05322265625, + "learning_rate": 0.0031088447797511915, + "loss": 0.8045, + "num_input_tokens_seen": 63322512, + "step": 109065 + }, + { + "epoch": 16.245159368483765, + "grad_norm": 0.048583984375, + "learning_rate": 0.0031076564592898395, + "loss": 0.7903, + "num_input_tokens_seen": 63325328, + "step": 109070 + }, + { + "epoch": 16.245904081024726, + "grad_norm": 0.095703125, + "learning_rate": 0.0031064683397375474, + "loss": 0.8161, + "num_input_tokens_seen": 63328080, + "step": 109075 + }, + { + "epoch": 16.246648793565683, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0031052804211143853, + "loss": 0.7972, + "num_input_tokens_seen": 63331184, + "step": 109080 + }, + { + "epoch": 16.247393506106643, + "grad_norm": 0.03662109375, + "learning_rate": 0.003104092703440422, + "loss": 0.7999, + "num_input_tokens_seen": 63333936, + "step": 109085 + }, + { + "epoch": 16.2481382186476, + "grad_norm": 0.04833984375, + "learning_rate": 0.0031029051867357163, + "loss": 0.7954, + "num_input_tokens_seen": 63336848, + "step": 109090 + }, + { + "epoch": 16.24888293118856, + "grad_norm": 0.042236328125, + "learning_rate": 0.0031017178710203413, + "loss": 0.7891, + "num_input_tokens_seen": 63339888, + "step": 109095 + }, + { + "epoch": 16.24962764372952, + "grad_norm": 0.06298828125, + "learning_rate": 0.003100530756314345, + "loss": 0.7948, + "num_input_tokens_seen": 63342960, + "step": 109100 + }, + { + "epoch": 16.25037235627048, + "grad_norm": 0.041259765625, + "learning_rate": 0.0030993438426377903, + "loss": 0.7971, + "num_input_tokens_seen": 63346224, + "step": 109105 + }, + { + "epoch": 16.25111706881144, + "grad_norm": 0.034423828125, + "learning_rate": 0.0030981571300107247, + "loss": 0.7982, + "num_input_tokens_seen": 63349136, + "step": 109110 + }, + { + "epoch": 16.2518617813524, + "grad_norm": 0.047607421875, + "learning_rate": 0.0030969706184532015, + "loss": 0.7919, + "num_input_tokens_seen": 63352240, + "step": 109115 + }, + { + "epoch": 16.252606493893357, + "grad_norm": 0.055419921875, + "learning_rate": 0.0030957843079852596, + "loss": 0.7915, + "num_input_tokens_seen": 63354928, + "step": 109120 + }, + { + "epoch": 16.253351206434317, + "grad_norm": 0.048828125, + "learning_rate": 0.003094598198626942, + "loss": 0.8208, + "num_input_tokens_seen": 63357744, + "step": 109125 + }, + { + "epoch": 16.254095918975274, + "grad_norm": 0.027587890625, + "learning_rate": 0.003093412290398289, + "loss": 0.7868, + "num_input_tokens_seen": 63360464, + "step": 109130 + }, + { + "epoch": 16.254840631516235, + "grad_norm": 0.05810546875, + "learning_rate": 0.003092226583319336, + "loss": 0.8369, + "num_input_tokens_seen": 63363600, + "step": 109135 + }, + { + "epoch": 16.255585344057195, + "grad_norm": 0.044189453125, + "learning_rate": 0.0030910410774101127, + "loss": 0.7954, + "num_input_tokens_seen": 63366512, + "step": 109140 + }, + { + "epoch": 16.256330056598152, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0030898557726906405, + "loss": 0.8446, + "num_input_tokens_seen": 63369488, + "step": 109145 + }, + { + "epoch": 16.257074769139113, + "grad_norm": 0.042724609375, + "learning_rate": 0.0030886706691809554, + "loss": 0.814, + "num_input_tokens_seen": 63372368, + "step": 109150 + }, + { + "epoch": 16.257819481680073, + "grad_norm": 0.03759765625, + "learning_rate": 0.0030874857669010735, + "loss": 0.786, + "num_input_tokens_seen": 63375216, + "step": 109155 + }, + { + "epoch": 16.25856419422103, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0030863010658710084, + "loss": 0.801, + "num_input_tokens_seen": 63378064, + "step": 109160 + }, + { + "epoch": 16.25930890676199, + "grad_norm": 0.055908203125, + "learning_rate": 0.0030851165661107804, + "loss": 0.8064, + "num_input_tokens_seen": 63380848, + "step": 109165 + }, + { + "epoch": 16.260053619302948, + "grad_norm": 0.050537109375, + "learning_rate": 0.0030839322676403957, + "loss": 0.7946, + "num_input_tokens_seen": 63383696, + "step": 109170 + }, + { + "epoch": 16.260798331843908, + "grad_norm": 0.0498046875, + "learning_rate": 0.0030827481704798682, + "loss": 0.8039, + "num_input_tokens_seen": 63386544, + "step": 109175 + }, + { + "epoch": 16.26154304438487, + "grad_norm": 0.0281982421875, + "learning_rate": 0.003081564274649195, + "loss": 0.7808, + "num_input_tokens_seen": 63389520, + "step": 109180 + }, + { + "epoch": 16.262287756925826, + "grad_norm": 0.042724609375, + "learning_rate": 0.003080380580168382, + "loss": 0.808, + "num_input_tokens_seen": 63392240, + "step": 109185 + }, + { + "epoch": 16.263032469466786, + "grad_norm": 0.033447265625, + "learning_rate": 0.003079197087057423, + "loss": 0.8132, + "num_input_tokens_seen": 63395280, + "step": 109190 + }, + { + "epoch": 16.263777182007743, + "grad_norm": 0.0311279296875, + "learning_rate": 0.0030780137953363156, + "loss": 0.7956, + "num_input_tokens_seen": 63398288, + "step": 109195 + }, + { + "epoch": 16.264521894548704, + "grad_norm": 0.054443359375, + "learning_rate": 0.0030768307050250488, + "loss": 0.8133, + "num_input_tokens_seen": 63401040, + "step": 109200 + }, + { + "epoch": 16.265266607089664, + "grad_norm": 0.0771484375, + "learning_rate": 0.0030756478161436097, + "loss": 0.7923, + "num_input_tokens_seen": 63403696, + "step": 109205 + }, + { + "epoch": 16.26601131963062, + "grad_norm": 0.0595703125, + "learning_rate": 0.0030744651287119805, + "loss": 0.8381, + "num_input_tokens_seen": 63406640, + "step": 109210 + }, + { + "epoch": 16.266756032171582, + "grad_norm": 0.07763671875, + "learning_rate": 0.0030732826427501375, + "loss": 0.8199, + "num_input_tokens_seen": 63409680, + "step": 109215 + }, + { + "epoch": 16.267500744712542, + "grad_norm": 0.05224609375, + "learning_rate": 0.0030721003582780673, + "loss": 0.7812, + "num_input_tokens_seen": 63412496, + "step": 109220 + }, + { + "epoch": 16.2682454572535, + "grad_norm": 0.04296875, + "learning_rate": 0.0030709182753157365, + "loss": 0.8139, + "num_input_tokens_seen": 63415632, + "step": 109225 + }, + { + "epoch": 16.26899016979446, + "grad_norm": 0.05859375, + "learning_rate": 0.0030697363938831184, + "loss": 0.8151, + "num_input_tokens_seen": 63418512, + "step": 109230 + }, + { + "epoch": 16.269734882335417, + "grad_norm": 0.047119140625, + "learning_rate": 0.0030685547140001804, + "loss": 0.7954, + "num_input_tokens_seen": 63421328, + "step": 109235 + }, + { + "epoch": 16.270479594876377, + "grad_norm": 0.03173828125, + "learning_rate": 0.003067373235686879, + "loss": 0.7914, + "num_input_tokens_seen": 63424336, + "step": 109240 + }, + { + "epoch": 16.271224307417338, + "grad_norm": 0.0732421875, + "learning_rate": 0.0030661919589631834, + "loss": 0.7951, + "num_input_tokens_seen": 63427312, + "step": 109245 + }, + { + "epoch": 16.271969019958295, + "grad_norm": 0.0380859375, + "learning_rate": 0.0030650108838490425, + "loss": 0.8023, + "num_input_tokens_seen": 63430288, + "step": 109250 + }, + { + "epoch": 16.272713732499255, + "grad_norm": 0.0301513671875, + "learning_rate": 0.003063830010364414, + "loss": 0.797, + "num_input_tokens_seen": 63433136, + "step": 109255 + }, + { + "epoch": 16.273458445040216, + "grad_norm": 0.051025390625, + "learning_rate": 0.003062649338529245, + "loss": 0.7884, + "num_input_tokens_seen": 63435696, + "step": 109260 + }, + { + "epoch": 16.274203157581173, + "grad_norm": 0.03759765625, + "learning_rate": 0.003061468868363485, + "loss": 0.8007, + "num_input_tokens_seen": 63439088, + "step": 109265 + }, + { + "epoch": 16.274947870122134, + "grad_norm": 0.05419921875, + "learning_rate": 0.0030602885998870758, + "loss": 0.7905, + "num_input_tokens_seen": 63441968, + "step": 109270 + }, + { + "epoch": 16.27569258266309, + "grad_norm": 0.04150390625, + "learning_rate": 0.003059108533119954, + "loss": 0.8017, + "num_input_tokens_seen": 63444816, + "step": 109275 + }, + { + "epoch": 16.27643729520405, + "grad_norm": 0.02783203125, + "learning_rate": 0.0030579286680820585, + "loss": 0.8216, + "num_input_tokens_seen": 63447504, + "step": 109280 + }, + { + "epoch": 16.27718200774501, + "grad_norm": 0.10498046875, + "learning_rate": 0.0030567490047933178, + "loss": 0.8244, + "num_input_tokens_seen": 63450480, + "step": 109285 + }, + { + "epoch": 16.27792672028597, + "grad_norm": 0.039306640625, + "learning_rate": 0.0030555695432736662, + "loss": 0.8339, + "num_input_tokens_seen": 63453680, + "step": 109290 + }, + { + "epoch": 16.27867143282693, + "grad_norm": 0.05126953125, + "learning_rate": 0.003054390283543025, + "loss": 0.7929, + "num_input_tokens_seen": 63456624, + "step": 109295 + }, + { + "epoch": 16.27941614536789, + "grad_norm": 0.031494140625, + "learning_rate": 0.003053211225621322, + "loss": 0.8079, + "num_input_tokens_seen": 63459440, + "step": 109300 + }, + { + "epoch": 16.280160857908847, + "grad_norm": 0.0625, + "learning_rate": 0.0030520323695284696, + "loss": 0.7949, + "num_input_tokens_seen": 63462128, + "step": 109305 + }, + { + "epoch": 16.280905570449807, + "grad_norm": 0.0361328125, + "learning_rate": 0.0030508537152843913, + "loss": 0.8297, + "num_input_tokens_seen": 63464784, + "step": 109310 + }, + { + "epoch": 16.281650282990764, + "grad_norm": 0.043701171875, + "learning_rate": 0.0030496752629089944, + "loss": 0.7975, + "num_input_tokens_seen": 63467472, + "step": 109315 + }, + { + "epoch": 16.282394995531725, + "grad_norm": 0.041259765625, + "learning_rate": 0.0030484970124221838, + "loss": 0.8182, + "num_input_tokens_seen": 63470288, + "step": 109320 + }, + { + "epoch": 16.283139708072685, + "grad_norm": 0.04150390625, + "learning_rate": 0.003047318963843873, + "loss": 0.804, + "num_input_tokens_seen": 63472848, + "step": 109325 + }, + { + "epoch": 16.283884420613642, + "grad_norm": 0.0439453125, + "learning_rate": 0.003046141117193956, + "loss": 0.7837, + "num_input_tokens_seen": 63475984, + "step": 109330 + }, + { + "epoch": 16.284629133154603, + "grad_norm": 0.0732421875, + "learning_rate": 0.0030449634724923407, + "loss": 0.8113, + "num_input_tokens_seen": 63478672, + "step": 109335 + }, + { + "epoch": 16.285373845695563, + "grad_norm": 0.03515625, + "learning_rate": 0.0030437860297589173, + "loss": 0.8092, + "num_input_tokens_seen": 63481424, + "step": 109340 + }, + { + "epoch": 16.28611855823652, + "grad_norm": 0.035400390625, + "learning_rate": 0.0030426087890135754, + "loss": 0.7936, + "num_input_tokens_seen": 63484048, + "step": 109345 + }, + { + "epoch": 16.28686327077748, + "grad_norm": 0.0245361328125, + "learning_rate": 0.003041431750276202, + "loss": 0.8018, + "num_input_tokens_seen": 63487152, + "step": 109350 + }, + { + "epoch": 16.287607983318438, + "grad_norm": 0.05712890625, + "learning_rate": 0.00304025491356669, + "loss": 0.8213, + "num_input_tokens_seen": 63490064, + "step": 109355 + }, + { + "epoch": 16.2883526958594, + "grad_norm": 0.046630859375, + "learning_rate": 0.0030390782789049145, + "loss": 0.7894, + "num_input_tokens_seen": 63493008, + "step": 109360 + }, + { + "epoch": 16.28909740840036, + "grad_norm": 0.046142578125, + "learning_rate": 0.00303790184631075, + "loss": 0.7981, + "num_input_tokens_seen": 63495824, + "step": 109365 + }, + { + "epoch": 16.289842120941316, + "grad_norm": 0.059326171875, + "learning_rate": 0.0030367256158040827, + "loss": 0.7896, + "num_input_tokens_seen": 63498384, + "step": 109370 + }, + { + "epoch": 16.290586833482276, + "grad_norm": 0.04345703125, + "learning_rate": 0.003035549587404771, + "loss": 0.8058, + "num_input_tokens_seen": 63501264, + "step": 109375 + }, + { + "epoch": 16.291331546023233, + "grad_norm": 0.0419921875, + "learning_rate": 0.003034373761132694, + "loss": 0.7932, + "num_input_tokens_seen": 63504016, + "step": 109380 + }, + { + "epoch": 16.292076258564194, + "grad_norm": 0.04443359375, + "learning_rate": 0.0030331981370077064, + "loss": 0.7822, + "num_input_tokens_seen": 63506800, + "step": 109385 + }, + { + "epoch": 16.292820971105154, + "grad_norm": 0.06494140625, + "learning_rate": 0.0030320227150496784, + "loss": 0.797, + "num_input_tokens_seen": 63509872, + "step": 109390 + }, + { + "epoch": 16.29356568364611, + "grad_norm": 0.0361328125, + "learning_rate": 0.003030847495278463, + "loss": 0.7953, + "num_input_tokens_seen": 63512880, + "step": 109395 + }, + { + "epoch": 16.294310396187072, + "grad_norm": 0.041015625, + "learning_rate": 0.0030296724777139103, + "loss": 0.8159, + "num_input_tokens_seen": 63515824, + "step": 109400 + }, + { + "epoch": 16.295055108728032, + "grad_norm": 0.026123046875, + "learning_rate": 0.0030284976623758783, + "loss": 0.7861, + "num_input_tokens_seen": 63518864, + "step": 109405 + }, + { + "epoch": 16.29579982126899, + "grad_norm": 0.033447265625, + "learning_rate": 0.003027323049284211, + "loss": 0.8263, + "num_input_tokens_seen": 63521872, + "step": 109410 + }, + { + "epoch": 16.29654453380995, + "grad_norm": 0.057373046875, + "learning_rate": 0.0030261486384587527, + "loss": 0.796, + "num_input_tokens_seen": 63525072, + "step": 109415 + }, + { + "epoch": 16.297289246350907, + "grad_norm": 0.052734375, + "learning_rate": 0.0030249744299193403, + "loss": 0.7918, + "num_input_tokens_seen": 63528048, + "step": 109420 + }, + { + "epoch": 16.298033958891867, + "grad_norm": 0.050048828125, + "learning_rate": 0.0030238004236858166, + "loss": 0.7936, + "num_input_tokens_seen": 63530960, + "step": 109425 + }, + { + "epoch": 16.298778671432828, + "grad_norm": 0.0233154296875, + "learning_rate": 0.0030226266197780093, + "loss": 0.7968, + "num_input_tokens_seen": 63533968, + "step": 109430 + }, + { + "epoch": 16.299523383973785, + "grad_norm": 0.04345703125, + "learning_rate": 0.0030214530182157556, + "loss": 0.7786, + "num_input_tokens_seen": 63536464, + "step": 109435 + }, + { + "epoch": 16.300268096514746, + "grad_norm": 0.034912109375, + "learning_rate": 0.0030202796190188794, + "loss": 0.8063, + "num_input_tokens_seen": 63539440, + "step": 109440 + }, + { + "epoch": 16.301012809055706, + "grad_norm": 0.1044921875, + "learning_rate": 0.003019106422207199, + "loss": 0.7894, + "num_input_tokens_seen": 63542128, + "step": 109445 + }, + { + "epoch": 16.301757521596663, + "grad_norm": 0.09423828125, + "learning_rate": 0.0030179334278005424, + "loss": 0.7885, + "num_input_tokens_seen": 63545040, + "step": 109450 + }, + { + "epoch": 16.302502234137624, + "grad_norm": 0.03759765625, + "learning_rate": 0.00301676063581872, + "loss": 0.8024, + "num_input_tokens_seen": 63547824, + "step": 109455 + }, + { + "epoch": 16.30324694667858, + "grad_norm": 0.0400390625, + "learning_rate": 0.00301558804628155, + "loss": 0.8175, + "num_input_tokens_seen": 63550928, + "step": 109460 + }, + { + "epoch": 16.30399165921954, + "grad_norm": 0.040283203125, + "learning_rate": 0.0030144156592088397, + "loss": 0.7969, + "num_input_tokens_seen": 63554224, + "step": 109465 + }, + { + "epoch": 16.3047363717605, + "grad_norm": 0.055908203125, + "learning_rate": 0.0030132434746203943, + "loss": 0.7996, + "num_input_tokens_seen": 63557520, + "step": 109470 + }, + { + "epoch": 16.30548108430146, + "grad_norm": 0.035888671875, + "learning_rate": 0.003012071492536019, + "loss": 0.7992, + "num_input_tokens_seen": 63560208, + "step": 109475 + }, + { + "epoch": 16.30622579684242, + "grad_norm": 0.036376953125, + "learning_rate": 0.003010899712975508, + "loss": 0.7822, + "num_input_tokens_seen": 63563120, + "step": 109480 + }, + { + "epoch": 16.30697050938338, + "grad_norm": 0.07861328125, + "learning_rate": 0.003009728135958664, + "loss": 0.8135, + "num_input_tokens_seen": 63565968, + "step": 109485 + }, + { + "epoch": 16.307715221924337, + "grad_norm": 0.03564453125, + "learning_rate": 0.003008556761505274, + "loss": 0.822, + "num_input_tokens_seen": 63568752, + "step": 109490 + }, + { + "epoch": 16.308459934465297, + "grad_norm": 0.0361328125, + "learning_rate": 0.003007385589635133, + "loss": 0.799, + "num_input_tokens_seen": 63571664, + "step": 109495 + }, + { + "epoch": 16.309204647006254, + "grad_norm": 0.06787109375, + "learning_rate": 0.003006214620368021, + "loss": 0.802, + "num_input_tokens_seen": 63574416, + "step": 109500 + }, + { + "epoch": 16.309949359547215, + "grad_norm": 0.05712890625, + "learning_rate": 0.003005043853723726, + "loss": 0.7955, + "num_input_tokens_seen": 63577520, + "step": 109505 + }, + { + "epoch": 16.310694072088175, + "grad_norm": 0.057861328125, + "learning_rate": 0.0030038732897220205, + "loss": 0.7869, + "num_input_tokens_seen": 63580400, + "step": 109510 + }, + { + "epoch": 16.311438784629132, + "grad_norm": 0.035400390625, + "learning_rate": 0.0030027029283826866, + "loss": 0.7882, + "num_input_tokens_seen": 63583056, + "step": 109515 + }, + { + "epoch": 16.312183497170093, + "grad_norm": 0.04052734375, + "learning_rate": 0.0030015327697254937, + "loss": 0.7938, + "num_input_tokens_seen": 63586544, + "step": 109520 + }, + { + "epoch": 16.312928209711053, + "grad_norm": 0.046142578125, + "learning_rate": 0.0030003628137702065, + "loss": 0.7888, + "num_input_tokens_seen": 63589232, + "step": 109525 + }, + { + "epoch": 16.31367292225201, + "grad_norm": 0.0277099609375, + "learning_rate": 0.002999193060536597, + "loss": 0.8008, + "num_input_tokens_seen": 63592048, + "step": 109530 + }, + { + "epoch": 16.31441763479297, + "grad_norm": 0.03271484375, + "learning_rate": 0.002998023510044424, + "loss": 0.8055, + "num_input_tokens_seen": 63595184, + "step": 109535 + }, + { + "epoch": 16.315162347333928, + "grad_norm": 0.051513671875, + "learning_rate": 0.002996854162313446, + "loss": 0.8188, + "num_input_tokens_seen": 63598160, + "step": 109540 + }, + { + "epoch": 16.31590705987489, + "grad_norm": 0.0576171875, + "learning_rate": 0.002995685017363411, + "loss": 0.7973, + "num_input_tokens_seen": 63601168, + "step": 109545 + }, + { + "epoch": 16.31665177241585, + "grad_norm": 0.033935546875, + "learning_rate": 0.0029945160752140832, + "loss": 0.7806, + "num_input_tokens_seen": 63604176, + "step": 109550 + }, + { + "epoch": 16.317396484956806, + "grad_norm": 0.06494140625, + "learning_rate": 0.002993347335885205, + "loss": 0.7872, + "num_input_tokens_seen": 63606672, + "step": 109555 + }, + { + "epoch": 16.318141197497766, + "grad_norm": 0.04345703125, + "learning_rate": 0.0029921787993965154, + "loss": 0.806, + "num_input_tokens_seen": 63609904, + "step": 109560 + }, + { + "epoch": 16.318885910038723, + "grad_norm": 0.036865234375, + "learning_rate": 0.0029910104657677643, + "loss": 0.8162, + "num_input_tokens_seen": 63612560, + "step": 109565 + }, + { + "epoch": 16.319630622579684, + "grad_norm": 0.0380859375, + "learning_rate": 0.002989842335018683, + "loss": 0.7903, + "num_input_tokens_seen": 63615664, + "step": 109570 + }, + { + "epoch": 16.320375335120644, + "grad_norm": 0.060791015625, + "learning_rate": 0.0029886744071690122, + "loss": 0.7943, + "num_input_tokens_seen": 63618928, + "step": 109575 + }, + { + "epoch": 16.3211200476616, + "grad_norm": 0.03466796875, + "learning_rate": 0.0029875066822384767, + "loss": 0.7975, + "num_input_tokens_seen": 63621936, + "step": 109580 + }, + { + "epoch": 16.321864760202562, + "grad_norm": 0.049560546875, + "learning_rate": 0.0029863391602468095, + "loss": 0.7841, + "num_input_tokens_seen": 63624592, + "step": 109585 + }, + { + "epoch": 16.322609472743522, + "grad_norm": 0.0380859375, + "learning_rate": 0.0029851718412137293, + "loss": 0.8147, + "num_input_tokens_seen": 63627536, + "step": 109590 + }, + { + "epoch": 16.32335418528448, + "grad_norm": 0.038330078125, + "learning_rate": 0.0029840047251589634, + "loss": 0.781, + "num_input_tokens_seen": 63630544, + "step": 109595 + }, + { + "epoch": 16.32409889782544, + "grad_norm": 0.0242919921875, + "learning_rate": 0.002982837812102224, + "loss": 0.8041, + "num_input_tokens_seen": 63633328, + "step": 109600 + }, + { + "epoch": 16.324843610366397, + "grad_norm": 0.050537109375, + "learning_rate": 0.002981671102063227, + "loss": 0.7919, + "num_input_tokens_seen": 63636144, + "step": 109605 + }, + { + "epoch": 16.325588322907358, + "grad_norm": 0.03662109375, + "learning_rate": 0.002980504595061681, + "loss": 0.8046, + "num_input_tokens_seen": 63638704, + "step": 109610 + }, + { + "epoch": 16.326333035448318, + "grad_norm": 0.05126953125, + "learning_rate": 0.0029793382911172906, + "loss": 0.8021, + "num_input_tokens_seen": 63641744, + "step": 109615 + }, + { + "epoch": 16.327077747989275, + "grad_norm": 0.0234375, + "learning_rate": 0.0029781721902497654, + "loss": 0.7896, + "num_input_tokens_seen": 63645008, + "step": 109620 + }, + { + "epoch": 16.327822460530236, + "grad_norm": 0.05419921875, + "learning_rate": 0.0029770062924787998, + "loss": 0.7929, + "num_input_tokens_seen": 63647696, + "step": 109625 + }, + { + "epoch": 16.328567173071196, + "grad_norm": 0.049072265625, + "learning_rate": 0.002975840597824096, + "loss": 0.8095, + "num_input_tokens_seen": 63650416, + "step": 109630 + }, + { + "epoch": 16.329311885612153, + "grad_norm": 0.0274658203125, + "learning_rate": 0.002974675106305346, + "loss": 0.8071, + "num_input_tokens_seen": 63653296, + "step": 109635 + }, + { + "epoch": 16.330056598153114, + "grad_norm": 0.039306640625, + "learning_rate": 0.002973509817942235, + "loss": 0.808, + "num_input_tokens_seen": 63656144, + "step": 109640 + }, + { + "epoch": 16.33080131069407, + "grad_norm": 0.06494140625, + "learning_rate": 0.002972344732754453, + "loss": 0.7816, + "num_input_tokens_seen": 63659248, + "step": 109645 + }, + { + "epoch": 16.33154602323503, + "grad_norm": 0.048095703125, + "learning_rate": 0.0029711798507616807, + "loss": 0.8178, + "num_input_tokens_seen": 63662032, + "step": 109650 + }, + { + "epoch": 16.33229073577599, + "grad_norm": 0.056884765625, + "learning_rate": 0.0029700151719836035, + "loss": 0.7989, + "num_input_tokens_seen": 63664944, + "step": 109655 + }, + { + "epoch": 16.33303544831695, + "grad_norm": 0.038818359375, + "learning_rate": 0.002968850696439889, + "loss": 0.8076, + "num_input_tokens_seen": 63668016, + "step": 109660 + }, + { + "epoch": 16.33378016085791, + "grad_norm": 0.047607421875, + "learning_rate": 0.0029676864241502164, + "loss": 0.7783, + "num_input_tokens_seen": 63671152, + "step": 109665 + }, + { + "epoch": 16.33452487339887, + "grad_norm": 0.053955078125, + "learning_rate": 0.002966522355134255, + "loss": 0.8111, + "num_input_tokens_seen": 63673872, + "step": 109670 + }, + { + "epoch": 16.335269585939827, + "grad_norm": 0.045166015625, + "learning_rate": 0.0029653584894116674, + "loss": 0.7769, + "num_input_tokens_seen": 63676656, + "step": 109675 + }, + { + "epoch": 16.336014298480787, + "grad_norm": 0.04736328125, + "learning_rate": 0.002964194827002116, + "loss": 0.7967, + "num_input_tokens_seen": 63679792, + "step": 109680 + }, + { + "epoch": 16.336759011021744, + "grad_norm": 0.05712890625, + "learning_rate": 0.0029630313679252555, + "loss": 0.7868, + "num_input_tokens_seen": 63682736, + "step": 109685 + }, + { + "epoch": 16.337503723562705, + "grad_norm": 0.048583984375, + "learning_rate": 0.0029618681122007516, + "loss": 0.8248, + "num_input_tokens_seen": 63685776, + "step": 109690 + }, + { + "epoch": 16.338248436103665, + "grad_norm": 0.04541015625, + "learning_rate": 0.002960705059848247, + "loss": 0.7909, + "num_input_tokens_seen": 63688912, + "step": 109695 + }, + { + "epoch": 16.338993148644622, + "grad_norm": 0.05078125, + "learning_rate": 0.0029595422108873973, + "loss": 0.8043, + "num_input_tokens_seen": 63692048, + "step": 109700 + }, + { + "epoch": 16.339737861185583, + "grad_norm": 0.07470703125, + "learning_rate": 0.002958379565337841, + "loss": 0.8022, + "num_input_tokens_seen": 63694864, + "step": 109705 + }, + { + "epoch": 16.34048257372654, + "grad_norm": 0.04833984375, + "learning_rate": 0.0029572171232192276, + "loss": 0.7943, + "num_input_tokens_seen": 63697904, + "step": 109710 + }, + { + "epoch": 16.3412272862675, + "grad_norm": 0.02490234375, + "learning_rate": 0.002956054884551188, + "loss": 0.8167, + "num_input_tokens_seen": 63700720, + "step": 109715 + }, + { + "epoch": 16.34197199880846, + "grad_norm": 0.0517578125, + "learning_rate": 0.002954892849353362, + "loss": 0.8041, + "num_input_tokens_seen": 63703504, + "step": 109720 + }, + { + "epoch": 16.342716711349418, + "grad_norm": 0.0517578125, + "learning_rate": 0.002953731017645381, + "loss": 0.7948, + "num_input_tokens_seen": 63706192, + "step": 109725 + }, + { + "epoch": 16.34346142389038, + "grad_norm": 0.05615234375, + "learning_rate": 0.002952569389446866, + "loss": 0.8004, + "num_input_tokens_seen": 63708944, + "step": 109730 + }, + { + "epoch": 16.34420613643134, + "grad_norm": 0.038330078125, + "learning_rate": 0.0029514079647774516, + "loss": 0.792, + "num_input_tokens_seen": 63711664, + "step": 109735 + }, + { + "epoch": 16.344950848972296, + "grad_norm": 0.0498046875, + "learning_rate": 0.002950246743656755, + "loss": 0.8065, + "num_input_tokens_seen": 63714608, + "step": 109740 + }, + { + "epoch": 16.345695561513256, + "grad_norm": 0.057373046875, + "learning_rate": 0.0029490857261043916, + "loss": 0.7874, + "num_input_tokens_seen": 63717648, + "step": 109745 + }, + { + "epoch": 16.346440274054213, + "grad_norm": 0.0498046875, + "learning_rate": 0.002947924912139972, + "loss": 0.8046, + "num_input_tokens_seen": 63720528, + "step": 109750 + }, + { + "epoch": 16.347184986595174, + "grad_norm": 0.130859375, + "learning_rate": 0.002946764301783116, + "loss": 0.7904, + "num_input_tokens_seen": 63723472, + "step": 109755 + }, + { + "epoch": 16.347929699136134, + "grad_norm": 0.05810546875, + "learning_rate": 0.002945603895053429, + "loss": 0.7909, + "num_input_tokens_seen": 63726160, + "step": 109760 + }, + { + "epoch": 16.34867441167709, + "grad_norm": 0.0556640625, + "learning_rate": 0.002944443691970506, + "loss": 0.7885, + "num_input_tokens_seen": 63729200, + "step": 109765 + }, + { + "epoch": 16.349419124218052, + "grad_norm": 0.052734375, + "learning_rate": 0.002943283692553959, + "loss": 0.7875, + "num_input_tokens_seen": 63731888, + "step": 109770 + }, + { + "epoch": 16.350163836759013, + "grad_norm": 0.056884765625, + "learning_rate": 0.0029421238968233765, + "loss": 0.7928, + "num_input_tokens_seen": 63734608, + "step": 109775 + }, + { + "epoch": 16.35090854929997, + "grad_norm": 0.03466796875, + "learning_rate": 0.0029409643047983596, + "loss": 0.7934, + "num_input_tokens_seen": 63737616, + "step": 109780 + }, + { + "epoch": 16.35165326184093, + "grad_norm": 0.0263671875, + "learning_rate": 0.002939804916498491, + "loss": 0.7882, + "num_input_tokens_seen": 63740336, + "step": 109785 + }, + { + "epoch": 16.352397974381887, + "grad_norm": 0.044189453125, + "learning_rate": 0.0029386457319433637, + "loss": 0.7843, + "num_input_tokens_seen": 63742960, + "step": 109790 + }, + { + "epoch": 16.353142686922848, + "grad_norm": 0.054443359375, + "learning_rate": 0.0029374867511525586, + "loss": 0.7824, + "num_input_tokens_seen": 63745776, + "step": 109795 + }, + { + "epoch": 16.353887399463808, + "grad_norm": 0.04345703125, + "learning_rate": 0.0029363279741456564, + "loss": 0.8044, + "num_input_tokens_seen": 63748528, + "step": 109800 + }, + { + "epoch": 16.354632112004765, + "grad_norm": 0.045654296875, + "learning_rate": 0.00293516940094223, + "loss": 0.7785, + "num_input_tokens_seen": 63751696, + "step": 109805 + }, + { + "epoch": 16.355376824545726, + "grad_norm": 0.042724609375, + "learning_rate": 0.0029340110315618523, + "loss": 0.8156, + "num_input_tokens_seen": 63754576, + "step": 109810 + }, + { + "epoch": 16.356121537086686, + "grad_norm": 0.0458984375, + "learning_rate": 0.002932852866024097, + "loss": 0.7889, + "num_input_tokens_seen": 63757456, + "step": 109815 + }, + { + "epoch": 16.356866249627643, + "grad_norm": 0.037109375, + "learning_rate": 0.0029316949043485256, + "loss": 0.7976, + "num_input_tokens_seen": 63760400, + "step": 109820 + }, + { + "epoch": 16.357610962168604, + "grad_norm": 0.036376953125, + "learning_rate": 0.002930537146554706, + "loss": 0.7922, + "num_input_tokens_seen": 63763248, + "step": 109825 + }, + { + "epoch": 16.35835567470956, + "grad_norm": 0.03466796875, + "learning_rate": 0.0029293795926621916, + "loss": 0.7906, + "num_input_tokens_seen": 63766256, + "step": 109830 + }, + { + "epoch": 16.35910038725052, + "grad_norm": 0.033935546875, + "learning_rate": 0.0029282222426905446, + "loss": 0.8106, + "num_input_tokens_seen": 63768944, + "step": 109835 + }, + { + "epoch": 16.35984509979148, + "grad_norm": 0.039306640625, + "learning_rate": 0.002927065096659312, + "loss": 0.7873, + "num_input_tokens_seen": 63771664, + "step": 109840 + }, + { + "epoch": 16.36058981233244, + "grad_norm": 0.03955078125, + "learning_rate": 0.0029259081545880417, + "loss": 0.7967, + "num_input_tokens_seen": 63774448, + "step": 109845 + }, + { + "epoch": 16.3613345248734, + "grad_norm": 0.04638671875, + "learning_rate": 0.002924751416496286, + "loss": 0.7879, + "num_input_tokens_seen": 63777264, + "step": 109850 + }, + { + "epoch": 16.36207923741436, + "grad_norm": 0.0546875, + "learning_rate": 0.0029235948824035767, + "loss": 0.8091, + "num_input_tokens_seen": 63780112, + "step": 109855 + }, + { + "epoch": 16.362823949955317, + "grad_norm": 0.0419921875, + "learning_rate": 0.002922438552329464, + "loss": 0.7795, + "num_input_tokens_seen": 63782832, + "step": 109860 + }, + { + "epoch": 16.363568662496277, + "grad_norm": 0.04150390625, + "learning_rate": 0.002921282426293475, + "loss": 0.7757, + "num_input_tokens_seen": 63785936, + "step": 109865 + }, + { + "epoch": 16.364313375037234, + "grad_norm": 0.07421875, + "learning_rate": 0.0029201265043151447, + "loss": 0.8183, + "num_input_tokens_seen": 63788848, + "step": 109870 + }, + { + "epoch": 16.365058087578195, + "grad_norm": 0.032958984375, + "learning_rate": 0.0029189707864139946, + "loss": 0.7921, + "num_input_tokens_seen": 63791856, + "step": 109875 + }, + { + "epoch": 16.365802800119155, + "grad_norm": 0.0517578125, + "learning_rate": 0.002917815272609559, + "loss": 0.8077, + "num_input_tokens_seen": 63794672, + "step": 109880 + }, + { + "epoch": 16.366547512660112, + "grad_norm": 0.042236328125, + "learning_rate": 0.002916659962921354, + "loss": 0.8032, + "num_input_tokens_seen": 63797648, + "step": 109885 + }, + { + "epoch": 16.367292225201073, + "grad_norm": 0.06494140625, + "learning_rate": 0.0029155048573688957, + "loss": 0.8005, + "num_input_tokens_seen": 63800848, + "step": 109890 + }, + { + "epoch": 16.36803693774203, + "grad_norm": 0.04248046875, + "learning_rate": 0.0029143499559717034, + "loss": 0.7979, + "num_input_tokens_seen": 63803952, + "step": 109895 + }, + { + "epoch": 16.36878165028299, + "grad_norm": 0.054931640625, + "learning_rate": 0.0029131952587492815, + "loss": 0.8018, + "num_input_tokens_seen": 63806672, + "step": 109900 + }, + { + "epoch": 16.36952636282395, + "grad_norm": 0.033935546875, + "learning_rate": 0.0029120407657211443, + "loss": 0.7815, + "num_input_tokens_seen": 63809616, + "step": 109905 + }, + { + "epoch": 16.370271075364908, + "grad_norm": 0.03759765625, + "learning_rate": 0.0029108864769067898, + "loss": 0.7866, + "num_input_tokens_seen": 63812336, + "step": 109910 + }, + { + "epoch": 16.37101578790587, + "grad_norm": 0.035888671875, + "learning_rate": 0.002909732392325724, + "loss": 0.8105, + "num_input_tokens_seen": 63815568, + "step": 109915 + }, + { + "epoch": 16.37176050044683, + "grad_norm": 0.0537109375, + "learning_rate": 0.0029085785119974425, + "loss": 0.7842, + "num_input_tokens_seen": 63818224, + "step": 109920 + }, + { + "epoch": 16.372505212987786, + "grad_norm": 0.056396484375, + "learning_rate": 0.0029074248359414337, + "loss": 0.8032, + "num_input_tokens_seen": 63821072, + "step": 109925 + }, + { + "epoch": 16.373249925528746, + "grad_norm": 0.0634765625, + "learning_rate": 0.002906271364177197, + "loss": 0.8014, + "num_input_tokens_seen": 63824688, + "step": 109930 + }, + { + "epoch": 16.373994638069703, + "grad_norm": 0.056396484375, + "learning_rate": 0.0029051180967242128, + "loss": 0.786, + "num_input_tokens_seen": 63827952, + "step": 109935 + }, + { + "epoch": 16.374739350610664, + "grad_norm": 0.0250244140625, + "learning_rate": 0.002903965033601966, + "loss": 0.8011, + "num_input_tokens_seen": 63830832, + "step": 109940 + }, + { + "epoch": 16.375484063151625, + "grad_norm": 0.05712890625, + "learning_rate": 0.002902812174829933, + "loss": 0.7939, + "num_input_tokens_seen": 63833776, + "step": 109945 + }, + { + "epoch": 16.37622877569258, + "grad_norm": 0.06640625, + "learning_rate": 0.0029016595204275956, + "loss": 0.8043, + "num_input_tokens_seen": 63836592, + "step": 109950 + }, + { + "epoch": 16.376973488233542, + "grad_norm": 0.0419921875, + "learning_rate": 0.0029005070704144226, + "loss": 0.7919, + "num_input_tokens_seen": 63839504, + "step": 109955 + }, + { + "epoch": 16.377718200774503, + "grad_norm": 0.0245361328125, + "learning_rate": 0.0028993548248098877, + "loss": 0.7987, + "num_input_tokens_seen": 63842320, + "step": 109960 + }, + { + "epoch": 16.37846291331546, + "grad_norm": 0.0458984375, + "learning_rate": 0.002898202783633454, + "loss": 0.8095, + "num_input_tokens_seen": 63845040, + "step": 109965 + }, + { + "epoch": 16.37920762585642, + "grad_norm": 0.041015625, + "learning_rate": 0.0028970509469045808, + "loss": 0.7784, + "num_input_tokens_seen": 63847920, + "step": 109970 + }, + { + "epoch": 16.379952338397377, + "grad_norm": 0.040771484375, + "learning_rate": 0.002895899314642736, + "loss": 0.8032, + "num_input_tokens_seen": 63850864, + "step": 109975 + }, + { + "epoch": 16.380697050938338, + "grad_norm": 0.07373046875, + "learning_rate": 0.002894747886867365, + "loss": 0.8077, + "num_input_tokens_seen": 63854128, + "step": 109980 + }, + { + "epoch": 16.381441763479298, + "grad_norm": 0.061767578125, + "learning_rate": 0.0028935966635979305, + "loss": 0.7918, + "num_input_tokens_seen": 63857232, + "step": 109985 + }, + { + "epoch": 16.382186476020255, + "grad_norm": 0.0546875, + "learning_rate": 0.002892445644853872, + "loss": 0.7978, + "num_input_tokens_seen": 63859888, + "step": 109990 + }, + { + "epoch": 16.382931188561216, + "grad_norm": 0.043212890625, + "learning_rate": 0.0028912948306546414, + "loss": 0.8073, + "num_input_tokens_seen": 63862832, + "step": 109995 + }, + { + "epoch": 16.383675901102176, + "grad_norm": 0.059814453125, + "learning_rate": 0.002890144221019678, + "loss": 0.8026, + "num_input_tokens_seen": 63865616, + "step": 110000 + }, + { + "epoch": 16.384420613643133, + "grad_norm": 0.03662109375, + "learning_rate": 0.002888993815968421, + "loss": 0.7954, + "num_input_tokens_seen": 63868496, + "step": 110005 + }, + { + "epoch": 16.385165326184094, + "grad_norm": 0.0908203125, + "learning_rate": 0.002887843615520304, + "loss": 0.8006, + "num_input_tokens_seen": 63871792, + "step": 110010 + }, + { + "epoch": 16.38591003872505, + "grad_norm": 0.05126953125, + "learning_rate": 0.0028866936196947557, + "loss": 0.7937, + "num_input_tokens_seen": 63874352, + "step": 110015 + }, + { + "epoch": 16.38665475126601, + "grad_norm": 0.03564453125, + "learning_rate": 0.0028855438285112106, + "loss": 0.7847, + "num_input_tokens_seen": 63877264, + "step": 110020 + }, + { + "epoch": 16.38739946380697, + "grad_norm": 0.05322265625, + "learning_rate": 0.0028843942419890848, + "loss": 0.7868, + "num_input_tokens_seen": 63880368, + "step": 110025 + }, + { + "epoch": 16.38814417634793, + "grad_norm": 0.037109375, + "learning_rate": 0.00288324486014781, + "loss": 0.805, + "num_input_tokens_seen": 63883088, + "step": 110030 + }, + { + "epoch": 16.38888888888889, + "grad_norm": 0.038330078125, + "learning_rate": 0.002882095683006795, + "loss": 0.798, + "num_input_tokens_seen": 63885744, + "step": 110035 + }, + { + "epoch": 16.38963360142985, + "grad_norm": 0.072265625, + "learning_rate": 0.0028809467105854595, + "loss": 0.7991, + "num_input_tokens_seen": 63888624, + "step": 110040 + }, + { + "epoch": 16.390378313970807, + "grad_norm": 0.044189453125, + "learning_rate": 0.0028797979429032133, + "loss": 0.7934, + "num_input_tokens_seen": 63891664, + "step": 110045 + }, + { + "epoch": 16.391123026511767, + "grad_norm": 0.0673828125, + "learning_rate": 0.0028786493799794572, + "loss": 0.7959, + "num_input_tokens_seen": 63894416, + "step": 110050 + }, + { + "epoch": 16.391867739052724, + "grad_norm": 0.0654296875, + "learning_rate": 0.0028775010218336054, + "loss": 0.7929, + "num_input_tokens_seen": 63897520, + "step": 110055 + }, + { + "epoch": 16.392612451593685, + "grad_norm": 0.02734375, + "learning_rate": 0.0028763528684850495, + "loss": 0.7935, + "num_input_tokens_seen": 63900336, + "step": 110060 + }, + { + "epoch": 16.393357164134645, + "grad_norm": 0.051025390625, + "learning_rate": 0.0028752049199531923, + "loss": 0.7935, + "num_input_tokens_seen": 63903280, + "step": 110065 + }, + { + "epoch": 16.394101876675602, + "grad_norm": 0.07373046875, + "learning_rate": 0.002874057176257427, + "loss": 0.8018, + "num_input_tokens_seen": 63906224, + "step": 110070 + }, + { + "epoch": 16.394846589216563, + "grad_norm": 0.049072265625, + "learning_rate": 0.0028729096374171385, + "loss": 0.783, + "num_input_tokens_seen": 63908944, + "step": 110075 + }, + { + "epoch": 16.39559130175752, + "grad_norm": 0.0284423828125, + "learning_rate": 0.002871762303451719, + "loss": 0.8077, + "num_input_tokens_seen": 63911728, + "step": 110080 + }, + { + "epoch": 16.39633601429848, + "grad_norm": 0.06494140625, + "learning_rate": 0.002870615174380543, + "loss": 0.7644, + "num_input_tokens_seen": 63914480, + "step": 110085 + }, + { + "epoch": 16.39708072683944, + "grad_norm": 0.05078125, + "learning_rate": 0.002869468250223001, + "loss": 0.7913, + "num_input_tokens_seen": 63917360, + "step": 110090 + }, + { + "epoch": 16.397825439380398, + "grad_norm": 0.07275390625, + "learning_rate": 0.0028683215309984597, + "loss": 0.7846, + "num_input_tokens_seen": 63920368, + "step": 110095 + }, + { + "epoch": 16.39857015192136, + "grad_norm": 0.038818359375, + "learning_rate": 0.0028671750167263005, + "loss": 0.8084, + "num_input_tokens_seen": 63923312, + "step": 110100 + }, + { + "epoch": 16.39931486446232, + "grad_norm": 0.0439453125, + "learning_rate": 0.002866028707425884, + "loss": 0.8131, + "num_input_tokens_seen": 63926032, + "step": 110105 + }, + { + "epoch": 16.400059577003276, + "grad_norm": 0.035888671875, + "learning_rate": 0.0028648826031165827, + "loss": 0.7871, + "num_input_tokens_seen": 63928944, + "step": 110110 + }, + { + "epoch": 16.400804289544237, + "grad_norm": 0.0341796875, + "learning_rate": 0.002863736703817754, + "loss": 0.7928, + "num_input_tokens_seen": 63931824, + "step": 110115 + }, + { + "epoch": 16.401549002085194, + "grad_norm": 0.03564453125, + "learning_rate": 0.0028625910095487614, + "loss": 0.8018, + "num_input_tokens_seen": 63934672, + "step": 110120 + }, + { + "epoch": 16.402293714626154, + "grad_norm": 0.0498046875, + "learning_rate": 0.0028614455203289585, + "loss": 0.7899, + "num_input_tokens_seen": 63937488, + "step": 110125 + }, + { + "epoch": 16.403038427167115, + "grad_norm": 0.06640625, + "learning_rate": 0.0028603002361776973, + "loss": 0.7786, + "num_input_tokens_seen": 63940624, + "step": 110130 + }, + { + "epoch": 16.40378313970807, + "grad_norm": 0.0400390625, + "learning_rate": 0.0028591551571143245, + "loss": 0.7982, + "num_input_tokens_seen": 63943792, + "step": 110135 + }, + { + "epoch": 16.404527852249032, + "grad_norm": 0.078125, + "learning_rate": 0.0028580102831581816, + "loss": 0.8012, + "num_input_tokens_seen": 63946544, + "step": 110140 + }, + { + "epoch": 16.405272564789993, + "grad_norm": 0.039794921875, + "learning_rate": 0.002856865614328619, + "loss": 0.7958, + "num_input_tokens_seen": 63949648, + "step": 110145 + }, + { + "epoch": 16.40601727733095, + "grad_norm": 0.05615234375, + "learning_rate": 0.002855721150644965, + "loss": 0.7781, + "num_input_tokens_seen": 63952720, + "step": 110150 + }, + { + "epoch": 16.40676198987191, + "grad_norm": 0.047607421875, + "learning_rate": 0.0028545768921265646, + "loss": 0.8229, + "num_input_tokens_seen": 63955984, + "step": 110155 + }, + { + "epoch": 16.407506702412867, + "grad_norm": 0.0291748046875, + "learning_rate": 0.002853432838792743, + "loss": 0.8022, + "num_input_tokens_seen": 63959184, + "step": 110160 + }, + { + "epoch": 16.408251414953828, + "grad_norm": 0.04931640625, + "learning_rate": 0.0028522889906628258, + "loss": 0.7962, + "num_input_tokens_seen": 63961904, + "step": 110165 + }, + { + "epoch": 16.408996127494788, + "grad_norm": 0.046142578125, + "learning_rate": 0.0028511453477561414, + "loss": 0.7843, + "num_input_tokens_seen": 63964496, + "step": 110170 + }, + { + "epoch": 16.409740840035745, + "grad_norm": 0.04638671875, + "learning_rate": 0.0028500019100920052, + "loss": 0.8008, + "num_input_tokens_seen": 63967312, + "step": 110175 + }, + { + "epoch": 16.410485552576706, + "grad_norm": 0.0546875, + "learning_rate": 0.0028488586776897434, + "loss": 0.7935, + "num_input_tokens_seen": 63970192, + "step": 110180 + }, + { + "epoch": 16.411230265117666, + "grad_norm": 0.0311279296875, + "learning_rate": 0.00284771565056866, + "loss": 0.7787, + "num_input_tokens_seen": 63973264, + "step": 110185 + }, + { + "epoch": 16.411974977658623, + "grad_norm": 0.07666015625, + "learning_rate": 0.0028465728287480735, + "loss": 0.7872, + "num_input_tokens_seen": 63976144, + "step": 110190 + }, + { + "epoch": 16.412719690199584, + "grad_norm": 0.0289306640625, + "learning_rate": 0.002845430212247286, + "loss": 0.8212, + "num_input_tokens_seen": 63978864, + "step": 110195 + }, + { + "epoch": 16.41346440274054, + "grad_norm": 0.0634765625, + "learning_rate": 0.0028442878010856033, + "loss": 0.8145, + "num_input_tokens_seen": 63981744, + "step": 110200 + }, + { + "epoch": 16.4142091152815, + "grad_norm": 0.057373046875, + "learning_rate": 0.0028431455952823217, + "loss": 0.8009, + "num_input_tokens_seen": 63984880, + "step": 110205 + }, + { + "epoch": 16.414953827822462, + "grad_norm": 0.0517578125, + "learning_rate": 0.0028420035948567373, + "loss": 0.8073, + "num_input_tokens_seen": 63987760, + "step": 110210 + }, + { + "epoch": 16.41569854036342, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0028408617998281487, + "loss": 0.8077, + "num_input_tokens_seen": 63990576, + "step": 110215 + }, + { + "epoch": 16.41644325290438, + "grad_norm": 0.052490234375, + "learning_rate": 0.0028397202102158374, + "loss": 0.8144, + "num_input_tokens_seen": 63993616, + "step": 110220 + }, + { + "epoch": 16.417187965445336, + "grad_norm": 0.03759765625, + "learning_rate": 0.0028385788260390983, + "loss": 0.7948, + "num_input_tokens_seen": 63996304, + "step": 110225 + }, + { + "epoch": 16.417932677986297, + "grad_norm": 0.078125, + "learning_rate": 0.0028374376473172067, + "loss": 0.7942, + "num_input_tokens_seen": 63999152, + "step": 110230 + }, + { + "epoch": 16.418677390527257, + "grad_norm": 0.05712890625, + "learning_rate": 0.002836296674069447, + "loss": 0.7931, + "num_input_tokens_seen": 64002032, + "step": 110235 + }, + { + "epoch": 16.419422103068214, + "grad_norm": 0.04541015625, + "learning_rate": 0.002835155906315094, + "loss": 0.7912, + "num_input_tokens_seen": 64004880, + "step": 110240 + }, + { + "epoch": 16.420166815609175, + "grad_norm": 0.052978515625, + "learning_rate": 0.0028340153440734126, + "loss": 0.7945, + "num_input_tokens_seen": 64007984, + "step": 110245 + }, + { + "epoch": 16.420911528150135, + "grad_norm": 0.02783203125, + "learning_rate": 0.0028328749873636817, + "loss": 0.8336, + "num_input_tokens_seen": 64011056, + "step": 110250 + }, + { + "epoch": 16.421656240691092, + "grad_norm": 0.0341796875, + "learning_rate": 0.002831734836205158, + "loss": 0.7941, + "num_input_tokens_seen": 64014064, + "step": 110255 + }, + { + "epoch": 16.422400953232053, + "grad_norm": 0.0380859375, + "learning_rate": 0.00283059489061711, + "loss": 0.8061, + "num_input_tokens_seen": 64017168, + "step": 110260 + }, + { + "epoch": 16.42314566577301, + "grad_norm": 0.0311279296875, + "learning_rate": 0.002829455150618794, + "loss": 0.7902, + "num_input_tokens_seen": 64020016, + "step": 110265 + }, + { + "epoch": 16.42389037831397, + "grad_norm": 0.05224609375, + "learning_rate": 0.002828315616229463, + "loss": 0.8408, + "num_input_tokens_seen": 64022896, + "step": 110270 + }, + { + "epoch": 16.42463509085493, + "grad_norm": 0.0277099609375, + "learning_rate": 0.002827176287468365, + "loss": 0.7922, + "num_input_tokens_seen": 64026032, + "step": 110275 + }, + { + "epoch": 16.425379803395888, + "grad_norm": 0.048828125, + "learning_rate": 0.0028260371643547572, + "loss": 0.796, + "num_input_tokens_seen": 64029136, + "step": 110280 + }, + { + "epoch": 16.42612451593685, + "grad_norm": 0.08154296875, + "learning_rate": 0.0028248982469078763, + "loss": 0.787, + "num_input_tokens_seen": 64031824, + "step": 110285 + }, + { + "epoch": 16.42686922847781, + "grad_norm": 0.06201171875, + "learning_rate": 0.0028237595351469622, + "loss": 0.7823, + "num_input_tokens_seen": 64034896, + "step": 110290 + }, + { + "epoch": 16.427613941018766, + "grad_norm": 0.052490234375, + "learning_rate": 0.0028226210290912593, + "loss": 0.7762, + "num_input_tokens_seen": 64037808, + "step": 110295 + }, + { + "epoch": 16.428358653559727, + "grad_norm": 0.057861328125, + "learning_rate": 0.002821482728759994, + "loss": 0.7933, + "num_input_tokens_seen": 64040848, + "step": 110300 + }, + { + "epoch": 16.429103366100684, + "grad_norm": 0.043701171875, + "learning_rate": 0.002820344634172404, + "loss": 0.7831, + "num_input_tokens_seen": 64043696, + "step": 110305 + }, + { + "epoch": 16.429848078641644, + "grad_norm": 0.045654296875, + "learning_rate": 0.0028192067453477075, + "loss": 0.8119, + "num_input_tokens_seen": 64046416, + "step": 110310 + }, + { + "epoch": 16.430592791182605, + "grad_norm": 0.056396484375, + "learning_rate": 0.0028180690623051388, + "loss": 0.8071, + "num_input_tokens_seen": 64049520, + "step": 110315 + }, + { + "epoch": 16.43133750372356, + "grad_norm": 0.08447265625, + "learning_rate": 0.0028169315850639066, + "loss": 0.7904, + "num_input_tokens_seen": 64052592, + "step": 110320 + }, + { + "epoch": 16.432082216264522, + "grad_norm": 0.0341796875, + "learning_rate": 0.0028157943136432376, + "loss": 0.7873, + "num_input_tokens_seen": 64055408, + "step": 110325 + }, + { + "epoch": 16.432826928805483, + "grad_norm": 0.052978515625, + "learning_rate": 0.002814657248062341, + "loss": 0.8459, + "num_input_tokens_seen": 64058096, + "step": 110330 + }, + { + "epoch": 16.43357164134644, + "grad_norm": 0.043701171875, + "learning_rate": 0.002813520388340424, + "loss": 0.7739, + "num_input_tokens_seen": 64060784, + "step": 110335 + }, + { + "epoch": 16.4343163538874, + "grad_norm": 0.0517578125, + "learning_rate": 0.002812383734496695, + "loss": 0.7932, + "num_input_tokens_seen": 64063568, + "step": 110340 + }, + { + "epoch": 16.435061066428357, + "grad_norm": 0.0537109375, + "learning_rate": 0.002811247286550352, + "loss": 0.8152, + "num_input_tokens_seen": 64066224, + "step": 110345 + }, + { + "epoch": 16.435805778969318, + "grad_norm": 0.0279541015625, + "learning_rate": 0.002810111044520602, + "loss": 0.7733, + "num_input_tokens_seen": 64069264, + "step": 110350 + }, + { + "epoch": 16.43655049151028, + "grad_norm": 0.035888671875, + "learning_rate": 0.0028089750084266327, + "loss": 0.7841, + "num_input_tokens_seen": 64071856, + "step": 110355 + }, + { + "epoch": 16.437295204051235, + "grad_norm": 0.0673828125, + "learning_rate": 0.002807839178287645, + "loss": 0.7977, + "num_input_tokens_seen": 64074704, + "step": 110360 + }, + { + "epoch": 16.438039916592196, + "grad_norm": 0.036376953125, + "learning_rate": 0.002806703554122821, + "loss": 0.7911, + "num_input_tokens_seen": 64077712, + "step": 110365 + }, + { + "epoch": 16.438784629133156, + "grad_norm": 0.046142578125, + "learning_rate": 0.002805568135951345, + "loss": 0.7817, + "num_input_tokens_seen": 64080656, + "step": 110370 + }, + { + "epoch": 16.439529341674113, + "grad_norm": 0.09912109375, + "learning_rate": 0.002804432923792406, + "loss": 0.797, + "num_input_tokens_seen": 64083600, + "step": 110375 + }, + { + "epoch": 16.440274054215074, + "grad_norm": 0.04638671875, + "learning_rate": 0.002803297917665172, + "loss": 0.813, + "num_input_tokens_seen": 64086672, + "step": 110380 + }, + { + "epoch": 16.44101876675603, + "grad_norm": 0.06396484375, + "learning_rate": 0.0028021631175888287, + "loss": 0.7934, + "num_input_tokens_seen": 64089552, + "step": 110385 + }, + { + "epoch": 16.44176347929699, + "grad_norm": 0.041748046875, + "learning_rate": 0.0028010285235825372, + "loss": 0.7873, + "num_input_tokens_seen": 64092304, + "step": 110390 + }, + { + "epoch": 16.442508191837952, + "grad_norm": 0.0260009765625, + "learning_rate": 0.002799894135665476, + "loss": 0.8069, + "num_input_tokens_seen": 64095216, + "step": 110395 + }, + { + "epoch": 16.44325290437891, + "grad_norm": 0.04638671875, + "learning_rate": 0.0027987599538568018, + "loss": 0.798, + "num_input_tokens_seen": 64098192, + "step": 110400 + }, + { + "epoch": 16.44399761691987, + "grad_norm": 0.03564453125, + "learning_rate": 0.0027976259781756795, + "loss": 0.8019, + "num_input_tokens_seen": 64101040, + "step": 110405 + }, + { + "epoch": 16.44474232946083, + "grad_norm": 0.044677734375, + "learning_rate": 0.0027964922086412616, + "loss": 0.7876, + "num_input_tokens_seen": 64104112, + "step": 110410 + }, + { + "epoch": 16.445487042001787, + "grad_norm": 0.035888671875, + "learning_rate": 0.0027953586452727037, + "loss": 0.7915, + "num_input_tokens_seen": 64107088, + "step": 110415 + }, + { + "epoch": 16.446231754542747, + "grad_norm": 0.053466796875, + "learning_rate": 0.0027942252880891593, + "loss": 0.8029, + "num_input_tokens_seen": 64110000, + "step": 110420 + }, + { + "epoch": 16.446976467083704, + "grad_norm": 0.0361328125, + "learning_rate": 0.0027930921371097706, + "loss": 0.794, + "num_input_tokens_seen": 64112752, + "step": 110425 + }, + { + "epoch": 16.447721179624665, + "grad_norm": 0.0269775390625, + "learning_rate": 0.002791959192353687, + "loss": 0.7892, + "num_input_tokens_seen": 64115568, + "step": 110430 + }, + { + "epoch": 16.448465892165625, + "grad_norm": 0.08154296875, + "learning_rate": 0.0027908264538400412, + "loss": 0.8045, + "num_input_tokens_seen": 64118320, + "step": 110435 + }, + { + "epoch": 16.449210604706582, + "grad_norm": 0.054443359375, + "learning_rate": 0.0027896939215879767, + "loss": 0.8233, + "num_input_tokens_seen": 64121072, + "step": 110440 + }, + { + "epoch": 16.449955317247543, + "grad_norm": 0.0380859375, + "learning_rate": 0.0027885615956166248, + "loss": 0.7991, + "num_input_tokens_seen": 64124016, + "step": 110445 + }, + { + "epoch": 16.4507000297885, + "grad_norm": 0.061767578125, + "learning_rate": 0.0027874294759451085, + "loss": 0.7826, + "num_input_tokens_seen": 64127280, + "step": 110450 + }, + { + "epoch": 16.45144474232946, + "grad_norm": 0.0238037109375, + "learning_rate": 0.0027862975625925638, + "loss": 0.7974, + "num_input_tokens_seen": 64130192, + "step": 110455 + }, + { + "epoch": 16.45218945487042, + "grad_norm": 0.046142578125, + "learning_rate": 0.0027851658555781075, + "loss": 0.7829, + "num_input_tokens_seen": 64133360, + "step": 110460 + }, + { + "epoch": 16.452934167411378, + "grad_norm": 0.08154296875, + "learning_rate": 0.0027840343549208595, + "loss": 0.7807, + "num_input_tokens_seen": 64136560, + "step": 110465 + }, + { + "epoch": 16.45367887995234, + "grad_norm": 0.05322265625, + "learning_rate": 0.0027829030606399325, + "loss": 0.7989, + "num_input_tokens_seen": 64139760, + "step": 110470 + }, + { + "epoch": 16.4544235924933, + "grad_norm": 0.0341796875, + "learning_rate": 0.0027817719727544444, + "loss": 0.7907, + "num_input_tokens_seen": 64142672, + "step": 110475 + }, + { + "epoch": 16.455168305034256, + "grad_norm": 0.0556640625, + "learning_rate": 0.0027806410912834967, + "loss": 0.7988, + "num_input_tokens_seen": 64145488, + "step": 110480 + }, + { + "epoch": 16.455913017575217, + "grad_norm": 0.042724609375, + "learning_rate": 0.0027795104162462022, + "loss": 0.8119, + "num_input_tokens_seen": 64148464, + "step": 110485 + }, + { + "epoch": 16.456657730116174, + "grad_norm": 0.060302734375, + "learning_rate": 0.0027783799476616597, + "loss": 0.8087, + "num_input_tokens_seen": 64151376, + "step": 110490 + }, + { + "epoch": 16.457402442657134, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0027772496855489634, + "loss": 0.8159, + "num_input_tokens_seen": 64154352, + "step": 110495 + }, + { + "epoch": 16.458147155198095, + "grad_norm": 0.034912109375, + "learning_rate": 0.0027761196299272133, + "loss": 0.7858, + "num_input_tokens_seen": 64157232, + "step": 110500 + }, + { + "epoch": 16.45889186773905, + "grad_norm": 0.036376953125, + "learning_rate": 0.0027749897808154954, + "loss": 0.7978, + "num_input_tokens_seen": 64160400, + "step": 110505 + }, + { + "epoch": 16.459636580280012, + "grad_norm": 0.044189453125, + "learning_rate": 0.002773860138232903, + "loss": 0.8042, + "num_input_tokens_seen": 64163088, + "step": 110510 + }, + { + "epoch": 16.460381292820973, + "grad_norm": 0.04052734375, + "learning_rate": 0.0027727307021985143, + "loss": 0.7892, + "num_input_tokens_seen": 64166000, + "step": 110515 + }, + { + "epoch": 16.46112600536193, + "grad_norm": 0.042724609375, + "learning_rate": 0.002771601472731417, + "loss": 0.7954, + "num_input_tokens_seen": 64169040, + "step": 110520 + }, + { + "epoch": 16.46187071790289, + "grad_norm": 0.050537109375, + "learning_rate": 0.0027704724498506844, + "loss": 0.838, + "num_input_tokens_seen": 64171696, + "step": 110525 + }, + { + "epoch": 16.462615430443847, + "grad_norm": 0.046142578125, + "learning_rate": 0.0027693436335753897, + "loss": 0.816, + "num_input_tokens_seen": 64174672, + "step": 110530 + }, + { + "epoch": 16.463360142984808, + "grad_norm": 0.0673828125, + "learning_rate": 0.0027682150239246037, + "loss": 0.7871, + "num_input_tokens_seen": 64177488, + "step": 110535 + }, + { + "epoch": 16.46410485552577, + "grad_norm": 0.041015625, + "learning_rate": 0.0027670866209173906, + "loss": 0.808, + "num_input_tokens_seen": 64180592, + "step": 110540 + }, + { + "epoch": 16.464849568066725, + "grad_norm": 0.0478515625, + "learning_rate": 0.002765958424572819, + "loss": 0.7959, + "num_input_tokens_seen": 64183376, + "step": 110545 + }, + { + "epoch": 16.465594280607686, + "grad_norm": 0.05419921875, + "learning_rate": 0.002764830434909941, + "loss": 0.7891, + "num_input_tokens_seen": 64186160, + "step": 110550 + }, + { + "epoch": 16.466338993148646, + "grad_norm": 0.07763671875, + "learning_rate": 0.002763702651947823, + "loss": 0.7801, + "num_input_tokens_seen": 64188976, + "step": 110555 + }, + { + "epoch": 16.467083705689603, + "grad_norm": 0.0634765625, + "learning_rate": 0.0027625750757055076, + "loss": 0.7718, + "num_input_tokens_seen": 64192368, + "step": 110560 + }, + { + "epoch": 16.467828418230564, + "grad_norm": 0.0390625, + "learning_rate": 0.002761447706202052, + "loss": 0.7946, + "num_input_tokens_seen": 64194960, + "step": 110565 + }, + { + "epoch": 16.46857313077152, + "grad_norm": 0.050537109375, + "learning_rate": 0.0027603205434564987, + "loss": 0.8086, + "num_input_tokens_seen": 64197968, + "step": 110570 + }, + { + "epoch": 16.46931784331248, + "grad_norm": 0.1064453125, + "learning_rate": 0.0027591935874878874, + "loss": 0.7985, + "num_input_tokens_seen": 64200720, + "step": 110575 + }, + { + "epoch": 16.470062555853442, + "grad_norm": 0.044189453125, + "learning_rate": 0.002758066838315262, + "loss": 0.8074, + "num_input_tokens_seen": 64203568, + "step": 110580 + }, + { + "epoch": 16.4708072683944, + "grad_norm": 0.06298828125, + "learning_rate": 0.0027569402959576527, + "loss": 0.799, + "num_input_tokens_seen": 64206448, + "step": 110585 + }, + { + "epoch": 16.47155198093536, + "grad_norm": 0.048583984375, + "learning_rate": 0.002755813960434097, + "loss": 0.794, + "num_input_tokens_seen": 64209488, + "step": 110590 + }, + { + "epoch": 16.472296693476316, + "grad_norm": 0.033447265625, + "learning_rate": 0.002754687831763619, + "loss": 0.8067, + "num_input_tokens_seen": 64212432, + "step": 110595 + }, + { + "epoch": 16.473041406017277, + "grad_norm": 0.054443359375, + "learning_rate": 0.002753561909965246, + "loss": 0.8031, + "num_input_tokens_seen": 64215568, + "step": 110600 + }, + { + "epoch": 16.473786118558237, + "grad_norm": 0.0556640625, + "learning_rate": 0.0027524361950579967, + "loss": 0.7965, + "num_input_tokens_seen": 64218512, + "step": 110605 + }, + { + "epoch": 16.474530831099194, + "grad_norm": 0.04736328125, + "learning_rate": 0.0027513106870608863, + "loss": 0.7992, + "num_input_tokens_seen": 64221520, + "step": 110610 + }, + { + "epoch": 16.475275543640155, + "grad_norm": 0.06640625, + "learning_rate": 0.002750185385992937, + "loss": 0.8127, + "num_input_tokens_seen": 64224176, + "step": 110615 + }, + { + "epoch": 16.476020256181116, + "grad_norm": 0.054931640625, + "learning_rate": 0.0027490602918731526, + "loss": 0.8087, + "num_input_tokens_seen": 64226896, + "step": 110620 + }, + { + "epoch": 16.476764968722073, + "grad_norm": 0.057861328125, + "learning_rate": 0.0027479354047205465, + "loss": 0.7784, + "num_input_tokens_seen": 64229904, + "step": 110625 + }, + { + "epoch": 16.477509681263033, + "grad_norm": 0.07080078125, + "learning_rate": 0.002746810724554114, + "loss": 0.7731, + "num_input_tokens_seen": 64232816, + "step": 110630 + }, + { + "epoch": 16.47825439380399, + "grad_norm": 0.0595703125, + "learning_rate": 0.002745686251392866, + "loss": 0.8007, + "num_input_tokens_seen": 64236208, + "step": 110635 + }, + { + "epoch": 16.47899910634495, + "grad_norm": 0.02978515625, + "learning_rate": 0.0027445619852557905, + "loss": 0.8, + "num_input_tokens_seen": 64239344, + "step": 110640 + }, + { + "epoch": 16.47974381888591, + "grad_norm": 0.035400390625, + "learning_rate": 0.0027434379261618877, + "loss": 0.7919, + "num_input_tokens_seen": 64242096, + "step": 110645 + }, + { + "epoch": 16.480488531426868, + "grad_norm": 0.039306640625, + "learning_rate": 0.0027423140741301443, + "loss": 0.7932, + "num_input_tokens_seen": 64244880, + "step": 110650 + }, + { + "epoch": 16.48123324396783, + "grad_norm": 0.0576171875, + "learning_rate": 0.0027411904291795414, + "loss": 0.7819, + "num_input_tokens_seen": 64247536, + "step": 110655 + }, + { + "epoch": 16.48197795650879, + "grad_norm": 0.04345703125, + "learning_rate": 0.002740066991329072, + "loss": 0.8019, + "num_input_tokens_seen": 64250736, + "step": 110660 + }, + { + "epoch": 16.482722669049746, + "grad_norm": 0.04541015625, + "learning_rate": 0.0027389437605977096, + "loss": 0.8037, + "num_input_tokens_seen": 64253648, + "step": 110665 + }, + { + "epoch": 16.483467381590707, + "grad_norm": 0.0419921875, + "learning_rate": 0.002737820737004432, + "loss": 0.7746, + "num_input_tokens_seen": 64256368, + "step": 110670 + }, + { + "epoch": 16.484212094131664, + "grad_norm": 0.052490234375, + "learning_rate": 0.0027366979205682043, + "loss": 0.7995, + "num_input_tokens_seen": 64259280, + "step": 110675 + }, + { + "epoch": 16.484956806672624, + "grad_norm": 0.059814453125, + "learning_rate": 0.002735575311308008, + "loss": 0.7961, + "num_input_tokens_seen": 64262160, + "step": 110680 + }, + { + "epoch": 16.485701519213585, + "grad_norm": 0.041015625, + "learning_rate": 0.0027344529092428, + "loss": 0.7906, + "num_input_tokens_seen": 64265040, + "step": 110685 + }, + { + "epoch": 16.48644623175454, + "grad_norm": 0.041259765625, + "learning_rate": 0.0027333307143915407, + "loss": 0.8007, + "num_input_tokens_seen": 64267952, + "step": 110690 + }, + { + "epoch": 16.487190944295502, + "grad_norm": 0.08154296875, + "learning_rate": 0.0027322087267731957, + "loss": 0.7835, + "num_input_tokens_seen": 64271184, + "step": 110695 + }, + { + "epoch": 16.487935656836463, + "grad_norm": 0.171875, + "learning_rate": 0.0027310869464067114, + "loss": 0.8009, + "num_input_tokens_seen": 64273904, + "step": 110700 + }, + { + "epoch": 16.48868036937742, + "grad_norm": 0.30859375, + "learning_rate": 0.002729965373311048, + "loss": 0.8144, + "num_input_tokens_seen": 64276880, + "step": 110705 + }, + { + "epoch": 16.48942508191838, + "grad_norm": 0.08984375, + "learning_rate": 0.0027288440075051443, + "loss": 0.7879, + "num_input_tokens_seen": 64279984, + "step": 110710 + }, + { + "epoch": 16.490169794459337, + "grad_norm": 0.09912109375, + "learning_rate": 0.002727722849007953, + "loss": 0.7796, + "num_input_tokens_seen": 64283056, + "step": 110715 + }, + { + "epoch": 16.490914507000298, + "grad_norm": 0.034912109375, + "learning_rate": 0.002726601897838408, + "loss": 0.7917, + "num_input_tokens_seen": 64285680, + "step": 110720 + }, + { + "epoch": 16.49165921954126, + "grad_norm": 0.03564453125, + "learning_rate": 0.0027254811540154533, + "loss": 0.8083, + "num_input_tokens_seen": 64288272, + "step": 110725 + }, + { + "epoch": 16.492403932082215, + "grad_norm": 0.0986328125, + "learning_rate": 0.00272436061755802, + "loss": 0.8047, + "num_input_tokens_seen": 64290928, + "step": 110730 + }, + { + "epoch": 16.493148644623176, + "grad_norm": 0.0966796875, + "learning_rate": 0.0027232402884850375, + "loss": 0.7992, + "num_input_tokens_seen": 64293872, + "step": 110735 + }, + { + "epoch": 16.493893357164133, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0027221201668154314, + "loss": 0.7898, + "num_input_tokens_seen": 64296944, + "step": 110740 + }, + { + "epoch": 16.494638069705093, + "grad_norm": 0.1484375, + "learning_rate": 0.0027210002525681247, + "loss": 0.798, + "num_input_tokens_seen": 64299856, + "step": 110745 + }, + { + "epoch": 16.495382782246054, + "grad_norm": 0.044189453125, + "learning_rate": 0.0027198805457620406, + "loss": 0.8002, + "num_input_tokens_seen": 64302768, + "step": 110750 + }, + { + "epoch": 16.49612749478701, + "grad_norm": 0.06298828125, + "learning_rate": 0.002718761046416092, + "loss": 0.8183, + "num_input_tokens_seen": 64305936, + "step": 110755 + }, + { + "epoch": 16.49687220732797, + "grad_norm": 0.037841796875, + "learning_rate": 0.002717641754549196, + "loss": 0.7981, + "num_input_tokens_seen": 64308880, + "step": 110760 + }, + { + "epoch": 16.497616919868932, + "grad_norm": 0.048828125, + "learning_rate": 0.0027165226701802584, + "loss": 0.7807, + "num_input_tokens_seen": 64311952, + "step": 110765 + }, + { + "epoch": 16.49836163240989, + "grad_norm": 0.06787109375, + "learning_rate": 0.0027154037933281847, + "loss": 0.7744, + "num_input_tokens_seen": 64314672, + "step": 110770 + }, + { + "epoch": 16.49910634495085, + "grad_norm": 0.08544921875, + "learning_rate": 0.0027142851240118796, + "loss": 0.8289, + "num_input_tokens_seen": 64317552, + "step": 110775 + }, + { + "epoch": 16.499851057491806, + "grad_norm": 0.0380859375, + "learning_rate": 0.002713166662250238, + "loss": 0.8034, + "num_input_tokens_seen": 64320464, + "step": 110780 + }, + { + "epoch": 16.500595770032767, + "grad_norm": 0.035888671875, + "learning_rate": 0.002712048408062161, + "loss": 0.7822, + "num_input_tokens_seen": 64323472, + "step": 110785 + }, + { + "epoch": 16.501340482573728, + "grad_norm": 0.044921875, + "learning_rate": 0.0027109303614665397, + "loss": 0.808, + "num_input_tokens_seen": 64326224, + "step": 110790 + }, + { + "epoch": 16.502085195114685, + "grad_norm": 0.04150390625, + "learning_rate": 0.002709812522482258, + "loss": 0.7863, + "num_input_tokens_seen": 64329104, + "step": 110795 + }, + { + "epoch": 16.502829907655645, + "grad_norm": 0.099609375, + "learning_rate": 0.0027086948911281988, + "loss": 0.8171, + "num_input_tokens_seen": 64332176, + "step": 110800 + }, + { + "epoch": 16.503574620196606, + "grad_norm": 0.07080078125, + "learning_rate": 0.0027075774674232512, + "loss": 0.8545, + "num_input_tokens_seen": 64335184, + "step": 110805 + }, + { + "epoch": 16.504319332737563, + "grad_norm": 0.046142578125, + "learning_rate": 0.0027064602513862876, + "loss": 0.8064, + "num_input_tokens_seen": 64338032, + "step": 110810 + }, + { + "epoch": 16.505064045278523, + "grad_norm": 0.04052734375, + "learning_rate": 0.0027053432430361804, + "loss": 0.7831, + "num_input_tokens_seen": 64340784, + "step": 110815 + }, + { + "epoch": 16.50580875781948, + "grad_norm": 0.053955078125, + "learning_rate": 0.0027042264423918078, + "loss": 0.7943, + "num_input_tokens_seen": 64343664, + "step": 110820 + }, + { + "epoch": 16.50655347036044, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0027031098494720287, + "loss": 0.7973, + "num_input_tokens_seen": 64346672, + "step": 110825 + }, + { + "epoch": 16.5072981829014, + "grad_norm": 0.078125, + "learning_rate": 0.0027019934642957145, + "loss": 0.7954, + "num_input_tokens_seen": 64349616, + "step": 110830 + }, + { + "epoch": 16.508042895442358, + "grad_norm": 0.030029296875, + "learning_rate": 0.002700877286881717, + "loss": 0.7778, + "num_input_tokens_seen": 64352816, + "step": 110835 + }, + { + "epoch": 16.50878760798332, + "grad_norm": 0.08935546875, + "learning_rate": 0.002699761317248902, + "loss": 0.799, + "num_input_tokens_seen": 64355856, + "step": 110840 + }, + { + "epoch": 16.50953232052428, + "grad_norm": 0.040771484375, + "learning_rate": 0.0026986455554161176, + "loss": 0.8527, + "num_input_tokens_seen": 64359120, + "step": 110845 + }, + { + "epoch": 16.510277033065236, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00269753000140221, + "loss": 0.8119, + "num_input_tokens_seen": 64361616, + "step": 110850 + }, + { + "epoch": 16.511021745606197, + "grad_norm": 0.042236328125, + "learning_rate": 0.002696414655226033, + "loss": 0.8127, + "num_input_tokens_seen": 64364624, + "step": 110855 + }, + { + "epoch": 16.511766458147154, + "grad_norm": 0.03515625, + "learning_rate": 0.0026952995169064265, + "loss": 0.8157, + "num_input_tokens_seen": 64367472, + "step": 110860 + }, + { + "epoch": 16.512511170688114, + "grad_norm": 0.0439453125, + "learning_rate": 0.00269418458646223, + "loss": 0.8028, + "num_input_tokens_seen": 64370224, + "step": 110865 + }, + { + "epoch": 16.513255883229075, + "grad_norm": 0.03955078125, + "learning_rate": 0.0026930698639122718, + "loss": 0.7843, + "num_input_tokens_seen": 64373008, + "step": 110870 + }, + { + "epoch": 16.51400059577003, + "grad_norm": 0.07763671875, + "learning_rate": 0.002691955349275394, + "loss": 0.769, + "num_input_tokens_seen": 64375664, + "step": 110875 + }, + { + "epoch": 16.514745308310992, + "grad_norm": 0.04638671875, + "learning_rate": 0.0026908410425704175, + "loss": 0.7911, + "num_input_tokens_seen": 64378352, + "step": 110880 + }, + { + "epoch": 16.515490020851953, + "grad_norm": 0.0498046875, + "learning_rate": 0.002689726943816176, + "loss": 0.8035, + "num_input_tokens_seen": 64381200, + "step": 110885 + }, + { + "epoch": 16.51623473339291, + "grad_norm": 0.03759765625, + "learning_rate": 0.002688613053031485, + "loss": 0.7754, + "num_input_tokens_seen": 64384304, + "step": 110890 + }, + { + "epoch": 16.51697944593387, + "grad_norm": 0.06787109375, + "learning_rate": 0.002687499370235162, + "loss": 0.7814, + "num_input_tokens_seen": 64387024, + "step": 110895 + }, + { + "epoch": 16.517724158474827, + "grad_norm": 0.058837890625, + "learning_rate": 0.002686385895446025, + "loss": 0.7863, + "num_input_tokens_seen": 64389904, + "step": 110900 + }, + { + "epoch": 16.518468871015788, + "grad_norm": 0.037109375, + "learning_rate": 0.00268527262868288, + "loss": 0.7908, + "num_input_tokens_seen": 64392880, + "step": 110905 + }, + { + "epoch": 16.51921358355675, + "grad_norm": 0.060546875, + "learning_rate": 0.002684159569964541, + "loss": 0.81, + "num_input_tokens_seen": 64395632, + "step": 110910 + }, + { + "epoch": 16.519958296097705, + "grad_norm": 0.058349609375, + "learning_rate": 0.0026830467193098067, + "loss": 0.8073, + "num_input_tokens_seen": 64398608, + "step": 110915 + }, + { + "epoch": 16.520703008638666, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0026819340767374822, + "loss": 0.7877, + "num_input_tokens_seen": 64401264, + "step": 110920 + }, + { + "epoch": 16.521447721179626, + "grad_norm": 0.043212890625, + "learning_rate": 0.002680821642266362, + "loss": 0.7872, + "num_input_tokens_seen": 64404304, + "step": 110925 + }, + { + "epoch": 16.522192433720583, + "grad_norm": 0.040283203125, + "learning_rate": 0.00267970941591524, + "loss": 0.8071, + "num_input_tokens_seen": 64407248, + "step": 110930 + }, + { + "epoch": 16.522937146261544, + "grad_norm": 0.061767578125, + "learning_rate": 0.002678597397702904, + "loss": 0.7996, + "num_input_tokens_seen": 64410256, + "step": 110935 + }, + { + "epoch": 16.5236818588025, + "grad_norm": 0.043701171875, + "learning_rate": 0.0026774855876481414, + "loss": 0.7935, + "num_input_tokens_seen": 64413200, + "step": 110940 + }, + { + "epoch": 16.52442657134346, + "grad_norm": 0.054443359375, + "learning_rate": 0.0026763739857697377, + "loss": 0.8123, + "num_input_tokens_seen": 64415984, + "step": 110945 + }, + { + "epoch": 16.525171283884422, + "grad_norm": 0.03955078125, + "learning_rate": 0.0026752625920864652, + "loss": 0.8018, + "num_input_tokens_seen": 64418800, + "step": 110950 + }, + { + "epoch": 16.52591599642538, + "grad_norm": 0.0869140625, + "learning_rate": 0.0026741514066171107, + "loss": 0.8208, + "num_input_tokens_seen": 64421552, + "step": 110955 + }, + { + "epoch": 16.52666070896634, + "grad_norm": 0.0361328125, + "learning_rate": 0.0026730404293804372, + "loss": 0.7803, + "num_input_tokens_seen": 64424400, + "step": 110960 + }, + { + "epoch": 16.527405421507297, + "grad_norm": 0.05322265625, + "learning_rate": 0.00267192966039522, + "loss": 0.7806, + "num_input_tokens_seen": 64427216, + "step": 110965 + }, + { + "epoch": 16.528150134048257, + "grad_norm": 0.02783203125, + "learning_rate": 0.00267081909968022, + "loss": 0.7984, + "num_input_tokens_seen": 64430032, + "step": 110970 + }, + { + "epoch": 16.528894846589218, + "grad_norm": 0.06689453125, + "learning_rate": 0.0026697087472541997, + "loss": 0.7954, + "num_input_tokens_seen": 64432976, + "step": 110975 + }, + { + "epoch": 16.529639559130175, + "grad_norm": 0.048828125, + "learning_rate": 0.0026685986031359204, + "loss": 0.7821, + "num_input_tokens_seen": 64435664, + "step": 110980 + }, + { + "epoch": 16.530384271671135, + "grad_norm": 0.03759765625, + "learning_rate": 0.0026674886673441305, + "loss": 0.8071, + "num_input_tokens_seen": 64438352, + "step": 110985 + }, + { + "epoch": 16.531128984212096, + "grad_norm": 0.032470703125, + "learning_rate": 0.00266637893989759, + "loss": 0.7885, + "num_input_tokens_seen": 64441360, + "step": 110990 + }, + { + "epoch": 16.531873696753053, + "grad_norm": 0.064453125, + "learning_rate": 0.0026652694208150404, + "loss": 0.7995, + "num_input_tokens_seen": 64444528, + "step": 110995 + }, + { + "epoch": 16.532618409294013, + "grad_norm": 0.04248046875, + "learning_rate": 0.002664160110115229, + "loss": 0.7829, + "num_input_tokens_seen": 64447440, + "step": 111000 + }, + { + "epoch": 16.53336312183497, + "grad_norm": 0.051025390625, + "learning_rate": 0.0026630510078168917, + "loss": 0.7777, + "num_input_tokens_seen": 64450352, + "step": 111005 + }, + { + "epoch": 16.53410783437593, + "grad_norm": 0.1943359375, + "learning_rate": 0.0026619421139387704, + "loss": 0.8003, + "num_input_tokens_seen": 64453072, + "step": 111010 + }, + { + "epoch": 16.53485254691689, + "grad_norm": 0.06689453125, + "learning_rate": 0.0026608334284995987, + "loss": 0.7947, + "num_input_tokens_seen": 64456240, + "step": 111015 + }, + { + "epoch": 16.535597259457848, + "grad_norm": 0.0654296875, + "learning_rate": 0.0026597249515181013, + "loss": 0.8248, + "num_input_tokens_seen": 64459088, + "step": 111020 + }, + { + "epoch": 16.53634197199881, + "grad_norm": 0.03515625, + "learning_rate": 0.0026586166830130137, + "loss": 0.7724, + "num_input_tokens_seen": 64461776, + "step": 111025 + }, + { + "epoch": 16.53708668453977, + "grad_norm": 0.052001953125, + "learning_rate": 0.0026575086230030487, + "loss": 0.8075, + "num_input_tokens_seen": 64464624, + "step": 111030 + }, + { + "epoch": 16.537831397080726, + "grad_norm": 0.0791015625, + "learning_rate": 0.002656400771506935, + "loss": 0.7975, + "num_input_tokens_seen": 64467280, + "step": 111035 + }, + { + "epoch": 16.538576109621687, + "grad_norm": 0.047119140625, + "learning_rate": 0.002655293128543382, + "loss": 0.7903, + "num_input_tokens_seen": 64470032, + "step": 111040 + }, + { + "epoch": 16.539320822162644, + "grad_norm": 0.03271484375, + "learning_rate": 0.0026541856941311075, + "loss": 0.7937, + "num_input_tokens_seen": 64473104, + "step": 111045 + }, + { + "epoch": 16.540065534703604, + "grad_norm": 0.052734375, + "learning_rate": 0.0026530784682888193, + "loss": 0.7905, + "num_input_tokens_seen": 64475824, + "step": 111050 + }, + { + "epoch": 16.540810247244565, + "grad_norm": 0.0595703125, + "learning_rate": 0.002651971451035217, + "loss": 0.7876, + "num_input_tokens_seen": 64478768, + "step": 111055 + }, + { + "epoch": 16.541554959785522, + "grad_norm": 0.043212890625, + "learning_rate": 0.0026508646423890115, + "loss": 0.7997, + "num_input_tokens_seen": 64481616, + "step": 111060 + }, + { + "epoch": 16.542299672326482, + "grad_norm": 0.0458984375, + "learning_rate": 0.002649758042368895, + "loss": 0.8173, + "num_input_tokens_seen": 64484528, + "step": 111065 + }, + { + "epoch": 16.543044384867443, + "grad_norm": 0.04736328125, + "learning_rate": 0.0026486516509935656, + "loss": 0.7939, + "num_input_tokens_seen": 64487664, + "step": 111070 + }, + { + "epoch": 16.5437890974084, + "grad_norm": 0.05322265625, + "learning_rate": 0.0026475454682817093, + "loss": 0.7919, + "num_input_tokens_seen": 64490544, + "step": 111075 + }, + { + "epoch": 16.54453380994936, + "grad_norm": 0.055419921875, + "learning_rate": 0.0026464394942520213, + "loss": 0.8127, + "num_input_tokens_seen": 64493872, + "step": 111080 + }, + { + "epoch": 16.545278522490317, + "grad_norm": 0.05810546875, + "learning_rate": 0.0026453337289231798, + "loss": 0.7889, + "num_input_tokens_seen": 64496880, + "step": 111085 + }, + { + "epoch": 16.546023235031278, + "grad_norm": 0.0341796875, + "learning_rate": 0.00264422817231387, + "loss": 0.7949, + "num_input_tokens_seen": 64499536, + "step": 111090 + }, + { + "epoch": 16.54676794757224, + "grad_norm": 0.052978515625, + "learning_rate": 0.0026431228244427683, + "loss": 0.7787, + "num_input_tokens_seen": 64502416, + "step": 111095 + }, + { + "epoch": 16.547512660113195, + "grad_norm": 0.040283203125, + "learning_rate": 0.002642017685328545, + "loss": 0.798, + "num_input_tokens_seen": 64505264, + "step": 111100 + }, + { + "epoch": 16.548257372654156, + "grad_norm": 0.041748046875, + "learning_rate": 0.0026409127549898758, + "loss": 0.8035, + "num_input_tokens_seen": 64508272, + "step": 111105 + }, + { + "epoch": 16.549002085195113, + "grad_norm": 0.03759765625, + "learning_rate": 0.0026398080334454196, + "loss": 0.7987, + "num_input_tokens_seen": 64511120, + "step": 111110 + }, + { + "epoch": 16.549746797736073, + "grad_norm": 0.0732421875, + "learning_rate": 0.0026387035207138493, + "loss": 0.7907, + "num_input_tokens_seen": 64514608, + "step": 111115 + }, + { + "epoch": 16.550491510277034, + "grad_norm": 0.0625, + "learning_rate": 0.002637599216813819, + "loss": 0.8157, + "num_input_tokens_seen": 64517424, + "step": 111120 + }, + { + "epoch": 16.55123622281799, + "grad_norm": 0.1201171875, + "learning_rate": 0.002636495121763986, + "loss": 0.8003, + "num_input_tokens_seen": 64520304, + "step": 111125 + }, + { + "epoch": 16.55198093535895, + "grad_norm": 0.042236328125, + "learning_rate": 0.0026353912355829966, + "loss": 0.7871, + "num_input_tokens_seen": 64522832, + "step": 111130 + }, + { + "epoch": 16.552725647899912, + "grad_norm": 0.08935546875, + "learning_rate": 0.0026342875582895113, + "loss": 0.8112, + "num_input_tokens_seen": 64525744, + "step": 111135 + }, + { + "epoch": 16.55347036044087, + "grad_norm": 0.040771484375, + "learning_rate": 0.0026331840899021684, + "loss": 0.7934, + "num_input_tokens_seen": 64528688, + "step": 111140 + }, + { + "epoch": 16.55421507298183, + "grad_norm": 0.06396484375, + "learning_rate": 0.002632080830439608, + "loss": 0.823, + "num_input_tokens_seen": 64531632, + "step": 111145 + }, + { + "epoch": 16.554959785522787, + "grad_norm": 0.05810546875, + "learning_rate": 0.002630977779920475, + "loss": 0.8059, + "num_input_tokens_seen": 64534384, + "step": 111150 + }, + { + "epoch": 16.555704498063747, + "grad_norm": 0.051513671875, + "learning_rate": 0.002629874938363398, + "loss": 0.8349, + "num_input_tokens_seen": 64537008, + "step": 111155 + }, + { + "epoch": 16.556449210604708, + "grad_norm": 0.034912109375, + "learning_rate": 0.002628772305787016, + "loss": 0.7932, + "num_input_tokens_seen": 64539440, + "step": 111160 + }, + { + "epoch": 16.557193923145665, + "grad_norm": 0.0286865234375, + "learning_rate": 0.002627669882209948, + "loss": 0.786, + "num_input_tokens_seen": 64542224, + "step": 111165 + }, + { + "epoch": 16.557938635686625, + "grad_norm": 0.0478515625, + "learning_rate": 0.0026265676676508246, + "loss": 0.8126, + "num_input_tokens_seen": 64545168, + "step": 111170 + }, + { + "epoch": 16.558683348227586, + "grad_norm": 0.050537109375, + "learning_rate": 0.002625465662128268, + "loss": 0.7975, + "num_input_tokens_seen": 64548272, + "step": 111175 + }, + { + "epoch": 16.559428060768543, + "grad_norm": 0.06591796875, + "learning_rate": 0.0026243638656608856, + "loss": 0.7874, + "num_input_tokens_seen": 64550864, + "step": 111180 + }, + { + "epoch": 16.560172773309503, + "grad_norm": 0.030029296875, + "learning_rate": 0.002623262278267302, + "loss": 0.7912, + "num_input_tokens_seen": 64553904, + "step": 111185 + }, + { + "epoch": 16.56091748585046, + "grad_norm": 0.03759765625, + "learning_rate": 0.0026221608999661234, + "loss": 0.7904, + "num_input_tokens_seen": 64557072, + "step": 111190 + }, + { + "epoch": 16.56166219839142, + "grad_norm": 0.05517578125, + "learning_rate": 0.0026210597307759565, + "loss": 0.8017, + "num_input_tokens_seen": 64560176, + "step": 111195 + }, + { + "epoch": 16.56240691093238, + "grad_norm": 0.057861328125, + "learning_rate": 0.0026199587707154, + "loss": 0.8044, + "num_input_tokens_seen": 64562736, + "step": 111200 + }, + { + "epoch": 16.56315162347334, + "grad_norm": 0.0380859375, + "learning_rate": 0.002618858019803062, + "loss": 0.8091, + "num_input_tokens_seen": 64565328, + "step": 111205 + }, + { + "epoch": 16.5638963360143, + "grad_norm": 0.05078125, + "learning_rate": 0.002617757478057533, + "loss": 0.8012, + "num_input_tokens_seen": 64568176, + "step": 111210 + }, + { + "epoch": 16.56464104855526, + "grad_norm": 0.05419921875, + "learning_rate": 0.002616657145497403, + "loss": 0.7868, + "num_input_tokens_seen": 64571056, + "step": 111215 + }, + { + "epoch": 16.565385761096216, + "grad_norm": 0.03271484375, + "learning_rate": 0.0026155570221412685, + "loss": 0.7919, + "num_input_tokens_seen": 64573904, + "step": 111220 + }, + { + "epoch": 16.566130473637177, + "grad_norm": 0.0751953125, + "learning_rate": 0.002614457108007707, + "loss": 0.8202, + "num_input_tokens_seen": 64576656, + "step": 111225 + }, + { + "epoch": 16.566875186178134, + "grad_norm": 0.060791015625, + "learning_rate": 0.00261335740311531, + "loss": 0.799, + "num_input_tokens_seen": 64579376, + "step": 111230 + }, + { + "epoch": 16.567619898719094, + "grad_norm": 0.06396484375, + "learning_rate": 0.0026122579074826457, + "loss": 0.7997, + "num_input_tokens_seen": 64582448, + "step": 111235 + }, + { + "epoch": 16.568364611260055, + "grad_norm": 0.031494140625, + "learning_rate": 0.002611158621128296, + "loss": 0.7913, + "num_input_tokens_seen": 64585424, + "step": 111240 + }, + { + "epoch": 16.569109323801012, + "grad_norm": 0.037353515625, + "learning_rate": 0.002610059544070828, + "loss": 0.8286, + "num_input_tokens_seen": 64588240, + "step": 111245 + }, + { + "epoch": 16.569854036341972, + "grad_norm": 0.036865234375, + "learning_rate": 0.0026089606763288146, + "loss": 0.81, + "num_input_tokens_seen": 64591024, + "step": 111250 + }, + { + "epoch": 16.57059874888293, + "grad_norm": 0.05615234375, + "learning_rate": 0.0026078620179208183, + "loss": 0.7902, + "num_input_tokens_seen": 64593872, + "step": 111255 + }, + { + "epoch": 16.57134346142389, + "grad_norm": 0.043212890625, + "learning_rate": 0.0026067635688653956, + "loss": 0.7953, + "num_input_tokens_seen": 64596528, + "step": 111260 + }, + { + "epoch": 16.57208817396485, + "grad_norm": 0.07958984375, + "learning_rate": 0.0026056653291811082, + "loss": 0.8009, + "num_input_tokens_seen": 64599216, + "step": 111265 + }, + { + "epoch": 16.572832886505807, + "grad_norm": 0.0341796875, + "learning_rate": 0.002604567298886505, + "loss": 0.8202, + "num_input_tokens_seen": 64601808, + "step": 111270 + }, + { + "epoch": 16.573577599046768, + "grad_norm": 0.057861328125, + "learning_rate": 0.0026034694780001423, + "loss": 0.7984, + "num_input_tokens_seen": 64604560, + "step": 111275 + }, + { + "epoch": 16.57432231158773, + "grad_norm": 0.0400390625, + "learning_rate": 0.0026023718665405585, + "loss": 0.7738, + "num_input_tokens_seen": 64607536, + "step": 111280 + }, + { + "epoch": 16.575067024128685, + "grad_norm": 0.07421875, + "learning_rate": 0.002601274464526306, + "loss": 0.7917, + "num_input_tokens_seen": 64610224, + "step": 111285 + }, + { + "epoch": 16.575811736669646, + "grad_norm": 0.041259765625, + "learning_rate": 0.00260017727197592, + "loss": 0.7945, + "num_input_tokens_seen": 64613168, + "step": 111290 + }, + { + "epoch": 16.576556449210603, + "grad_norm": 0.046142578125, + "learning_rate": 0.0025990802889079337, + "loss": 0.8234, + "num_input_tokens_seen": 64616176, + "step": 111295 + }, + { + "epoch": 16.577301161751564, + "grad_norm": 0.052978515625, + "learning_rate": 0.002597983515340884, + "loss": 0.7982, + "num_input_tokens_seen": 64619056, + "step": 111300 + }, + { + "epoch": 16.578045874292524, + "grad_norm": 0.042724609375, + "learning_rate": 0.0025968869512932964, + "loss": 0.8115, + "num_input_tokens_seen": 64622064, + "step": 111305 + }, + { + "epoch": 16.57879058683348, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0025957905967837003, + "loss": 0.8104, + "num_input_tokens_seen": 64625232, + "step": 111310 + }, + { + "epoch": 16.57953529937444, + "grad_norm": 0.047607421875, + "learning_rate": 0.0025946944518306114, + "loss": 0.7873, + "num_input_tokens_seen": 64628048, + "step": 111315 + }, + { + "epoch": 16.580280011915402, + "grad_norm": 0.07958984375, + "learning_rate": 0.0025935985164525547, + "loss": 0.8116, + "num_input_tokens_seen": 64630896, + "step": 111320 + }, + { + "epoch": 16.58102472445636, + "grad_norm": 0.03369140625, + "learning_rate": 0.0025925027906680423, + "loss": 0.7993, + "num_input_tokens_seen": 64633456, + "step": 111325 + }, + { + "epoch": 16.58176943699732, + "grad_norm": 0.0966796875, + "learning_rate": 0.0025914072744955847, + "loss": 0.7883, + "num_input_tokens_seen": 64636432, + "step": 111330 + }, + { + "epoch": 16.582514149538277, + "grad_norm": 0.04736328125, + "learning_rate": 0.0025903119679536895, + "loss": 0.8027, + "num_input_tokens_seen": 64639280, + "step": 111335 + }, + { + "epoch": 16.583258862079237, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0025892168710608576, + "loss": 0.7916, + "num_input_tokens_seen": 64642416, + "step": 111340 + }, + { + "epoch": 16.584003574620198, + "grad_norm": 0.05712890625, + "learning_rate": 0.0025881219838355973, + "loss": 0.7934, + "num_input_tokens_seen": 64646512, + "step": 111345 + }, + { + "epoch": 16.584748287161155, + "grad_norm": 0.0625, + "learning_rate": 0.002587027306296397, + "loss": 0.7926, + "num_input_tokens_seen": 64649232, + "step": 111350 + }, + { + "epoch": 16.585492999702115, + "grad_norm": 0.03564453125, + "learning_rate": 0.0025859328384617575, + "loss": 0.7786, + "num_input_tokens_seen": 64652208, + "step": 111355 + }, + { + "epoch": 16.586237712243076, + "grad_norm": 0.05029296875, + "learning_rate": 0.0025848385803501615, + "loss": 0.8246, + "num_input_tokens_seen": 64654896, + "step": 111360 + }, + { + "epoch": 16.586982424784033, + "grad_norm": 0.03564453125, + "learning_rate": 0.0025837445319801047, + "loss": 0.7996, + "num_input_tokens_seen": 64657776, + "step": 111365 + }, + { + "epoch": 16.587727137324993, + "grad_norm": 0.042236328125, + "learning_rate": 0.002582650693370064, + "loss": 0.784, + "num_input_tokens_seen": 64660784, + "step": 111370 + }, + { + "epoch": 16.58847184986595, + "grad_norm": 0.045166015625, + "learning_rate": 0.0025815570645385147, + "loss": 0.7804, + "num_input_tokens_seen": 64663760, + "step": 111375 + }, + { + "epoch": 16.58921656240691, + "grad_norm": 0.052001953125, + "learning_rate": 0.002580463645503942, + "loss": 0.8005, + "num_input_tokens_seen": 64666544, + "step": 111380 + }, + { + "epoch": 16.58996127494787, + "grad_norm": 0.049560546875, + "learning_rate": 0.0025793704362848105, + "loss": 0.7773, + "num_input_tokens_seen": 64669552, + "step": 111385 + }, + { + "epoch": 16.59070598748883, + "grad_norm": 0.03759765625, + "learning_rate": 0.0025782774368995945, + "loss": 0.8039, + "num_input_tokens_seen": 64672528, + "step": 111390 + }, + { + "epoch": 16.59145070002979, + "grad_norm": 0.048583984375, + "learning_rate": 0.002577184647366756, + "loss": 0.8117, + "num_input_tokens_seen": 64675440, + "step": 111395 + }, + { + "epoch": 16.59219541257075, + "grad_norm": 0.06298828125, + "learning_rate": 0.0025760920677047575, + "loss": 0.7954, + "num_input_tokens_seen": 64678448, + "step": 111400 + }, + { + "epoch": 16.592940125111706, + "grad_norm": 0.05419921875, + "learning_rate": 0.002574999697932052, + "loss": 0.8007, + "num_input_tokens_seen": 64681104, + "step": 111405 + }, + { + "epoch": 16.593684837652667, + "grad_norm": 0.0703125, + "learning_rate": 0.002573907538067103, + "loss": 0.8172, + "num_input_tokens_seen": 64683920, + "step": 111410 + }, + { + "epoch": 16.594429550193624, + "grad_norm": 0.037841796875, + "learning_rate": 0.0025728155881283563, + "loss": 0.8014, + "num_input_tokens_seen": 64686576, + "step": 111415 + }, + { + "epoch": 16.595174262734584, + "grad_norm": 0.05615234375, + "learning_rate": 0.0025717238481342553, + "loss": 0.7914, + "num_input_tokens_seen": 64689424, + "step": 111420 + }, + { + "epoch": 16.595918975275545, + "grad_norm": 0.062255859375, + "learning_rate": 0.0025706323181032523, + "loss": 0.785, + "num_input_tokens_seen": 64692432, + "step": 111425 + }, + { + "epoch": 16.596663687816502, + "grad_norm": 0.039306640625, + "learning_rate": 0.002569540998053782, + "loss": 0.7873, + "num_input_tokens_seen": 64695280, + "step": 111430 + }, + { + "epoch": 16.597408400357462, + "grad_norm": 0.04150390625, + "learning_rate": 0.002568449888004284, + "loss": 0.7955, + "num_input_tokens_seen": 64698128, + "step": 111435 + }, + { + "epoch": 16.598153112898423, + "grad_norm": 0.05712890625, + "learning_rate": 0.002567358987973188, + "loss": 0.7935, + "num_input_tokens_seen": 64700944, + "step": 111440 + }, + { + "epoch": 16.59889782543938, + "grad_norm": 0.054931640625, + "learning_rate": 0.0025662682979789293, + "loss": 0.7909, + "num_input_tokens_seen": 64703920, + "step": 111445 + }, + { + "epoch": 16.59964253798034, + "grad_norm": 0.0830078125, + "learning_rate": 0.0025651778180399297, + "loss": 0.7796, + "num_input_tokens_seen": 64706704, + "step": 111450 + }, + { + "epoch": 16.600387250521297, + "grad_norm": 0.060302734375, + "learning_rate": 0.002564087548174614, + "loss": 0.8074, + "num_input_tokens_seen": 64709584, + "step": 111455 + }, + { + "epoch": 16.601131963062258, + "grad_norm": 0.037353515625, + "learning_rate": 0.0025629974884013966, + "loss": 0.7708, + "num_input_tokens_seen": 64712336, + "step": 111460 + }, + { + "epoch": 16.60187667560322, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0025619076387386986, + "loss": 0.7988, + "num_input_tokens_seen": 64715120, + "step": 111465 + }, + { + "epoch": 16.602621388144176, + "grad_norm": 0.055419921875, + "learning_rate": 0.0025608179992049297, + "loss": 0.8031, + "num_input_tokens_seen": 64717936, + "step": 111470 + }, + { + "epoch": 16.603366100685136, + "grad_norm": 0.055908203125, + "learning_rate": 0.002559728569818495, + "loss": 0.7968, + "num_input_tokens_seen": 64721040, + "step": 111475 + }, + { + "epoch": 16.604110813226093, + "grad_norm": 0.025634765625, + "learning_rate": 0.0025586393505978053, + "loss": 0.7845, + "num_input_tokens_seen": 64723856, + "step": 111480 + }, + { + "epoch": 16.604855525767054, + "grad_norm": 0.06640625, + "learning_rate": 0.002557550341561256, + "loss": 0.7794, + "num_input_tokens_seen": 64726608, + "step": 111485 + }, + { + "epoch": 16.605600238308014, + "grad_norm": 0.0546875, + "learning_rate": 0.002556461542727252, + "loss": 0.8086, + "num_input_tokens_seen": 64729616, + "step": 111490 + }, + { + "epoch": 16.60634495084897, + "grad_norm": 0.050048828125, + "learning_rate": 0.0025553729541141815, + "loss": 0.7795, + "num_input_tokens_seen": 64732272, + "step": 111495 + }, + { + "epoch": 16.60708966338993, + "grad_norm": 0.083984375, + "learning_rate": 0.002554284575740434, + "loss": 0.801, + "num_input_tokens_seen": 64735056, + "step": 111500 + }, + { + "epoch": 16.607834375930892, + "grad_norm": 0.037353515625, + "learning_rate": 0.0025531964076244027, + "loss": 0.7787, + "num_input_tokens_seen": 64737776, + "step": 111505 + }, + { + "epoch": 16.60857908847185, + "grad_norm": 0.055419921875, + "learning_rate": 0.002552108449784465, + "loss": 0.7863, + "num_input_tokens_seen": 64740944, + "step": 111510 + }, + { + "epoch": 16.60932380101281, + "grad_norm": 0.09228515625, + "learning_rate": 0.002551020702239005, + "loss": 0.8019, + "num_input_tokens_seen": 64744176, + "step": 111515 + }, + { + "epoch": 16.610068513553767, + "grad_norm": 0.039306640625, + "learning_rate": 0.0025499331650063995, + "loss": 0.8009, + "num_input_tokens_seen": 64746960, + "step": 111520 + }, + { + "epoch": 16.610813226094727, + "grad_norm": 0.033203125, + "learning_rate": 0.0025488458381050195, + "loss": 0.8187, + "num_input_tokens_seen": 64749968, + "step": 111525 + }, + { + "epoch": 16.611557938635688, + "grad_norm": 0.037109375, + "learning_rate": 0.0025477587215532306, + "loss": 0.8018, + "num_input_tokens_seen": 64753168, + "step": 111530 + }, + { + "epoch": 16.612302651176645, + "grad_norm": 0.0576171875, + "learning_rate": 0.0025466718153694043, + "loss": 0.8005, + "num_input_tokens_seen": 64756240, + "step": 111535 + }, + { + "epoch": 16.613047363717605, + "grad_norm": 0.0810546875, + "learning_rate": 0.0025455851195719035, + "loss": 0.7644, + "num_input_tokens_seen": 64759056, + "step": 111540 + }, + { + "epoch": 16.613792076258566, + "grad_norm": 0.046630859375, + "learning_rate": 0.0025444986341790782, + "loss": 0.7907, + "num_input_tokens_seen": 64761904, + "step": 111545 + }, + { + "epoch": 16.614536788799523, + "grad_norm": 0.035888671875, + "learning_rate": 0.002543412359209293, + "loss": 0.7803, + "num_input_tokens_seen": 64764784, + "step": 111550 + }, + { + "epoch": 16.615281501340483, + "grad_norm": 0.058349609375, + "learning_rate": 0.0025423262946808927, + "loss": 0.7893, + "num_input_tokens_seen": 64767792, + "step": 111555 + }, + { + "epoch": 16.61602621388144, + "grad_norm": 0.054931640625, + "learning_rate": 0.0025412404406122318, + "loss": 0.8068, + "num_input_tokens_seen": 64770992, + "step": 111560 + }, + { + "epoch": 16.6167709264224, + "grad_norm": 0.06494140625, + "learning_rate": 0.0025401547970216477, + "loss": 0.7756, + "num_input_tokens_seen": 64773840, + "step": 111565 + }, + { + "epoch": 16.61751563896336, + "grad_norm": 0.051025390625, + "learning_rate": 0.0025390693639274885, + "loss": 0.7893, + "num_input_tokens_seen": 64776976, + "step": 111570 + }, + { + "epoch": 16.61826035150432, + "grad_norm": 0.11767578125, + "learning_rate": 0.002537984141348087, + "loss": 0.7844, + "num_input_tokens_seen": 64779760, + "step": 111575 + }, + { + "epoch": 16.61900506404528, + "grad_norm": 0.09228515625, + "learning_rate": 0.0025368991293017746, + "loss": 0.8328, + "num_input_tokens_seen": 64782832, + "step": 111580 + }, + { + "epoch": 16.61974977658624, + "grad_norm": 0.06787109375, + "learning_rate": 0.0025358143278068866, + "loss": 0.7766, + "num_input_tokens_seen": 64785840, + "step": 111585 + }, + { + "epoch": 16.620494489127196, + "grad_norm": 0.07470703125, + "learning_rate": 0.002534729736881749, + "loss": 0.7963, + "num_input_tokens_seen": 64788752, + "step": 111590 + }, + { + "epoch": 16.621239201668157, + "grad_norm": 0.04150390625, + "learning_rate": 0.0025336453565446833, + "loss": 0.831, + "num_input_tokens_seen": 64791664, + "step": 111595 + }, + { + "epoch": 16.621983914209114, + "grad_norm": 0.08984375, + "learning_rate": 0.0025325611868140044, + "loss": 0.7895, + "num_input_tokens_seen": 64794416, + "step": 111600 + }, + { + "epoch": 16.622728626750074, + "grad_norm": 0.056396484375, + "learning_rate": 0.002531477227708037, + "loss": 0.7922, + "num_input_tokens_seen": 64797616, + "step": 111605 + }, + { + "epoch": 16.623473339291035, + "grad_norm": 0.1318359375, + "learning_rate": 0.002530393479245086, + "loss": 0.8312, + "num_input_tokens_seen": 64800944, + "step": 111610 + }, + { + "epoch": 16.624218051831992, + "grad_norm": 0.03076171875, + "learning_rate": 0.0025293099414434683, + "loss": 0.7912, + "num_input_tokens_seen": 64803792, + "step": 111615 + }, + { + "epoch": 16.624962764372953, + "grad_norm": 0.031005859375, + "learning_rate": 0.0025282266143214827, + "loss": 0.7825, + "num_input_tokens_seen": 64806480, + "step": 111620 + }, + { + "epoch": 16.62570747691391, + "grad_norm": 0.0400390625, + "learning_rate": 0.0025271434978974294, + "loss": 0.7952, + "num_input_tokens_seen": 64809392, + "step": 111625 + }, + { + "epoch": 16.62645218945487, + "grad_norm": 0.1015625, + "learning_rate": 0.002526060592189614, + "loss": 0.7902, + "num_input_tokens_seen": 64813744, + "step": 111630 + }, + { + "epoch": 16.62719690199583, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0025249778972163232, + "loss": 0.8071, + "num_input_tokens_seen": 64816304, + "step": 111635 + }, + { + "epoch": 16.627941614536788, + "grad_norm": 0.06787109375, + "learning_rate": 0.0025238954129958562, + "loss": 0.7942, + "num_input_tokens_seen": 64818928, + "step": 111640 + }, + { + "epoch": 16.628686327077748, + "grad_norm": 0.046630859375, + "learning_rate": 0.002522813139546491, + "loss": 0.7871, + "num_input_tokens_seen": 64821808, + "step": 111645 + }, + { + "epoch": 16.62943103961871, + "grad_norm": 0.031982421875, + "learning_rate": 0.0025217310768865207, + "loss": 0.7913, + "num_input_tokens_seen": 64824688, + "step": 111650 + }, + { + "epoch": 16.630175752159666, + "grad_norm": 0.061767578125, + "learning_rate": 0.0025206492250342215, + "loss": 0.8246, + "num_input_tokens_seen": 64828016, + "step": 111655 + }, + { + "epoch": 16.630920464700626, + "grad_norm": 0.072265625, + "learning_rate": 0.0025195675840078706, + "loss": 0.7979, + "num_input_tokens_seen": 64830768, + "step": 111660 + }, + { + "epoch": 16.631665177241583, + "grad_norm": 0.059814453125, + "learning_rate": 0.0025184861538257402, + "loss": 0.7952, + "num_input_tokens_seen": 64833968, + "step": 111665 + }, + { + "epoch": 16.632409889782544, + "grad_norm": 0.053955078125, + "learning_rate": 0.002517404934506097, + "loss": 0.7833, + "num_input_tokens_seen": 64836624, + "step": 111670 + }, + { + "epoch": 16.633154602323504, + "grad_norm": 0.037353515625, + "learning_rate": 0.0025163239260672155, + "loss": 0.8013, + "num_input_tokens_seen": 64839312, + "step": 111675 + }, + { + "epoch": 16.63389931486446, + "grad_norm": 0.08740234375, + "learning_rate": 0.0025152431285273484, + "loss": 0.8074, + "num_input_tokens_seen": 64842192, + "step": 111680 + }, + { + "epoch": 16.63464402740542, + "grad_norm": 0.04443359375, + "learning_rate": 0.0025141625419047653, + "loss": 0.797, + "num_input_tokens_seen": 64845104, + "step": 111685 + }, + { + "epoch": 16.635388739946382, + "grad_norm": 0.06884765625, + "learning_rate": 0.002513082166217709, + "loss": 0.7843, + "num_input_tokens_seen": 64848304, + "step": 111690 + }, + { + "epoch": 16.63613345248734, + "grad_norm": 0.0576171875, + "learning_rate": 0.0025120020014844447, + "loss": 0.7898, + "num_input_tokens_seen": 64851344, + "step": 111695 + }, + { + "epoch": 16.6368781650283, + "grad_norm": 0.06494140625, + "learning_rate": 0.0025109220477232148, + "loss": 0.7845, + "num_input_tokens_seen": 64854224, + "step": 111700 + }, + { + "epoch": 16.637622877569257, + "grad_norm": 0.052001953125, + "learning_rate": 0.0025098423049522583, + "loss": 0.7958, + "num_input_tokens_seen": 64857072, + "step": 111705 + }, + { + "epoch": 16.638367590110217, + "grad_norm": 0.08056640625, + "learning_rate": 0.002508762773189826, + "loss": 0.7983, + "num_input_tokens_seen": 64860112, + "step": 111710 + }, + { + "epoch": 16.639112302651178, + "grad_norm": 0.10986328125, + "learning_rate": 0.0025076834524541463, + "loss": 0.7814, + "num_input_tokens_seen": 64863056, + "step": 111715 + }, + { + "epoch": 16.639857015192135, + "grad_norm": 0.046630859375, + "learning_rate": 0.002506604342763463, + "loss": 0.8015, + "num_input_tokens_seen": 64865936, + "step": 111720 + }, + { + "epoch": 16.640601727733095, + "grad_norm": 0.06201171875, + "learning_rate": 0.0025055254441360003, + "loss": 0.8026, + "num_input_tokens_seen": 64868464, + "step": 111725 + }, + { + "epoch": 16.641346440274056, + "grad_norm": 0.038818359375, + "learning_rate": 0.002504446756589988, + "loss": 0.7906, + "num_input_tokens_seen": 64871472, + "step": 111730 + }, + { + "epoch": 16.642091152815013, + "grad_norm": 0.08203125, + "learning_rate": 0.0025033682801436467, + "loss": 0.761, + "num_input_tokens_seen": 64874160, + "step": 111735 + }, + { + "epoch": 16.642835865355973, + "grad_norm": 0.053955078125, + "learning_rate": 0.0025022900148151937, + "loss": 0.7944, + "num_input_tokens_seen": 64877232, + "step": 111740 + }, + { + "epoch": 16.64358057789693, + "grad_norm": 0.091796875, + "learning_rate": 0.0025012119606228544, + "loss": 0.7734, + "num_input_tokens_seen": 64880016, + "step": 111745 + }, + { + "epoch": 16.64432529043789, + "grad_norm": 0.0859375, + "learning_rate": 0.002500134117584829, + "loss": 0.7962, + "num_input_tokens_seen": 64882768, + "step": 111750 + }, + { + "epoch": 16.64507000297885, + "grad_norm": 0.203125, + "learning_rate": 0.0024990564857193397, + "loss": 0.8273, + "num_input_tokens_seen": 64885936, + "step": 111755 + }, + { + "epoch": 16.64581471551981, + "grad_norm": 0.10595703125, + "learning_rate": 0.0024979790650445804, + "loss": 0.7768, + "num_input_tokens_seen": 64888656, + "step": 111760 + }, + { + "epoch": 16.64655942806077, + "grad_norm": 0.07666015625, + "learning_rate": 0.002496901855578763, + "loss": 0.8051, + "num_input_tokens_seen": 64891568, + "step": 111765 + }, + { + "epoch": 16.647304140601726, + "grad_norm": 0.07080078125, + "learning_rate": 0.0024958248573400763, + "loss": 0.8004, + "num_input_tokens_seen": 64894384, + "step": 111770 + }, + { + "epoch": 16.648048853142686, + "grad_norm": 0.08935546875, + "learning_rate": 0.0024947480703467243, + "loss": 0.79, + "num_input_tokens_seen": 64897296, + "step": 111775 + }, + { + "epoch": 16.648793565683647, + "grad_norm": 0.09033203125, + "learning_rate": 0.002493671494616894, + "loss": 0.8053, + "num_input_tokens_seen": 64899984, + "step": 111780 + }, + { + "epoch": 16.649538278224604, + "grad_norm": 0.051513671875, + "learning_rate": 0.0024925951301687727, + "loss": 0.7853, + "num_input_tokens_seen": 64902672, + "step": 111785 + }, + { + "epoch": 16.650282990765565, + "grad_norm": 0.057861328125, + "learning_rate": 0.002491518977020542, + "loss": 0.8044, + "num_input_tokens_seen": 64905328, + "step": 111790 + }, + { + "epoch": 16.651027703306525, + "grad_norm": 0.056396484375, + "learning_rate": 0.0024904430351903877, + "loss": 0.8155, + "num_input_tokens_seen": 64908080, + "step": 111795 + }, + { + "epoch": 16.651772415847482, + "grad_norm": 0.0634765625, + "learning_rate": 0.002489367304696485, + "loss": 0.7751, + "num_input_tokens_seen": 64910960, + "step": 111800 + }, + { + "epoch": 16.652517128388443, + "grad_norm": 0.06494140625, + "learning_rate": 0.002488291785557002, + "loss": 0.7941, + "num_input_tokens_seen": 64913776, + "step": 111805 + }, + { + "epoch": 16.6532618409294, + "grad_norm": 0.09814453125, + "learning_rate": 0.002487216477790116, + "loss": 0.7719, + "num_input_tokens_seen": 64916656, + "step": 111810 + }, + { + "epoch": 16.65400655347036, + "grad_norm": 0.0869140625, + "learning_rate": 0.002486141381413992, + "loss": 0.8136, + "num_input_tokens_seen": 64919440, + "step": 111815 + }, + { + "epoch": 16.65475126601132, + "grad_norm": 0.115234375, + "learning_rate": 0.0024850664964467866, + "loss": 0.7677, + "num_input_tokens_seen": 64922288, + "step": 111820 + }, + { + "epoch": 16.655495978552278, + "grad_norm": 0.123046875, + "learning_rate": 0.002483991822906665, + "loss": 0.8099, + "num_input_tokens_seen": 64925488, + "step": 111825 + }, + { + "epoch": 16.656240691093238, + "grad_norm": 0.08056640625, + "learning_rate": 0.0024829173608117805, + "loss": 0.7849, + "num_input_tokens_seen": 64928496, + "step": 111830 + }, + { + "epoch": 16.6569854036342, + "grad_norm": 0.26953125, + "learning_rate": 0.0024818431101802873, + "loss": 0.8284, + "num_input_tokens_seen": 64931312, + "step": 111835 + }, + { + "epoch": 16.657730116175156, + "grad_norm": 0.05908203125, + "learning_rate": 0.0024807690710303264, + "loss": 0.7708, + "num_input_tokens_seen": 64934448, + "step": 111840 + }, + { + "epoch": 16.658474828716116, + "grad_norm": 0.0673828125, + "learning_rate": 0.002479695243380055, + "loss": 0.8078, + "num_input_tokens_seen": 64937776, + "step": 111845 + }, + { + "epoch": 16.659219541257073, + "grad_norm": 0.091796875, + "learning_rate": 0.0024786216272476053, + "loss": 0.7796, + "num_input_tokens_seen": 64940912, + "step": 111850 + }, + { + "epoch": 16.659964253798034, + "grad_norm": 0.11279296875, + "learning_rate": 0.0024775482226511176, + "loss": 0.7596, + "num_input_tokens_seen": 64943696, + "step": 111855 + }, + { + "epoch": 16.660708966338994, + "grad_norm": 0.1142578125, + "learning_rate": 0.0024764750296087257, + "loss": 0.8412, + "num_input_tokens_seen": 64946608, + "step": 111860 + }, + { + "epoch": 16.66145367887995, + "grad_norm": 0.0478515625, + "learning_rate": 0.002475402048138555, + "loss": 0.7859, + "num_input_tokens_seen": 64949712, + "step": 111865 + }, + { + "epoch": 16.66219839142091, + "grad_norm": 0.09375, + "learning_rate": 0.0024743292782587415, + "loss": 0.8104, + "num_input_tokens_seen": 64952336, + "step": 111870 + }, + { + "epoch": 16.662943103961872, + "grad_norm": 0.033203125, + "learning_rate": 0.0024732567199874022, + "loss": 0.7981, + "num_input_tokens_seen": 64955568, + "step": 111875 + }, + { + "epoch": 16.66368781650283, + "grad_norm": 0.028076171875, + "learning_rate": 0.0024721843733426607, + "loss": 0.8314, + "num_input_tokens_seen": 64958416, + "step": 111880 + }, + { + "epoch": 16.66443252904379, + "grad_norm": 0.037109375, + "learning_rate": 0.002471112238342629, + "loss": 0.7956, + "num_input_tokens_seen": 64961168, + "step": 111885 + }, + { + "epoch": 16.665177241584747, + "grad_norm": 0.034912109375, + "learning_rate": 0.002470040315005425, + "loss": 0.7944, + "num_input_tokens_seen": 64963792, + "step": 111890 + }, + { + "epoch": 16.665921954125707, + "grad_norm": 0.07470703125, + "learning_rate": 0.002468968603349155, + "loss": 0.807, + "num_input_tokens_seen": 64966608, + "step": 111895 + }, + { + "epoch": 16.666666666666668, + "grad_norm": 0.08251953125, + "learning_rate": 0.0024678971033919223, + "loss": 0.7966, + "num_input_tokens_seen": 64969296, + "step": 111900 + }, + { + "epoch": 16.667411379207625, + "grad_norm": 0.06689453125, + "learning_rate": 0.002466825815151833, + "loss": 0.8142, + "num_input_tokens_seen": 64972240, + "step": 111905 + }, + { + "epoch": 16.668156091748585, + "grad_norm": 0.036865234375, + "learning_rate": 0.0024657547386469814, + "loss": 0.7954, + "num_input_tokens_seen": 64975024, + "step": 111910 + }, + { + "epoch": 16.668900804289546, + "grad_norm": 0.0478515625, + "learning_rate": 0.002464683873895466, + "loss": 0.7894, + "num_input_tokens_seen": 64978000, + "step": 111915 + }, + { + "epoch": 16.669645516830503, + "grad_norm": 0.052490234375, + "learning_rate": 0.0024636132209153766, + "loss": 0.8142, + "num_input_tokens_seen": 64981104, + "step": 111920 + }, + { + "epoch": 16.670390229371463, + "grad_norm": 0.05419921875, + "learning_rate": 0.002462542779724801, + "loss": 0.7922, + "num_input_tokens_seen": 64984048, + "step": 111925 + }, + { + "epoch": 16.67113494191242, + "grad_norm": 0.047607421875, + "learning_rate": 0.0024614725503418187, + "loss": 0.7878, + "num_input_tokens_seen": 64986992, + "step": 111930 + }, + { + "epoch": 16.67187965445338, + "grad_norm": 0.03515625, + "learning_rate": 0.0024604025327845183, + "loss": 0.8118, + "num_input_tokens_seen": 64989744, + "step": 111935 + }, + { + "epoch": 16.67262436699434, + "grad_norm": 0.0390625, + "learning_rate": 0.0024593327270709723, + "loss": 0.7819, + "num_input_tokens_seen": 64992496, + "step": 111940 + }, + { + "epoch": 16.6733690795353, + "grad_norm": 0.037353515625, + "learning_rate": 0.0024582631332192495, + "loss": 0.7998, + "num_input_tokens_seen": 64995952, + "step": 111945 + }, + { + "epoch": 16.67411379207626, + "grad_norm": 0.0478515625, + "learning_rate": 0.00245719375124743, + "loss": 0.8095, + "num_input_tokens_seen": 64998960, + "step": 111950 + }, + { + "epoch": 16.67485850461722, + "grad_norm": 0.06396484375, + "learning_rate": 0.002456124581173568, + "loss": 0.8305, + "num_input_tokens_seen": 65001936, + "step": 111955 + }, + { + "epoch": 16.675603217158177, + "grad_norm": 0.0517578125, + "learning_rate": 0.0024550556230157378, + "loss": 0.8103, + "num_input_tokens_seen": 65005392, + "step": 111960 + }, + { + "epoch": 16.676347929699137, + "grad_norm": 0.058349609375, + "learning_rate": 0.0024539868767919876, + "loss": 0.7823, + "num_input_tokens_seen": 65008144, + "step": 111965 + }, + { + "epoch": 16.677092642240094, + "grad_norm": 0.047607421875, + "learning_rate": 0.0024529183425203815, + "loss": 0.7991, + "num_input_tokens_seen": 65011152, + "step": 111970 + }, + { + "epoch": 16.677837354781055, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0024518500202189686, + "loss": 0.8163, + "num_input_tokens_seen": 65013808, + "step": 111975 + }, + { + "epoch": 16.678582067322015, + "grad_norm": 0.04150390625, + "learning_rate": 0.0024507819099057926, + "loss": 0.8178, + "num_input_tokens_seen": 65016944, + "step": 111980 + }, + { + "epoch": 16.679326779862972, + "grad_norm": 0.0556640625, + "learning_rate": 0.002449714011598906, + "loss": 0.8019, + "num_input_tokens_seen": 65019696, + "step": 111985 + }, + { + "epoch": 16.680071492403933, + "grad_norm": 0.028076171875, + "learning_rate": 0.002448646325316344, + "loss": 0.8138, + "num_input_tokens_seen": 65022544, + "step": 111990 + }, + { + "epoch": 16.68081620494489, + "grad_norm": 0.03857421875, + "learning_rate": 0.002447578851076147, + "loss": 0.8138, + "num_input_tokens_seen": 65025488, + "step": 111995 + }, + { + "epoch": 16.68156091748585, + "grad_norm": 0.0286865234375, + "learning_rate": 0.002446511588896343, + "loss": 0.7962, + "num_input_tokens_seen": 65028336, + "step": 112000 + }, + { + "epoch": 16.68230563002681, + "grad_norm": 0.027587890625, + "learning_rate": 0.002445444538794972, + "loss": 0.8013, + "num_input_tokens_seen": 65031184, + "step": 112005 + }, + { + "epoch": 16.683050342567768, + "grad_norm": 0.03369140625, + "learning_rate": 0.0024443777007900494, + "loss": 0.7915, + "num_input_tokens_seen": 65033904, + "step": 112010 + }, + { + "epoch": 16.683795055108728, + "grad_norm": 0.06494140625, + "learning_rate": 0.00244331107489961, + "loss": 0.7877, + "num_input_tokens_seen": 65037040, + "step": 112015 + }, + { + "epoch": 16.68453976764969, + "grad_norm": 0.03759765625, + "learning_rate": 0.0024422446611416684, + "loss": 0.7973, + "num_input_tokens_seen": 65040016, + "step": 112020 + }, + { + "epoch": 16.685284480190646, + "grad_norm": 0.040771484375, + "learning_rate": 0.002441178459534236, + "loss": 0.7787, + "num_input_tokens_seen": 65042704, + "step": 112025 + }, + { + "epoch": 16.686029192731606, + "grad_norm": 0.060302734375, + "learning_rate": 0.002440112470095333, + "loss": 0.7931, + "num_input_tokens_seen": 65045360, + "step": 112030 + }, + { + "epoch": 16.686773905272563, + "grad_norm": 0.0537109375, + "learning_rate": 0.00243904669284296, + "loss": 0.8128, + "num_input_tokens_seen": 65048464, + "step": 112035 + }, + { + "epoch": 16.687518617813524, + "grad_norm": 0.04296875, + "learning_rate": 0.0024379811277951305, + "loss": 0.7922, + "num_input_tokens_seen": 65051376, + "step": 112040 + }, + { + "epoch": 16.688263330354484, + "grad_norm": 0.052001953125, + "learning_rate": 0.002436915774969839, + "loss": 0.7944, + "num_input_tokens_seen": 65054256, + "step": 112045 + }, + { + "epoch": 16.68900804289544, + "grad_norm": 0.05517578125, + "learning_rate": 0.0024358506343850916, + "loss": 0.8107, + "num_input_tokens_seen": 65057008, + "step": 112050 + }, + { + "epoch": 16.689752755436402, + "grad_norm": 0.038818359375, + "learning_rate": 0.0024347857060588777, + "loss": 0.789, + "num_input_tokens_seen": 65060048, + "step": 112055 + }, + { + "epoch": 16.690497467977362, + "grad_norm": 0.03857421875, + "learning_rate": 0.0024337209900091887, + "loss": 0.795, + "num_input_tokens_seen": 65063056, + "step": 112060 + }, + { + "epoch": 16.69124218051832, + "grad_norm": 0.058837890625, + "learning_rate": 0.0024326564862540115, + "loss": 0.7924, + "num_input_tokens_seen": 65066000, + "step": 112065 + }, + { + "epoch": 16.69198689305928, + "grad_norm": 0.032470703125, + "learning_rate": 0.0024315921948113257, + "loss": 0.7844, + "num_input_tokens_seen": 65068816, + "step": 112070 + }, + { + "epoch": 16.692731605600237, + "grad_norm": 0.034423828125, + "learning_rate": 0.002430528115699121, + "loss": 0.8045, + "num_input_tokens_seen": 65071760, + "step": 112075 + }, + { + "epoch": 16.693476318141197, + "grad_norm": 0.049072265625, + "learning_rate": 0.0024294642489353633, + "loss": 0.7961, + "num_input_tokens_seen": 65074768, + "step": 112080 + }, + { + "epoch": 16.694221030682158, + "grad_norm": 0.039794921875, + "learning_rate": 0.0024284005945380354, + "loss": 0.7988, + "num_input_tokens_seen": 65077808, + "step": 112085 + }, + { + "epoch": 16.694965743223115, + "grad_norm": 0.05224609375, + "learning_rate": 0.0024273371525250992, + "loss": 0.8037, + "num_input_tokens_seen": 65080720, + "step": 112090 + }, + { + "epoch": 16.695710455764075, + "grad_norm": 0.0458984375, + "learning_rate": 0.002426273922914527, + "loss": 0.7885, + "num_input_tokens_seen": 65083440, + "step": 112095 + }, + { + "epoch": 16.696455168305036, + "grad_norm": 0.0390625, + "learning_rate": 0.002425210905724276, + "loss": 0.7936, + "num_input_tokens_seen": 65086608, + "step": 112100 + }, + { + "epoch": 16.697199880845993, + "grad_norm": 0.037353515625, + "learning_rate": 0.0024241481009723035, + "loss": 0.8024, + "num_input_tokens_seen": 65089232, + "step": 112105 + }, + { + "epoch": 16.697944593386953, + "grad_norm": 0.04931640625, + "learning_rate": 0.0024230855086765724, + "loss": 0.7928, + "num_input_tokens_seen": 65091792, + "step": 112110 + }, + { + "epoch": 16.69868930592791, + "grad_norm": 0.050048828125, + "learning_rate": 0.0024220231288550275, + "loss": 0.7962, + "num_input_tokens_seen": 65094832, + "step": 112115 + }, + { + "epoch": 16.69943401846887, + "grad_norm": 0.05126953125, + "learning_rate": 0.002420960961525615, + "loss": 0.792, + "num_input_tokens_seen": 65097808, + "step": 112120 + }, + { + "epoch": 16.70017873100983, + "grad_norm": 0.06787109375, + "learning_rate": 0.0024198990067062854, + "loss": 0.7721, + "num_input_tokens_seen": 65100784, + "step": 112125 + }, + { + "epoch": 16.70092344355079, + "grad_norm": 0.05712890625, + "learning_rate": 0.0024188372644149774, + "loss": 0.8109, + "num_input_tokens_seen": 65103696, + "step": 112130 + }, + { + "epoch": 16.70166815609175, + "grad_norm": 0.05419921875, + "learning_rate": 0.002417775734669622, + "loss": 0.7864, + "num_input_tokens_seen": 65106448, + "step": 112135 + }, + { + "epoch": 16.702412868632706, + "grad_norm": 0.054931640625, + "learning_rate": 0.0024167144174881616, + "loss": 0.79, + "num_input_tokens_seen": 65109264, + "step": 112140 + }, + { + "epoch": 16.703157581173667, + "grad_norm": 0.045166015625, + "learning_rate": 0.002415653312888523, + "loss": 0.7879, + "num_input_tokens_seen": 65112240, + "step": 112145 + }, + { + "epoch": 16.703902293714627, + "grad_norm": 0.0341796875, + "learning_rate": 0.0024145924208886257, + "loss": 0.8142, + "num_input_tokens_seen": 65115408, + "step": 112150 + }, + { + "epoch": 16.704647006255584, + "grad_norm": 0.041259765625, + "learning_rate": 0.002413531741506404, + "loss": 0.8148, + "num_input_tokens_seen": 65118320, + "step": 112155 + }, + { + "epoch": 16.705391718796545, + "grad_norm": 0.038818359375, + "learning_rate": 0.0024124712747597677, + "loss": 0.7854, + "num_input_tokens_seen": 65121296, + "step": 112160 + }, + { + "epoch": 16.706136431337505, + "grad_norm": 0.049072265625, + "learning_rate": 0.0024114110206666377, + "loss": 0.7948, + "num_input_tokens_seen": 65124080, + "step": 112165 + }, + { + "epoch": 16.706881143878462, + "grad_norm": 0.052978515625, + "learning_rate": 0.0024103509792449216, + "loss": 0.7809, + "num_input_tokens_seen": 65126736, + "step": 112170 + }, + { + "epoch": 16.707625856419423, + "grad_norm": 0.064453125, + "learning_rate": 0.0024092911505125335, + "loss": 0.7973, + "num_input_tokens_seen": 65129648, + "step": 112175 + }, + { + "epoch": 16.70837056896038, + "grad_norm": 0.052978515625, + "learning_rate": 0.002408231534487374, + "loss": 0.7751, + "num_input_tokens_seen": 65132848, + "step": 112180 + }, + { + "epoch": 16.70911528150134, + "grad_norm": 0.0302734375, + "learning_rate": 0.0024071721311873465, + "loss": 0.796, + "num_input_tokens_seen": 65135504, + "step": 112185 + }, + { + "epoch": 16.7098599940423, + "grad_norm": 0.029296875, + "learning_rate": 0.002406112940630346, + "loss": 0.7938, + "num_input_tokens_seen": 65138288, + "step": 112190 + }, + { + "epoch": 16.710604706583258, + "grad_norm": 0.06396484375, + "learning_rate": 0.002405053962834264, + "loss": 0.7853, + "num_input_tokens_seen": 65140944, + "step": 112195 + }, + { + "epoch": 16.71134941912422, + "grad_norm": 0.03955078125, + "learning_rate": 0.002403995197816999, + "loss": 0.7985, + "num_input_tokens_seen": 65144048, + "step": 112200 + }, + { + "epoch": 16.71209413166518, + "grad_norm": 0.043701171875, + "learning_rate": 0.0024029366455964284, + "loss": 0.8001, + "num_input_tokens_seen": 65146800, + "step": 112205 + }, + { + "epoch": 16.712838844206136, + "grad_norm": 0.039306640625, + "learning_rate": 0.002401878306190444, + "loss": 0.7977, + "num_input_tokens_seen": 65149712, + "step": 112210 + }, + { + "epoch": 16.713583556747096, + "grad_norm": 0.04541015625, + "learning_rate": 0.0024008201796169183, + "loss": 0.7883, + "num_input_tokens_seen": 65152528, + "step": 112215 + }, + { + "epoch": 16.714328269288053, + "grad_norm": 0.044921875, + "learning_rate": 0.002399762265893733, + "loss": 0.8075, + "num_input_tokens_seen": 65155248, + "step": 112220 + }, + { + "epoch": 16.715072981829014, + "grad_norm": 0.059814453125, + "learning_rate": 0.002398704565038759, + "loss": 0.7847, + "num_input_tokens_seen": 65158128, + "step": 112225 + }, + { + "epoch": 16.715817694369974, + "grad_norm": 0.04443359375, + "learning_rate": 0.002397647077069859, + "loss": 0.7875, + "num_input_tokens_seen": 65160912, + "step": 112230 + }, + { + "epoch": 16.71656240691093, + "grad_norm": 0.062255859375, + "learning_rate": 0.002396589802004908, + "loss": 0.8097, + "num_input_tokens_seen": 65163760, + "step": 112235 + }, + { + "epoch": 16.717307119451892, + "grad_norm": 0.357421875, + "learning_rate": 0.0023955327398617575, + "loss": 0.8527, + "num_input_tokens_seen": 65166512, + "step": 112240 + }, + { + "epoch": 16.718051831992852, + "grad_norm": 0.05908203125, + "learning_rate": 0.0023944758906582764, + "loss": 0.7897, + "num_input_tokens_seen": 65169776, + "step": 112245 + }, + { + "epoch": 16.71879654453381, + "grad_norm": 0.0576171875, + "learning_rate": 0.002393419254412313, + "loss": 0.7758, + "num_input_tokens_seen": 65172752, + "step": 112250 + }, + { + "epoch": 16.71954125707477, + "grad_norm": 0.047119140625, + "learning_rate": 0.002392362831141718, + "loss": 0.7976, + "num_input_tokens_seen": 65176240, + "step": 112255 + }, + { + "epoch": 16.720285969615727, + "grad_norm": 0.06005859375, + "learning_rate": 0.00239130662086434, + "loss": 0.8076, + "num_input_tokens_seen": 65179408, + "step": 112260 + }, + { + "epoch": 16.721030682156687, + "grad_norm": 0.023681640625, + "learning_rate": 0.0023902506235980166, + "loss": 0.8097, + "num_input_tokens_seen": 65182096, + "step": 112265 + }, + { + "epoch": 16.721775394697648, + "grad_norm": 0.039794921875, + "learning_rate": 0.002389194839360597, + "loss": 0.7718, + "num_input_tokens_seen": 65185072, + "step": 112270 + }, + { + "epoch": 16.722520107238605, + "grad_norm": 0.0361328125, + "learning_rate": 0.0023881392681699113, + "loss": 0.8042, + "num_input_tokens_seen": 65187984, + "step": 112275 + }, + { + "epoch": 16.723264819779565, + "grad_norm": 0.040283203125, + "learning_rate": 0.0023870839100437967, + "loss": 0.7993, + "num_input_tokens_seen": 65190960, + "step": 112280 + }, + { + "epoch": 16.724009532320522, + "grad_norm": 0.041259765625, + "learning_rate": 0.0023860287650000773, + "loss": 0.7941, + "num_input_tokens_seen": 65194000, + "step": 112285 + }, + { + "epoch": 16.724754244861483, + "grad_norm": 0.032958984375, + "learning_rate": 0.002384973833056585, + "loss": 0.8068, + "num_input_tokens_seen": 65197136, + "step": 112290 + }, + { + "epoch": 16.725498957402444, + "grad_norm": 0.046630859375, + "learning_rate": 0.0023839191142311362, + "loss": 0.8054, + "num_input_tokens_seen": 65200016, + "step": 112295 + }, + { + "epoch": 16.7262436699434, + "grad_norm": 0.03955078125, + "learning_rate": 0.002382864608541553, + "loss": 0.7873, + "num_input_tokens_seen": 65202736, + "step": 112300 + }, + { + "epoch": 16.72698838248436, + "grad_norm": 0.05859375, + "learning_rate": 0.002381810316005649, + "loss": 0.7887, + "num_input_tokens_seen": 65205680, + "step": 112305 + }, + { + "epoch": 16.72773309502532, + "grad_norm": 0.0439453125, + "learning_rate": 0.0023807562366412326, + "loss": 0.7949, + "num_input_tokens_seen": 65208400, + "step": 112310 + }, + { + "epoch": 16.72847780756628, + "grad_norm": 0.04541015625, + "learning_rate": 0.0023797023704661175, + "loss": 0.7989, + "num_input_tokens_seen": 65211312, + "step": 112315 + }, + { + "epoch": 16.72922252010724, + "grad_norm": 0.048095703125, + "learning_rate": 0.002378648717498104, + "loss": 0.7952, + "num_input_tokens_seen": 65213936, + "step": 112320 + }, + { + "epoch": 16.7299672326482, + "grad_norm": 0.03466796875, + "learning_rate": 0.002377595277754992, + "loss": 0.7814, + "num_input_tokens_seen": 65216656, + "step": 112325 + }, + { + "epoch": 16.730711945189157, + "grad_norm": 0.060791015625, + "learning_rate": 0.0023765420512545763, + "loss": 0.8032, + "num_input_tokens_seen": 65219600, + "step": 112330 + }, + { + "epoch": 16.731456657730117, + "grad_norm": 0.07421875, + "learning_rate": 0.0023754890380146554, + "loss": 0.8079, + "num_input_tokens_seen": 65222608, + "step": 112335 + }, + { + "epoch": 16.732201370271074, + "grad_norm": 0.04150390625, + "learning_rate": 0.0023744362380530176, + "loss": 0.7999, + "num_input_tokens_seen": 65225520, + "step": 112340 + }, + { + "epoch": 16.732946082812035, + "grad_norm": 0.042236328125, + "learning_rate": 0.0023733836513874424, + "loss": 0.8242, + "num_input_tokens_seen": 65228560, + "step": 112345 + }, + { + "epoch": 16.733690795352995, + "grad_norm": 0.06640625, + "learning_rate": 0.0023723312780357217, + "loss": 0.8219, + "num_input_tokens_seen": 65231152, + "step": 112350 + }, + { + "epoch": 16.734435507893952, + "grad_norm": 0.08837890625, + "learning_rate": 0.0023712791180156255, + "loss": 0.7992, + "num_input_tokens_seen": 65234128, + "step": 112355 + }, + { + "epoch": 16.735180220434913, + "grad_norm": 0.0537109375, + "learning_rate": 0.002370227171344939, + "loss": 0.8043, + "num_input_tokens_seen": 65236656, + "step": 112360 + }, + { + "epoch": 16.73592493297587, + "grad_norm": 0.041748046875, + "learning_rate": 0.0023691754380414233, + "loss": 0.8023, + "num_input_tokens_seen": 65239792, + "step": 112365 + }, + { + "epoch": 16.73666964551683, + "grad_norm": 0.07470703125, + "learning_rate": 0.0023681239181228535, + "loss": 0.795, + "num_input_tokens_seen": 65242832, + "step": 112370 + }, + { + "epoch": 16.73741435805779, + "grad_norm": 0.037841796875, + "learning_rate": 0.0023670726116069895, + "loss": 0.794, + "num_input_tokens_seen": 65245776, + "step": 112375 + }, + { + "epoch": 16.738159070598748, + "grad_norm": 0.034423828125, + "learning_rate": 0.0023660215185115968, + "loss": 0.8078, + "num_input_tokens_seen": 65248624, + "step": 112380 + }, + { + "epoch": 16.73890378313971, + "grad_norm": 0.04443359375, + "learning_rate": 0.0023649706388544313, + "loss": 0.814, + "num_input_tokens_seen": 65251440, + "step": 112385 + }, + { + "epoch": 16.73964849568067, + "grad_norm": 0.044189453125, + "learning_rate": 0.0023639199726532435, + "loss": 0.8, + "num_input_tokens_seen": 65254320, + "step": 112390 + }, + { + "epoch": 16.740393208221626, + "grad_norm": 0.053466796875, + "learning_rate": 0.002362869519925786, + "loss": 0.8049, + "num_input_tokens_seen": 65257200, + "step": 112395 + }, + { + "epoch": 16.741137920762586, + "grad_norm": 0.038330078125, + "learning_rate": 0.0023618192806897998, + "loss": 0.8025, + "num_input_tokens_seen": 65260144, + "step": 112400 + }, + { + "epoch": 16.741882633303543, + "grad_norm": 0.04541015625, + "learning_rate": 0.002360769254963037, + "loss": 0.7783, + "num_input_tokens_seen": 65262864, + "step": 112405 + }, + { + "epoch": 16.742627345844504, + "grad_norm": 0.0380859375, + "learning_rate": 0.0023597194427632273, + "loss": 0.7993, + "num_input_tokens_seen": 65265616, + "step": 112410 + }, + { + "epoch": 16.743372058385464, + "grad_norm": 0.0439453125, + "learning_rate": 0.0023586698441081144, + "loss": 0.7949, + "num_input_tokens_seen": 65268528, + "step": 112415 + }, + { + "epoch": 16.74411677092642, + "grad_norm": 0.04345703125, + "learning_rate": 0.002357620459015426, + "loss": 0.7942, + "num_input_tokens_seen": 65271472, + "step": 112420 + }, + { + "epoch": 16.744861483467382, + "grad_norm": 0.055908203125, + "learning_rate": 0.0023565712875028874, + "loss": 0.8271, + "num_input_tokens_seen": 65274416, + "step": 112425 + }, + { + "epoch": 16.745606196008342, + "grad_norm": 0.06103515625, + "learning_rate": 0.0023555223295882305, + "loss": 0.8079, + "num_input_tokens_seen": 65277584, + "step": 112430 + }, + { + "epoch": 16.7463509085493, + "grad_norm": 0.037109375, + "learning_rate": 0.0023544735852891684, + "loss": 0.8112, + "num_input_tokens_seen": 65280528, + "step": 112435 + }, + { + "epoch": 16.74709562109026, + "grad_norm": 0.06640625, + "learning_rate": 0.0023534250546234274, + "loss": 0.7922, + "num_input_tokens_seen": 65283312, + "step": 112440 + }, + { + "epoch": 16.747840333631217, + "grad_norm": 0.0277099609375, + "learning_rate": 0.002352376737608715, + "loss": 0.7836, + "num_input_tokens_seen": 65286224, + "step": 112445 + }, + { + "epoch": 16.748585046172177, + "grad_norm": 0.05517578125, + "learning_rate": 0.00235132863426274, + "loss": 0.781, + "num_input_tokens_seen": 65289296, + "step": 112450 + }, + { + "epoch": 16.749329758713138, + "grad_norm": 0.044189453125, + "learning_rate": 0.0023502807446032174, + "loss": 0.8089, + "num_input_tokens_seen": 65291856, + "step": 112455 + }, + { + "epoch": 16.750074471254095, + "grad_norm": 0.04296875, + "learning_rate": 0.002349233068647844, + "loss": 0.7983, + "num_input_tokens_seen": 65294704, + "step": 112460 + }, + { + "epoch": 16.750819183795056, + "grad_norm": 0.09033203125, + "learning_rate": 0.0023481856064143207, + "loss": 0.7926, + "num_input_tokens_seen": 65297520, + "step": 112465 + }, + { + "epoch": 16.751563896336016, + "grad_norm": 0.0419921875, + "learning_rate": 0.0023471383579203387, + "loss": 0.792, + "num_input_tokens_seen": 65300496, + "step": 112470 + }, + { + "epoch": 16.752308608876973, + "grad_norm": 0.0458984375, + "learning_rate": 0.002346091323183598, + "loss": 0.8013, + "num_input_tokens_seen": 65303344, + "step": 112475 + }, + { + "epoch": 16.753053321417934, + "grad_norm": 0.045166015625, + "learning_rate": 0.002345044502221779, + "loss": 0.7829, + "num_input_tokens_seen": 65306320, + "step": 112480 + }, + { + "epoch": 16.75379803395889, + "grad_norm": 0.05078125, + "learning_rate": 0.0023439978950525768, + "loss": 0.8126, + "num_input_tokens_seen": 65309136, + "step": 112485 + }, + { + "epoch": 16.75454274649985, + "grad_norm": 0.06689453125, + "learning_rate": 0.0023429515016936625, + "loss": 0.7831, + "num_input_tokens_seen": 65311952, + "step": 112490 + }, + { + "epoch": 16.75528745904081, + "grad_norm": 0.0390625, + "learning_rate": 0.0023419053221627214, + "loss": 0.8069, + "num_input_tokens_seen": 65314896, + "step": 112495 + }, + { + "epoch": 16.75603217158177, + "grad_norm": 0.0556640625, + "learning_rate": 0.0023408593564774255, + "loss": 0.8037, + "num_input_tokens_seen": 65317456, + "step": 112500 + }, + { + "epoch": 16.75677688412273, + "grad_norm": 0.040771484375, + "learning_rate": 0.0023398136046554412, + "loss": 0.7867, + "num_input_tokens_seen": 65320240, + "step": 112505 + }, + { + "epoch": 16.757521596663686, + "grad_norm": 0.0703125, + "learning_rate": 0.002338768066714443, + "loss": 0.799, + "num_input_tokens_seen": 65323376, + "step": 112510 + }, + { + "epoch": 16.758266309204647, + "grad_norm": 0.03125, + "learning_rate": 0.002337722742672089, + "loss": 0.7881, + "num_input_tokens_seen": 65326416, + "step": 112515 + }, + { + "epoch": 16.759011021745607, + "grad_norm": 0.037353515625, + "learning_rate": 0.0023366776325460395, + "loss": 0.7896, + "num_input_tokens_seen": 65329200, + "step": 112520 + }, + { + "epoch": 16.759755734286564, + "grad_norm": 0.055419921875, + "learning_rate": 0.002335632736353949, + "loss": 0.7882, + "num_input_tokens_seen": 65331888, + "step": 112525 + }, + { + "epoch": 16.760500446827525, + "grad_norm": 0.037841796875, + "learning_rate": 0.002334588054113475, + "loss": 0.7964, + "num_input_tokens_seen": 65335056, + "step": 112530 + }, + { + "epoch": 16.761245159368485, + "grad_norm": 0.062255859375, + "learning_rate": 0.002333543585842258, + "loss": 0.7902, + "num_input_tokens_seen": 65338064, + "step": 112535 + }, + { + "epoch": 16.761989871909442, + "grad_norm": 0.047119140625, + "learning_rate": 0.0023324993315579543, + "loss": 0.7894, + "num_input_tokens_seen": 65341008, + "step": 112540 + }, + { + "epoch": 16.762734584450403, + "grad_norm": 0.036865234375, + "learning_rate": 0.0023314552912781982, + "loss": 0.7976, + "num_input_tokens_seen": 65343568, + "step": 112545 + }, + { + "epoch": 16.76347929699136, + "grad_norm": 0.044677734375, + "learning_rate": 0.0023304114650206267, + "loss": 0.802, + "num_input_tokens_seen": 65346320, + "step": 112550 + }, + { + "epoch": 16.76422400953232, + "grad_norm": 0.029541015625, + "learning_rate": 0.0023293678528028797, + "loss": 0.7866, + "num_input_tokens_seen": 65349168, + "step": 112555 + }, + { + "epoch": 16.76496872207328, + "grad_norm": 0.027099609375, + "learning_rate": 0.002328324454642581, + "loss": 0.8225, + "num_input_tokens_seen": 65352208, + "step": 112560 + }, + { + "epoch": 16.765713434614238, + "grad_norm": 0.038818359375, + "learning_rate": 0.002327281270557365, + "loss": 0.8077, + "num_input_tokens_seen": 65355248, + "step": 112565 + }, + { + "epoch": 16.7664581471552, + "grad_norm": 0.0595703125, + "learning_rate": 0.0023262383005648495, + "loss": 0.7984, + "num_input_tokens_seen": 65358128, + "step": 112570 + }, + { + "epoch": 16.76720285969616, + "grad_norm": 0.03466796875, + "learning_rate": 0.0023251955446826618, + "loss": 0.7911, + "num_input_tokens_seen": 65361232, + "step": 112575 + }, + { + "epoch": 16.767947572237116, + "grad_norm": 0.05029296875, + "learning_rate": 0.002324153002928411, + "loss": 0.7934, + "num_input_tokens_seen": 65364016, + "step": 112580 + }, + { + "epoch": 16.768692284778076, + "grad_norm": 0.05859375, + "learning_rate": 0.0023231106753197127, + "loss": 0.8075, + "num_input_tokens_seen": 65366832, + "step": 112585 + }, + { + "epoch": 16.769436997319033, + "grad_norm": 0.060302734375, + "learning_rate": 0.0023220685618741753, + "loss": 0.7717, + "num_input_tokens_seen": 65369488, + "step": 112590 + }, + { + "epoch": 16.770181709859994, + "grad_norm": 0.04296875, + "learning_rate": 0.0023210266626094013, + "loss": 0.7752, + "num_input_tokens_seen": 65372176, + "step": 112595 + }, + { + "epoch": 16.770926422400954, + "grad_norm": 0.05419921875, + "learning_rate": 0.0023199849775429977, + "loss": 0.8157, + "num_input_tokens_seen": 65374960, + "step": 112600 + }, + { + "epoch": 16.77167113494191, + "grad_norm": 0.053955078125, + "learning_rate": 0.0023189435066925572, + "loss": 0.7875, + "num_input_tokens_seen": 65377936, + "step": 112605 + }, + { + "epoch": 16.772415847482872, + "grad_norm": 0.08349609375, + "learning_rate": 0.0023179022500756812, + "loss": 0.8622, + "num_input_tokens_seen": 65380976, + "step": 112610 + }, + { + "epoch": 16.773160560023832, + "grad_norm": 0.06884765625, + "learning_rate": 0.0023168612077099555, + "loss": 0.7912, + "num_input_tokens_seen": 65384176, + "step": 112615 + }, + { + "epoch": 16.77390527256479, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0023158203796129705, + "loss": 0.8042, + "num_input_tokens_seen": 65387088, + "step": 112620 + }, + { + "epoch": 16.77464998510575, + "grad_norm": 0.0390625, + "learning_rate": 0.0023147797658023095, + "loss": 0.7883, + "num_input_tokens_seen": 65390096, + "step": 112625 + }, + { + "epoch": 16.775394697646707, + "grad_norm": 0.0693359375, + "learning_rate": 0.0023137393662955477, + "loss": 0.7941, + "num_input_tokens_seen": 65392976, + "step": 112630 + }, + { + "epoch": 16.776139410187668, + "grad_norm": 0.0341796875, + "learning_rate": 0.00231269918111027, + "loss": 0.8009, + "num_input_tokens_seen": 65395696, + "step": 112635 + }, + { + "epoch": 16.776884122728628, + "grad_norm": 0.041015625, + "learning_rate": 0.0023116592102640408, + "loss": 0.8029, + "num_input_tokens_seen": 65398640, + "step": 112640 + }, + { + "epoch": 16.777628835269585, + "grad_norm": 0.060302734375, + "learning_rate": 0.002310619453774437, + "loss": 0.7774, + "num_input_tokens_seen": 65401520, + "step": 112645 + }, + { + "epoch": 16.778373547810546, + "grad_norm": 0.054931640625, + "learning_rate": 0.0023095799116590204, + "loss": 0.7931, + "num_input_tokens_seen": 65404080, + "step": 112650 + }, + { + "epoch": 16.779118260351503, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0023085405839353527, + "loss": 0.7965, + "num_input_tokens_seen": 65407024, + "step": 112655 + }, + { + "epoch": 16.779862972892463, + "grad_norm": 0.0498046875, + "learning_rate": 0.002307501470620994, + "loss": 0.8024, + "num_input_tokens_seen": 65410256, + "step": 112660 + }, + { + "epoch": 16.780607685433424, + "grad_norm": 0.0576171875, + "learning_rate": 0.0023064625717334948, + "loss": 0.8042, + "num_input_tokens_seen": 65413232, + "step": 112665 + }, + { + "epoch": 16.78135239797438, + "grad_norm": 0.058837890625, + "learning_rate": 0.002305423887290411, + "loss": 0.8043, + "num_input_tokens_seen": 65416048, + "step": 112670 + }, + { + "epoch": 16.78209711051534, + "grad_norm": 0.052001953125, + "learning_rate": 0.002304385417309285, + "loss": 0.8, + "num_input_tokens_seen": 65418704, + "step": 112675 + }, + { + "epoch": 16.7828418230563, + "grad_norm": 0.061767578125, + "learning_rate": 0.0023033471618076687, + "loss": 0.801, + "num_input_tokens_seen": 65421392, + "step": 112680 + }, + { + "epoch": 16.78358653559726, + "grad_norm": 0.07470703125, + "learning_rate": 0.0023023091208030943, + "loss": 0.7886, + "num_input_tokens_seen": 65424144, + "step": 112685 + }, + { + "epoch": 16.78433124813822, + "grad_norm": 0.04248046875, + "learning_rate": 0.0023012712943131057, + "loss": 0.7936, + "num_input_tokens_seen": 65427088, + "step": 112690 + }, + { + "epoch": 16.785075960679176, + "grad_norm": 0.05078125, + "learning_rate": 0.0023002336823552275, + "loss": 0.8107, + "num_input_tokens_seen": 65429872, + "step": 112695 + }, + { + "epoch": 16.785820673220137, + "grad_norm": 0.05615234375, + "learning_rate": 0.0022991962849469977, + "loss": 0.7896, + "num_input_tokens_seen": 65432912, + "step": 112700 + }, + { + "epoch": 16.786565385761097, + "grad_norm": 0.056396484375, + "learning_rate": 0.0022981591021059384, + "loss": 0.7956, + "num_input_tokens_seen": 65435760, + "step": 112705 + }, + { + "epoch": 16.787310098302054, + "grad_norm": 0.053955078125, + "learning_rate": 0.002297122133849569, + "loss": 0.8003, + "num_input_tokens_seen": 65438448, + "step": 112710 + }, + { + "epoch": 16.788054810843015, + "grad_norm": 0.07177734375, + "learning_rate": 0.0022960853801954123, + "loss": 0.7906, + "num_input_tokens_seen": 65441360, + "step": 112715 + }, + { + "epoch": 16.788799523383975, + "grad_norm": 0.029541015625, + "learning_rate": 0.0022950488411609825, + "loss": 0.7983, + "num_input_tokens_seen": 65444400, + "step": 112720 + }, + { + "epoch": 16.789544235924932, + "grad_norm": 0.048095703125, + "learning_rate": 0.002294012516763789, + "loss": 0.7912, + "num_input_tokens_seen": 65447504, + "step": 112725 + }, + { + "epoch": 16.790288948465893, + "grad_norm": 0.0419921875, + "learning_rate": 0.0022929764070213363, + "loss": 0.7921, + "num_input_tokens_seen": 65450544, + "step": 112730 + }, + { + "epoch": 16.79103366100685, + "grad_norm": 0.0634765625, + "learning_rate": 0.002291940511951137, + "loss": 0.8058, + "num_input_tokens_seen": 65453456, + "step": 112735 + }, + { + "epoch": 16.79177837354781, + "grad_norm": 0.045166015625, + "learning_rate": 0.0022909048315706837, + "loss": 0.8059, + "num_input_tokens_seen": 65456400, + "step": 112740 + }, + { + "epoch": 16.79252308608877, + "grad_norm": 0.04248046875, + "learning_rate": 0.002289869365897479, + "loss": 0.8086, + "num_input_tokens_seen": 65459280, + "step": 112745 + }, + { + "epoch": 16.793267798629728, + "grad_norm": 0.028076171875, + "learning_rate": 0.002288834114949014, + "loss": 0.8233, + "num_input_tokens_seen": 65462128, + "step": 112750 + }, + { + "epoch": 16.79401251117069, + "grad_norm": 0.06982421875, + "learning_rate": 0.0022877990787427724, + "loss": 0.7887, + "num_input_tokens_seen": 65464944, + "step": 112755 + }, + { + "epoch": 16.79475722371165, + "grad_norm": 0.057373046875, + "learning_rate": 0.00228676425729625, + "loss": 0.8165, + "num_input_tokens_seen": 65468112, + "step": 112760 + }, + { + "epoch": 16.795501936252606, + "grad_norm": 0.03125, + "learning_rate": 0.0022857296506269215, + "loss": 0.7904, + "num_input_tokens_seen": 65470992, + "step": 112765 + }, + { + "epoch": 16.796246648793566, + "grad_norm": 0.02685546875, + "learning_rate": 0.00228469525875227, + "loss": 0.7886, + "num_input_tokens_seen": 65473648, + "step": 112770 + }, + { + "epoch": 16.796991361334523, + "grad_norm": 0.044921875, + "learning_rate": 0.00228366108168977, + "loss": 0.8207, + "num_input_tokens_seen": 65476528, + "step": 112775 + }, + { + "epoch": 16.797736073875484, + "grad_norm": 0.0291748046875, + "learning_rate": 0.002282627119456889, + "loss": 0.8242, + "num_input_tokens_seen": 65479376, + "step": 112780 + }, + { + "epoch": 16.798480786416444, + "grad_norm": 0.05810546875, + "learning_rate": 0.0022815933720711002, + "loss": 0.797, + "num_input_tokens_seen": 65482064, + "step": 112785 + }, + { + "epoch": 16.7992254989574, + "grad_norm": 0.045654296875, + "learning_rate": 0.0022805598395498655, + "loss": 0.7725, + "num_input_tokens_seen": 65484848, + "step": 112790 + }, + { + "epoch": 16.799970211498362, + "grad_norm": 0.03857421875, + "learning_rate": 0.002279526521910643, + "loss": 0.7832, + "num_input_tokens_seen": 65487728, + "step": 112795 + }, + { + "epoch": 16.800714924039323, + "grad_norm": 0.08544921875, + "learning_rate": 0.002278493419170889, + "loss": 0.8254, + "num_input_tokens_seen": 65490768, + "step": 112800 + }, + { + "epoch": 16.80145963658028, + "grad_norm": 0.068359375, + "learning_rate": 0.002277460531348064, + "loss": 0.7853, + "num_input_tokens_seen": 65493712, + "step": 112805 + }, + { + "epoch": 16.80220434912124, + "grad_norm": 0.037841796875, + "learning_rate": 0.0022764278584596074, + "loss": 0.8102, + "num_input_tokens_seen": 65496368, + "step": 112810 + }, + { + "epoch": 16.802949061662197, + "grad_norm": 0.037109375, + "learning_rate": 0.002275395400522974, + "loss": 0.8042, + "num_input_tokens_seen": 65499280, + "step": 112815 + }, + { + "epoch": 16.803693774203158, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0022743631575556, + "loss": 0.7862, + "num_input_tokens_seen": 65502128, + "step": 112820 + }, + { + "epoch": 16.804438486744118, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0022733311295749293, + "loss": 0.7825, + "num_input_tokens_seen": 65504848, + "step": 112825 + }, + { + "epoch": 16.805183199285075, + "grad_norm": 0.034912109375, + "learning_rate": 0.002272299316598396, + "loss": 0.8084, + "num_input_tokens_seen": 65507856, + "step": 112830 + }, + { + "epoch": 16.805927911826036, + "grad_norm": 0.03759765625, + "learning_rate": 0.0022712677186434256, + "loss": 0.8102, + "num_input_tokens_seen": 65510832, + "step": 112835 + }, + { + "epoch": 16.806672624366996, + "grad_norm": 0.07861328125, + "learning_rate": 0.002270236335727453, + "loss": 0.7969, + "num_input_tokens_seen": 65513712, + "step": 112840 + }, + { + "epoch": 16.807417336907953, + "grad_norm": 0.041748046875, + "learning_rate": 0.0022692051678678995, + "loss": 0.8009, + "num_input_tokens_seen": 65516560, + "step": 112845 + }, + { + "epoch": 16.808162049448914, + "grad_norm": 0.029541015625, + "learning_rate": 0.0022681742150821855, + "loss": 0.8133, + "num_input_tokens_seen": 65519376, + "step": 112850 + }, + { + "epoch": 16.80890676198987, + "grad_norm": 0.048583984375, + "learning_rate": 0.002267143477387726, + "loss": 0.8026, + "num_input_tokens_seen": 65522384, + "step": 112855 + }, + { + "epoch": 16.80965147453083, + "grad_norm": 0.05810546875, + "learning_rate": 0.0022661129548019387, + "loss": 0.7939, + "num_input_tokens_seen": 65525296, + "step": 112860 + }, + { + "epoch": 16.81039618707179, + "grad_norm": 0.05615234375, + "learning_rate": 0.0022650826473422318, + "loss": 0.7791, + "num_input_tokens_seen": 65527888, + "step": 112865 + }, + { + "epoch": 16.81114089961275, + "grad_norm": 0.04052734375, + "learning_rate": 0.0022640525550260064, + "loss": 0.8021, + "num_input_tokens_seen": 65530736, + "step": 112870 + }, + { + "epoch": 16.81188561215371, + "grad_norm": 0.043701171875, + "learning_rate": 0.002263022677870672, + "loss": 0.7968, + "num_input_tokens_seen": 65533840, + "step": 112875 + }, + { + "epoch": 16.812630324694666, + "grad_norm": 0.04736328125, + "learning_rate": 0.0022619930158936205, + "loss": 0.7847, + "num_input_tokens_seen": 65536784, + "step": 112880 + }, + { + "epoch": 16.813375037235627, + "grad_norm": 0.078125, + "learning_rate": 0.0022609635691122542, + "loss": 0.8058, + "num_input_tokens_seen": 65539728, + "step": 112885 + }, + { + "epoch": 16.814119749776587, + "grad_norm": 0.0517578125, + "learning_rate": 0.0022599343375439584, + "loss": 0.8128, + "num_input_tokens_seen": 65542352, + "step": 112890 + }, + { + "epoch": 16.814864462317544, + "grad_norm": 0.02880859375, + "learning_rate": 0.002258905321206126, + "loss": 0.8011, + "num_input_tokens_seen": 65545392, + "step": 112895 + }, + { + "epoch": 16.815609174858505, + "grad_norm": 0.037841796875, + "learning_rate": 0.0022578765201161366, + "loss": 0.8065, + "num_input_tokens_seen": 65548208, + "step": 112900 + }, + { + "epoch": 16.816353887399465, + "grad_norm": 0.05908203125, + "learning_rate": 0.002256847934291376, + "loss": 0.7966, + "num_input_tokens_seen": 65550736, + "step": 112905 + }, + { + "epoch": 16.817098599940422, + "grad_norm": 0.052490234375, + "learning_rate": 0.002255819563749217, + "loss": 0.8148, + "num_input_tokens_seen": 65553584, + "step": 112910 + }, + { + "epoch": 16.817843312481383, + "grad_norm": 0.072265625, + "learning_rate": 0.0022547914085070345, + "loss": 0.7882, + "num_input_tokens_seen": 65556496, + "step": 112915 + }, + { + "epoch": 16.81858802502234, + "grad_norm": 0.10693359375, + "learning_rate": 0.0022537634685821976, + "loss": 0.7958, + "num_input_tokens_seen": 65559344, + "step": 112920 + }, + { + "epoch": 16.8193327375633, + "grad_norm": 0.055419921875, + "learning_rate": 0.0022527357439920677, + "loss": 0.7881, + "num_input_tokens_seen": 65562288, + "step": 112925 + }, + { + "epoch": 16.82007745010426, + "grad_norm": 0.036376953125, + "learning_rate": 0.002251708234754017, + "loss": 0.7839, + "num_input_tokens_seen": 65565136, + "step": 112930 + }, + { + "epoch": 16.820822162645218, + "grad_norm": 0.33984375, + "learning_rate": 0.002250680940885393, + "loss": 0.821, + "num_input_tokens_seen": 65568624, + "step": 112935 + }, + { + "epoch": 16.82156687518618, + "grad_norm": 0.044921875, + "learning_rate": 0.0022496538624035617, + "loss": 0.8211, + "num_input_tokens_seen": 65571248, + "step": 112940 + }, + { + "epoch": 16.82231158772714, + "grad_norm": 0.048095703125, + "learning_rate": 0.0022486269993258695, + "loss": 0.8044, + "num_input_tokens_seen": 65574384, + "step": 112945 + }, + { + "epoch": 16.823056300268096, + "grad_norm": 0.0478515625, + "learning_rate": 0.002247600351669661, + "loss": 0.7956, + "num_input_tokens_seen": 65577392, + "step": 112950 + }, + { + "epoch": 16.823801012809056, + "grad_norm": 0.068359375, + "learning_rate": 0.0022465739194522864, + "loss": 0.8014, + "num_input_tokens_seen": 65580624, + "step": 112955 + }, + { + "epoch": 16.824545725350013, + "grad_norm": 0.03173828125, + "learning_rate": 0.002245547702691081, + "loss": 0.7739, + "num_input_tokens_seen": 65583504, + "step": 112960 + }, + { + "epoch": 16.825290437890974, + "grad_norm": 0.06298828125, + "learning_rate": 0.002244521701403387, + "loss": 0.8035, + "num_input_tokens_seen": 65586256, + "step": 112965 + }, + { + "epoch": 16.826035150431935, + "grad_norm": 0.052734375, + "learning_rate": 0.002243495915606531, + "loss": 0.7758, + "num_input_tokens_seen": 65589168, + "step": 112970 + }, + { + "epoch": 16.82677986297289, + "grad_norm": 0.059326171875, + "learning_rate": 0.0022424703453178497, + "loss": 0.7875, + "num_input_tokens_seen": 65592048, + "step": 112975 + }, + { + "epoch": 16.827524575513852, + "grad_norm": 0.05908203125, + "learning_rate": 0.0022414449905546656, + "loss": 0.8034, + "num_input_tokens_seen": 65594864, + "step": 112980 + }, + { + "epoch": 16.828269288054813, + "grad_norm": 0.0751953125, + "learning_rate": 0.002240419851334302, + "loss": 0.8158, + "num_input_tokens_seen": 65597776, + "step": 112985 + }, + { + "epoch": 16.82901400059577, + "grad_norm": 0.044677734375, + "learning_rate": 0.0022393949276740765, + "loss": 0.8093, + "num_input_tokens_seen": 65600592, + "step": 112990 + }, + { + "epoch": 16.82975871313673, + "grad_norm": 0.060791015625, + "learning_rate": 0.0022383702195913, + "loss": 0.7802, + "num_input_tokens_seen": 65603216, + "step": 112995 + }, + { + "epoch": 16.830503425677687, + "grad_norm": 0.0732421875, + "learning_rate": 0.0022373457271032925, + "loss": 0.7794, + "num_input_tokens_seen": 65605968, + "step": 113000 + }, + { + "epoch": 16.831248138218648, + "grad_norm": 0.06591796875, + "learning_rate": 0.002236321450227355, + "loss": 0.8157, + "num_input_tokens_seen": 65608784, + "step": 113005 + }, + { + "epoch": 16.831992850759608, + "grad_norm": 0.05712890625, + "learning_rate": 0.002235297388980796, + "loss": 0.7798, + "num_input_tokens_seen": 65611792, + "step": 113010 + }, + { + "epoch": 16.832737563300565, + "grad_norm": 0.0267333984375, + "learning_rate": 0.0022342735433809116, + "loss": 0.7853, + "num_input_tokens_seen": 65614768, + "step": 113015 + }, + { + "epoch": 16.833482275841526, + "grad_norm": 0.037841796875, + "learning_rate": 0.0022332499134450037, + "loss": 0.8001, + "num_input_tokens_seen": 65617712, + "step": 113020 + }, + { + "epoch": 16.834226988382483, + "grad_norm": 0.04833984375, + "learning_rate": 0.002232226499190364, + "loss": 0.7933, + "num_input_tokens_seen": 65620720, + "step": 113025 + }, + { + "epoch": 16.834971700923443, + "grad_norm": 0.06103515625, + "learning_rate": 0.002231203300634278, + "loss": 0.7986, + "num_input_tokens_seen": 65623568, + "step": 113030 + }, + { + "epoch": 16.835716413464404, + "grad_norm": 0.042724609375, + "learning_rate": 0.002230180317794037, + "loss": 0.8041, + "num_input_tokens_seen": 65626256, + "step": 113035 + }, + { + "epoch": 16.83646112600536, + "grad_norm": 0.027587890625, + "learning_rate": 0.0022291575506869178, + "loss": 0.7954, + "num_input_tokens_seen": 65629232, + "step": 113040 + }, + { + "epoch": 16.83720583854632, + "grad_norm": 0.048095703125, + "learning_rate": 0.0022281349993302055, + "loss": 0.7941, + "num_input_tokens_seen": 65632144, + "step": 113045 + }, + { + "epoch": 16.837950551087282, + "grad_norm": 0.04296875, + "learning_rate": 0.0022271126637411725, + "loss": 0.7871, + "num_input_tokens_seen": 65635056, + "step": 113050 + }, + { + "epoch": 16.83869526362824, + "grad_norm": 0.037841796875, + "learning_rate": 0.0022260905439370877, + "loss": 0.7772, + "num_input_tokens_seen": 65637840, + "step": 113055 + }, + { + "epoch": 16.8394399761692, + "grad_norm": 0.039794921875, + "learning_rate": 0.002225068639935217, + "loss": 0.8054, + "num_input_tokens_seen": 65641040, + "step": 113060 + }, + { + "epoch": 16.840184688710156, + "grad_norm": 0.043701171875, + "learning_rate": 0.0022240469517528323, + "loss": 0.7888, + "num_input_tokens_seen": 65643760, + "step": 113065 + }, + { + "epoch": 16.840929401251117, + "grad_norm": 0.04541015625, + "learning_rate": 0.002223025479407189, + "loss": 0.7904, + "num_input_tokens_seen": 65646768, + "step": 113070 + }, + { + "epoch": 16.841674113792077, + "grad_norm": 0.07666015625, + "learning_rate": 0.0022220042229155407, + "loss": 0.7909, + "num_input_tokens_seen": 65649840, + "step": 113075 + }, + { + "epoch": 16.842418826333034, + "grad_norm": 0.054443359375, + "learning_rate": 0.002220983182295148, + "loss": 0.8039, + "num_input_tokens_seen": 65652432, + "step": 113080 + }, + { + "epoch": 16.843163538873995, + "grad_norm": 0.078125, + "learning_rate": 0.0022199623575632527, + "loss": 0.7929, + "num_input_tokens_seen": 65655536, + "step": 113085 + }, + { + "epoch": 16.843908251414955, + "grad_norm": 0.0245361328125, + "learning_rate": 0.002218941748737107, + "loss": 0.8098, + "num_input_tokens_seen": 65658384, + "step": 113090 + }, + { + "epoch": 16.844652963955912, + "grad_norm": 0.033447265625, + "learning_rate": 0.0022179213558339477, + "loss": 0.8105, + "num_input_tokens_seen": 65661232, + "step": 113095 + }, + { + "epoch": 16.845397676496873, + "grad_norm": 0.050537109375, + "learning_rate": 0.0022169011788710207, + "loss": 0.7892, + "num_input_tokens_seen": 65663920, + "step": 113100 + }, + { + "epoch": 16.84614238903783, + "grad_norm": 0.048828125, + "learning_rate": 0.002215881217865554, + "loss": 0.7866, + "num_input_tokens_seen": 65666704, + "step": 113105 + }, + { + "epoch": 16.84688710157879, + "grad_norm": 0.04296875, + "learning_rate": 0.0022148614728347785, + "loss": 0.7926, + "num_input_tokens_seen": 65669808, + "step": 113110 + }, + { + "epoch": 16.84763181411975, + "grad_norm": 0.03173828125, + "learning_rate": 0.0022138419437959275, + "loss": 0.7899, + "num_input_tokens_seen": 65672752, + "step": 113115 + }, + { + "epoch": 16.848376526660708, + "grad_norm": 0.04248046875, + "learning_rate": 0.0022128226307662206, + "loss": 0.7893, + "num_input_tokens_seen": 65675280, + "step": 113120 + }, + { + "epoch": 16.84912123920167, + "grad_norm": 0.053466796875, + "learning_rate": 0.0022118035337628803, + "loss": 0.7802, + "num_input_tokens_seen": 65678256, + "step": 113125 + }, + { + "epoch": 16.84986595174263, + "grad_norm": 0.05712890625, + "learning_rate": 0.0022107846528031183, + "loss": 0.7917, + "num_input_tokens_seen": 65681200, + "step": 113130 + }, + { + "epoch": 16.850610664283586, + "grad_norm": 0.05322265625, + "learning_rate": 0.0022097659879041537, + "loss": 0.7924, + "num_input_tokens_seen": 65684208, + "step": 113135 + }, + { + "epoch": 16.851355376824547, + "grad_norm": 0.058349609375, + "learning_rate": 0.0022087475390831913, + "loss": 0.7827, + "num_input_tokens_seen": 65687344, + "step": 113140 + }, + { + "epoch": 16.852100089365504, + "grad_norm": 0.0830078125, + "learning_rate": 0.002207729306357442, + "loss": 0.8043, + "num_input_tokens_seen": 65690480, + "step": 113145 + }, + { + "epoch": 16.852844801906464, + "grad_norm": 0.0673828125, + "learning_rate": 0.0022067112897441042, + "loss": 0.7986, + "num_input_tokens_seen": 65693232, + "step": 113150 + }, + { + "epoch": 16.853589514447425, + "grad_norm": 0.049560546875, + "learning_rate": 0.0022056934892603726, + "loss": 0.7873, + "num_input_tokens_seen": 65696240, + "step": 113155 + }, + { + "epoch": 16.85433422698838, + "grad_norm": 0.0888671875, + "learning_rate": 0.0022046759049234492, + "loss": 0.7739, + "num_input_tokens_seen": 65699344, + "step": 113160 + }, + { + "epoch": 16.855078939529342, + "grad_norm": 0.08056640625, + "learning_rate": 0.0022036585367505193, + "loss": 0.8075, + "num_input_tokens_seen": 65702640, + "step": 113165 + }, + { + "epoch": 16.8558236520703, + "grad_norm": 0.0859375, + "learning_rate": 0.002202641384758776, + "loss": 0.7786, + "num_input_tokens_seen": 65705712, + "step": 113170 + }, + { + "epoch": 16.85656836461126, + "grad_norm": 0.12109375, + "learning_rate": 0.0022016244489654, + "loss": 0.7983, + "num_input_tokens_seen": 65708464, + "step": 113175 + }, + { + "epoch": 16.85731307715222, + "grad_norm": 0.12890625, + "learning_rate": 0.0022006077293875714, + "loss": 0.7875, + "num_input_tokens_seen": 65711504, + "step": 113180 + }, + { + "epoch": 16.858057789693177, + "grad_norm": 0.07958984375, + "learning_rate": 0.002199591226042463, + "loss": 0.8006, + "num_input_tokens_seen": 65714320, + "step": 113185 + }, + { + "epoch": 16.858802502234138, + "grad_norm": 0.1318359375, + "learning_rate": 0.0021985749389472537, + "loss": 0.8114, + "num_input_tokens_seen": 65717136, + "step": 113190 + }, + { + "epoch": 16.859547214775098, + "grad_norm": 0.0654296875, + "learning_rate": 0.0021975588681191105, + "loss": 0.8072, + "num_input_tokens_seen": 65719920, + "step": 113195 + }, + { + "epoch": 16.860291927316055, + "grad_norm": 0.052734375, + "learning_rate": 0.002196543013575196, + "loss": 0.7916, + "num_input_tokens_seen": 65722640, + "step": 113200 + }, + { + "epoch": 16.861036639857016, + "grad_norm": 0.042236328125, + "learning_rate": 0.002195527375332677, + "loss": 0.7832, + "num_input_tokens_seen": 65725488, + "step": 113205 + }, + { + "epoch": 16.861781352397973, + "grad_norm": 0.06689453125, + "learning_rate": 0.002194511953408706, + "loss": 0.7765, + "num_input_tokens_seen": 65728304, + "step": 113210 + }, + { + "epoch": 16.862526064938933, + "grad_norm": 0.055908203125, + "learning_rate": 0.0021934967478204424, + "loss": 0.8043, + "num_input_tokens_seen": 65731088, + "step": 113215 + }, + { + "epoch": 16.863270777479894, + "grad_norm": 0.043701171875, + "learning_rate": 0.002192481758585033, + "loss": 0.8107, + "num_input_tokens_seen": 65734000, + "step": 113220 + }, + { + "epoch": 16.86401549002085, + "grad_norm": 0.1142578125, + "learning_rate": 0.002191466985719632, + "loss": 0.7711, + "num_input_tokens_seen": 65736784, + "step": 113225 + }, + { + "epoch": 16.86476020256181, + "grad_norm": 0.049072265625, + "learning_rate": 0.0021904524292413762, + "loss": 0.8025, + "num_input_tokens_seen": 65739568, + "step": 113230 + }, + { + "epoch": 16.865504915102772, + "grad_norm": 0.09130859375, + "learning_rate": 0.0021894380891674062, + "loss": 0.7924, + "num_input_tokens_seen": 65742448, + "step": 113235 + }, + { + "epoch": 16.86624962764373, + "grad_norm": 0.07421875, + "learning_rate": 0.0021884239655148604, + "loss": 0.7719, + "num_input_tokens_seen": 65745232, + "step": 113240 + }, + { + "epoch": 16.86699434018469, + "grad_norm": 0.28515625, + "learning_rate": 0.0021874100583008736, + "loss": 0.8155, + "num_input_tokens_seen": 65748080, + "step": 113245 + }, + { + "epoch": 16.867739052725646, + "grad_norm": 0.06201171875, + "learning_rate": 0.00218639636754257, + "loss": 0.8169, + "num_input_tokens_seen": 65751280, + "step": 113250 + }, + { + "epoch": 16.868483765266607, + "grad_norm": 0.0615234375, + "learning_rate": 0.002185382893257073, + "loss": 0.8191, + "num_input_tokens_seen": 65754608, + "step": 113255 + }, + { + "epoch": 16.869228477807567, + "grad_norm": 0.1044921875, + "learning_rate": 0.0021843696354615113, + "loss": 0.7913, + "num_input_tokens_seen": 65757616, + "step": 113260 + }, + { + "epoch": 16.869973190348524, + "grad_norm": 0.07666015625, + "learning_rate": 0.0021833565941729977, + "loss": 0.8036, + "num_input_tokens_seen": 65760912, + "step": 113265 + }, + { + "epoch": 16.870717902889485, + "grad_norm": 0.05712890625, + "learning_rate": 0.002182343769408649, + "loss": 0.79, + "num_input_tokens_seen": 65764048, + "step": 113270 + }, + { + "epoch": 16.871462615430445, + "grad_norm": 0.044921875, + "learning_rate": 0.0021813311611855765, + "loss": 0.7757, + "num_input_tokens_seen": 65766832, + "step": 113275 + }, + { + "epoch": 16.872207327971402, + "grad_norm": 0.07666015625, + "learning_rate": 0.0021803187695208824, + "loss": 0.8456, + "num_input_tokens_seen": 65769744, + "step": 113280 + }, + { + "epoch": 16.872952040512363, + "grad_norm": 0.08251953125, + "learning_rate": 0.002179306594431676, + "loss": 0.791, + "num_input_tokens_seen": 65772496, + "step": 113285 + }, + { + "epoch": 16.87369675305332, + "grad_norm": 0.060302734375, + "learning_rate": 0.0021782946359350515, + "loss": 0.7961, + "num_input_tokens_seen": 65775536, + "step": 113290 + }, + { + "epoch": 16.87444146559428, + "grad_norm": 0.12255859375, + "learning_rate": 0.002177282894048112, + "loss": 0.8014, + "num_input_tokens_seen": 65778512, + "step": 113295 + }, + { + "epoch": 16.87518617813524, + "grad_norm": 0.04931640625, + "learning_rate": 0.0021762713687879407, + "loss": 0.8061, + "num_input_tokens_seen": 65781360, + "step": 113300 + }, + { + "epoch": 16.875930890676198, + "grad_norm": 0.06494140625, + "learning_rate": 0.002175260060171634, + "loss": 0.8015, + "num_input_tokens_seen": 65784464, + "step": 113305 + }, + { + "epoch": 16.87667560321716, + "grad_norm": 0.056640625, + "learning_rate": 0.002174248968216275, + "loss": 0.7779, + "num_input_tokens_seen": 65787408, + "step": 113310 + }, + { + "epoch": 16.87742031575812, + "grad_norm": 0.043701171875, + "learning_rate": 0.002173238092938945, + "loss": 0.7744, + "num_input_tokens_seen": 65790096, + "step": 113315 + }, + { + "epoch": 16.878165028299076, + "grad_norm": 0.0419921875, + "learning_rate": 0.00217222743435672, + "loss": 0.7877, + "num_input_tokens_seen": 65792912, + "step": 113320 + }, + { + "epoch": 16.878909740840037, + "grad_norm": 0.039306640625, + "learning_rate": 0.0021712169924866715, + "loss": 0.8059, + "num_input_tokens_seen": 65795696, + "step": 113325 + }, + { + "epoch": 16.879654453380994, + "grad_norm": 0.05078125, + "learning_rate": 0.0021702067673458774, + "loss": 0.7962, + "num_input_tokens_seen": 65798608, + "step": 113330 + }, + { + "epoch": 16.880399165921954, + "grad_norm": 0.06640625, + "learning_rate": 0.002169196758951396, + "loss": 0.8093, + "num_input_tokens_seen": 65801488, + "step": 113335 + }, + { + "epoch": 16.881143878462915, + "grad_norm": 0.07958984375, + "learning_rate": 0.0021681869673202984, + "loss": 0.7739, + "num_input_tokens_seen": 65804176, + "step": 113340 + }, + { + "epoch": 16.88188859100387, + "grad_norm": 0.042236328125, + "learning_rate": 0.002167177392469638, + "loss": 0.7997, + "num_input_tokens_seen": 65806896, + "step": 113345 + }, + { + "epoch": 16.882633303544832, + "grad_norm": 0.06201171875, + "learning_rate": 0.0021661680344164757, + "loss": 0.8002, + "num_input_tokens_seen": 65809840, + "step": 113350 + }, + { + "epoch": 16.883378016085793, + "grad_norm": 0.11865234375, + "learning_rate": 0.0021651588931778597, + "loss": 0.819, + "num_input_tokens_seen": 65812720, + "step": 113355 + }, + { + "epoch": 16.88412272862675, + "grad_norm": 0.0537109375, + "learning_rate": 0.0021641499687708363, + "loss": 0.7847, + "num_input_tokens_seen": 65815792, + "step": 113360 + }, + { + "epoch": 16.88486744116771, + "grad_norm": 0.054443359375, + "learning_rate": 0.0021631412612124574, + "loss": 0.8061, + "num_input_tokens_seen": 65818704, + "step": 113365 + }, + { + "epoch": 16.885612153708667, + "grad_norm": 0.053955078125, + "learning_rate": 0.0021621327705197567, + "loss": 0.8048, + "num_input_tokens_seen": 65821680, + "step": 113370 + }, + { + "epoch": 16.886356866249628, + "grad_norm": 0.061279296875, + "learning_rate": 0.002161124496709778, + "loss": 0.7696, + "num_input_tokens_seen": 65824368, + "step": 113375 + }, + { + "epoch": 16.88710157879059, + "grad_norm": 0.041748046875, + "learning_rate": 0.0021601164397995524, + "loss": 0.7902, + "num_input_tokens_seen": 65827472, + "step": 113380 + }, + { + "epoch": 16.887846291331545, + "grad_norm": 0.044921875, + "learning_rate": 0.00215910859980611, + "loss": 0.8028, + "num_input_tokens_seen": 65830544, + "step": 113385 + }, + { + "epoch": 16.888591003872506, + "grad_norm": 0.0556640625, + "learning_rate": 0.0021581009767464763, + "loss": 0.7954, + "num_input_tokens_seen": 65833680, + "step": 113390 + }, + { + "epoch": 16.889335716413463, + "grad_norm": 0.06005859375, + "learning_rate": 0.0021570935706376718, + "loss": 0.8112, + "num_input_tokens_seen": 65836528, + "step": 113395 + }, + { + "epoch": 16.890080428954423, + "grad_norm": 0.06787109375, + "learning_rate": 0.002156086381496721, + "loss": 0.8069, + "num_input_tokens_seen": 65839600, + "step": 113400 + }, + { + "epoch": 16.890825141495384, + "grad_norm": 0.0615234375, + "learning_rate": 0.0021550794093406355, + "loss": 0.7934, + "num_input_tokens_seen": 65842672, + "step": 113405 + }, + { + "epoch": 16.89156985403634, + "grad_norm": 0.09326171875, + "learning_rate": 0.00215407265418643, + "loss": 0.8037, + "num_input_tokens_seen": 65845488, + "step": 113410 + }, + { + "epoch": 16.8923145665773, + "grad_norm": 0.043701171875, + "learning_rate": 0.0021530661160511083, + "loss": 0.794, + "num_input_tokens_seen": 65848304, + "step": 113415 + }, + { + "epoch": 16.893059279118262, + "grad_norm": 0.064453125, + "learning_rate": 0.0021520597949516796, + "loss": 0.8093, + "num_input_tokens_seen": 65851248, + "step": 113420 + }, + { + "epoch": 16.89380399165922, + "grad_norm": 0.14453125, + "learning_rate": 0.0021510536909051395, + "loss": 0.802, + "num_input_tokens_seen": 65854096, + "step": 113425 + }, + { + "epoch": 16.89454870420018, + "grad_norm": 0.034912109375, + "learning_rate": 0.0021500478039284936, + "loss": 0.7919, + "num_input_tokens_seen": 65856816, + "step": 113430 + }, + { + "epoch": 16.895293416741136, + "grad_norm": 0.052734375, + "learning_rate": 0.002149042134038728, + "loss": 0.8106, + "num_input_tokens_seen": 65859536, + "step": 113435 + }, + { + "epoch": 16.896038129282097, + "grad_norm": 0.062255859375, + "learning_rate": 0.002148036681252832, + "loss": 0.8094, + "num_input_tokens_seen": 65862256, + "step": 113440 + }, + { + "epoch": 16.896782841823057, + "grad_norm": 0.04931640625, + "learning_rate": 0.002147031445587797, + "loss": 0.7891, + "num_input_tokens_seen": 65865072, + "step": 113445 + }, + { + "epoch": 16.897527554364014, + "grad_norm": 0.0458984375, + "learning_rate": 0.0021460264270606014, + "loss": 0.7866, + "num_input_tokens_seen": 65867792, + "step": 113450 + }, + { + "epoch": 16.898272266904975, + "grad_norm": 0.051025390625, + "learning_rate": 0.002145021625688227, + "loss": 0.7966, + "num_input_tokens_seen": 65870608, + "step": 113455 + }, + { + "epoch": 16.899016979445936, + "grad_norm": 0.07861328125, + "learning_rate": 0.002144017041487643, + "loss": 0.8081, + "num_input_tokens_seen": 65873584, + "step": 113460 + }, + { + "epoch": 16.899761691986892, + "grad_norm": 0.050048828125, + "learning_rate": 0.0021430126744758277, + "loss": 0.7913, + "num_input_tokens_seen": 65876784, + "step": 113465 + }, + { + "epoch": 16.900506404527853, + "grad_norm": 0.09228515625, + "learning_rate": 0.0021420085246697473, + "loss": 0.8181, + "num_input_tokens_seen": 65879664, + "step": 113470 + }, + { + "epoch": 16.90125111706881, + "grad_norm": 0.04931640625, + "learning_rate": 0.00214100459208636, + "loss": 0.7996, + "num_input_tokens_seen": 65882512, + "step": 113475 + }, + { + "epoch": 16.90199582960977, + "grad_norm": 0.047119140625, + "learning_rate": 0.002140000876742634, + "loss": 0.8272, + "num_input_tokens_seen": 65885520, + "step": 113480 + }, + { + "epoch": 16.90274054215073, + "grad_norm": 0.0380859375, + "learning_rate": 0.002138997378655521, + "loss": 0.7677, + "num_input_tokens_seen": 65888144, + "step": 113485 + }, + { + "epoch": 16.903485254691688, + "grad_norm": 0.291015625, + "learning_rate": 0.0021379940978419787, + "loss": 0.8489, + "num_input_tokens_seen": 65890896, + "step": 113490 + }, + { + "epoch": 16.90422996723265, + "grad_norm": 0.0634765625, + "learning_rate": 0.00213699103431895, + "loss": 0.8068, + "num_input_tokens_seen": 65894000, + "step": 113495 + }, + { + "epoch": 16.90497467977361, + "grad_norm": 0.06005859375, + "learning_rate": 0.002135988188103387, + "loss": 0.7786, + "num_input_tokens_seen": 65896816, + "step": 113500 + }, + { + "epoch": 16.905719392314566, + "grad_norm": 0.045654296875, + "learning_rate": 0.0021349855592122306, + "loss": 0.7848, + "num_input_tokens_seen": 65899568, + "step": 113505 + }, + { + "epoch": 16.906464104855527, + "grad_norm": 0.103515625, + "learning_rate": 0.0021339831476624178, + "loss": 0.7939, + "num_input_tokens_seen": 65902384, + "step": 113510 + }, + { + "epoch": 16.907208817396484, + "grad_norm": 0.049072265625, + "learning_rate": 0.002132980953470879, + "loss": 0.7896, + "num_input_tokens_seen": 65905264, + "step": 113515 + }, + { + "epoch": 16.907953529937444, + "grad_norm": 0.041748046875, + "learning_rate": 0.0021319789766545538, + "loss": 0.7909, + "num_input_tokens_seen": 65908144, + "step": 113520 + }, + { + "epoch": 16.908698242478405, + "grad_norm": 0.05078125, + "learning_rate": 0.0021309772172303644, + "loss": 0.791, + "num_input_tokens_seen": 65911056, + "step": 113525 + }, + { + "epoch": 16.90944295501936, + "grad_norm": 0.052001953125, + "learning_rate": 0.002129975675215232, + "loss": 0.7789, + "num_input_tokens_seen": 65913936, + "step": 113530 + }, + { + "epoch": 16.910187667560322, + "grad_norm": 0.10498046875, + "learning_rate": 0.0021289743506260838, + "loss": 0.7972, + "num_input_tokens_seen": 65917424, + "step": 113535 + }, + { + "epoch": 16.91093238010128, + "grad_norm": 0.042724609375, + "learning_rate": 0.002127973243479829, + "loss": 0.8, + "num_input_tokens_seen": 65920272, + "step": 113540 + }, + { + "epoch": 16.91167709264224, + "grad_norm": 0.09619140625, + "learning_rate": 0.0021269723537933882, + "loss": 0.789, + "num_input_tokens_seen": 65923248, + "step": 113545 + }, + { + "epoch": 16.9124218051832, + "grad_norm": 0.042724609375, + "learning_rate": 0.0021259716815836636, + "loss": 0.7984, + "num_input_tokens_seen": 65926000, + "step": 113550 + }, + { + "epoch": 16.913166517724157, + "grad_norm": 0.05810546875, + "learning_rate": 0.0021249712268675606, + "loss": 0.8134, + "num_input_tokens_seen": 65929744, + "step": 113555 + }, + { + "epoch": 16.913911230265118, + "grad_norm": 0.031494140625, + "learning_rate": 0.002123970989661987, + "loss": 0.818, + "num_input_tokens_seen": 65932432, + "step": 113560 + }, + { + "epoch": 16.91465594280608, + "grad_norm": 0.043701171875, + "learning_rate": 0.0021229709699838333, + "loss": 0.8181, + "num_input_tokens_seen": 65935152, + "step": 113565 + }, + { + "epoch": 16.915400655347035, + "grad_norm": 0.058837890625, + "learning_rate": 0.002121971167850002, + "loss": 0.7865, + "num_input_tokens_seen": 65937776, + "step": 113570 + }, + { + "epoch": 16.916145367887996, + "grad_norm": 0.0517578125, + "learning_rate": 0.002120971583277378, + "loss": 0.8143, + "num_input_tokens_seen": 65940336, + "step": 113575 + }, + { + "epoch": 16.916890080428953, + "grad_norm": 0.08740234375, + "learning_rate": 0.0021199722162828474, + "loss": 0.7909, + "num_input_tokens_seen": 65943184, + "step": 113580 + }, + { + "epoch": 16.917634792969913, + "grad_norm": 0.042236328125, + "learning_rate": 0.0021189730668832946, + "loss": 0.8248, + "num_input_tokens_seen": 65946032, + "step": 113585 + }, + { + "epoch": 16.918379505510874, + "grad_norm": 0.039794921875, + "learning_rate": 0.002117974135095603, + "loss": 0.8017, + "num_input_tokens_seen": 65948816, + "step": 113590 + }, + { + "epoch": 16.91912421805183, + "grad_norm": 0.056640625, + "learning_rate": 0.0021169754209366468, + "loss": 0.8014, + "num_input_tokens_seen": 65951792, + "step": 113595 + }, + { + "epoch": 16.91986893059279, + "grad_norm": 0.042724609375, + "learning_rate": 0.002115976924423292, + "loss": 0.8281, + "num_input_tokens_seen": 65954864, + "step": 113600 + }, + { + "epoch": 16.920613643133752, + "grad_norm": 0.07421875, + "learning_rate": 0.002114978645572417, + "loss": 0.7963, + "num_input_tokens_seen": 65957552, + "step": 113605 + }, + { + "epoch": 16.92135835567471, + "grad_norm": 0.033935546875, + "learning_rate": 0.0021139805844008768, + "loss": 0.7927, + "num_input_tokens_seen": 65960240, + "step": 113610 + }, + { + "epoch": 16.92210306821567, + "grad_norm": 0.0830078125, + "learning_rate": 0.0021129827409255425, + "loss": 0.8, + "num_input_tokens_seen": 65963088, + "step": 113615 + }, + { + "epoch": 16.922847780756626, + "grad_norm": 0.078125, + "learning_rate": 0.002111985115163264, + "loss": 0.7927, + "num_input_tokens_seen": 65966000, + "step": 113620 + }, + { + "epoch": 16.923592493297587, + "grad_norm": 0.038818359375, + "learning_rate": 0.002110987707130901, + "loss": 0.8237, + "num_input_tokens_seen": 65969072, + "step": 113625 + }, + { + "epoch": 16.924337205838548, + "grad_norm": 0.04345703125, + "learning_rate": 0.002109990516845301, + "loss": 0.8097, + "num_input_tokens_seen": 65972080, + "step": 113630 + }, + { + "epoch": 16.925081918379504, + "grad_norm": 0.05322265625, + "learning_rate": 0.0021089935443233078, + "loss": 0.8004, + "num_input_tokens_seen": 65974928, + "step": 113635 + }, + { + "epoch": 16.925826630920465, + "grad_norm": 0.042236328125, + "learning_rate": 0.0021079967895817698, + "loss": 0.8206, + "num_input_tokens_seen": 65977808, + "step": 113640 + }, + { + "epoch": 16.926571343461426, + "grad_norm": 0.042724609375, + "learning_rate": 0.002107000252637522, + "loss": 0.7852, + "num_input_tokens_seen": 65980528, + "step": 113645 + }, + { + "epoch": 16.927316056002383, + "grad_norm": 0.068359375, + "learning_rate": 0.0021060039335074024, + "loss": 0.8126, + "num_input_tokens_seen": 65983056, + "step": 113650 + }, + { + "epoch": 16.928060768543343, + "grad_norm": 0.0380859375, + "learning_rate": 0.0021050078322082384, + "loss": 0.8082, + "num_input_tokens_seen": 65986160, + "step": 113655 + }, + { + "epoch": 16.9288054810843, + "grad_norm": 0.05615234375, + "learning_rate": 0.0021040119487568637, + "loss": 0.7872, + "num_input_tokens_seen": 65989488, + "step": 113660 + }, + { + "epoch": 16.92955019362526, + "grad_norm": 0.055419921875, + "learning_rate": 0.0021030162831700973, + "loss": 0.7941, + "num_input_tokens_seen": 65992496, + "step": 113665 + }, + { + "epoch": 16.93029490616622, + "grad_norm": 0.04931640625, + "learning_rate": 0.002102020835464765, + "loss": 0.7929, + "num_input_tokens_seen": 65995664, + "step": 113670 + }, + { + "epoch": 16.931039618707178, + "grad_norm": 0.0498046875, + "learning_rate": 0.0021010256056576827, + "loss": 0.8203, + "num_input_tokens_seen": 65998352, + "step": 113675 + }, + { + "epoch": 16.93178433124814, + "grad_norm": 0.05419921875, + "learning_rate": 0.002100030593765659, + "loss": 0.7853, + "num_input_tokens_seen": 66001168, + "step": 113680 + }, + { + "epoch": 16.932529043789096, + "grad_norm": 0.0693359375, + "learning_rate": 0.002099035799805512, + "loss": 0.7717, + "num_input_tokens_seen": 66004016, + "step": 113685 + }, + { + "epoch": 16.933273756330056, + "grad_norm": 0.042724609375, + "learning_rate": 0.0020980412237940386, + "loss": 0.7916, + "num_input_tokens_seen": 66006704, + "step": 113690 + }, + { + "epoch": 16.934018468871017, + "grad_norm": 0.068359375, + "learning_rate": 0.0020970468657480496, + "loss": 0.7794, + "num_input_tokens_seen": 66009968, + "step": 113695 + }, + { + "epoch": 16.934763181411974, + "grad_norm": 0.059326171875, + "learning_rate": 0.0020960527256843374, + "loss": 0.7823, + "num_input_tokens_seen": 66013008, + "step": 113700 + }, + { + "epoch": 16.935507893952934, + "grad_norm": 0.06298828125, + "learning_rate": 0.002095058803619701, + "loss": 0.796, + "num_input_tokens_seen": 66015888, + "step": 113705 + }, + { + "epoch": 16.936252606493895, + "grad_norm": 0.0771484375, + "learning_rate": 0.0020940650995709314, + "loss": 0.7925, + "num_input_tokens_seen": 66018704, + "step": 113710 + }, + { + "epoch": 16.93699731903485, + "grad_norm": 0.05078125, + "learning_rate": 0.0020930716135548156, + "loss": 0.8119, + "num_input_tokens_seen": 66021712, + "step": 113715 + }, + { + "epoch": 16.937742031575812, + "grad_norm": 0.052490234375, + "learning_rate": 0.0020920783455881348, + "loss": 0.7966, + "num_input_tokens_seen": 66024272, + "step": 113720 + }, + { + "epoch": 16.93848674411677, + "grad_norm": 0.047607421875, + "learning_rate": 0.002091085295687669, + "loss": 0.7838, + "num_input_tokens_seen": 66027056, + "step": 113725 + }, + { + "epoch": 16.93923145665773, + "grad_norm": 0.0830078125, + "learning_rate": 0.0020900924638702, + "loss": 0.7977, + "num_input_tokens_seen": 66029680, + "step": 113730 + }, + { + "epoch": 16.93997616919869, + "grad_norm": 0.1259765625, + "learning_rate": 0.002089099850152496, + "loss": 0.7848, + "num_input_tokens_seen": 66032624, + "step": 113735 + }, + { + "epoch": 16.940720881739647, + "grad_norm": 0.056640625, + "learning_rate": 0.00208810745455133, + "loss": 0.7748, + "num_input_tokens_seen": 66035600, + "step": 113740 + }, + { + "epoch": 16.941465594280608, + "grad_norm": 0.060546875, + "learning_rate": 0.002087115277083462, + "loss": 0.8035, + "num_input_tokens_seen": 66038544, + "step": 113745 + }, + { + "epoch": 16.94221030682157, + "grad_norm": 0.051513671875, + "learning_rate": 0.0020861233177656606, + "loss": 0.8141, + "num_input_tokens_seen": 66041456, + "step": 113750 + }, + { + "epoch": 16.942955019362525, + "grad_norm": 0.05419921875, + "learning_rate": 0.0020851315766146823, + "loss": 0.8175, + "num_input_tokens_seen": 66044432, + "step": 113755 + }, + { + "epoch": 16.943699731903486, + "grad_norm": 0.03271484375, + "learning_rate": 0.002084140053647275, + "loss": 0.8037, + "num_input_tokens_seen": 66047440, + "step": 113760 + }, + { + "epoch": 16.944444444444443, + "grad_norm": 0.0966796875, + "learning_rate": 0.0020831487488801987, + "loss": 0.7916, + "num_input_tokens_seen": 66050000, + "step": 113765 + }, + { + "epoch": 16.945189156985403, + "grad_norm": 0.12158203125, + "learning_rate": 0.002082157662330192, + "loss": 0.7867, + "num_input_tokens_seen": 66052944, + "step": 113770 + }, + { + "epoch": 16.945933869526364, + "grad_norm": 0.045166015625, + "learning_rate": 0.002081166794014006, + "loss": 0.7842, + "num_input_tokens_seen": 66056048, + "step": 113775 + }, + { + "epoch": 16.94667858206732, + "grad_norm": 0.2255859375, + "learning_rate": 0.002080176143948376, + "loss": 0.8011, + "num_input_tokens_seen": 66058992, + "step": 113780 + }, + { + "epoch": 16.94742329460828, + "grad_norm": 0.04931640625, + "learning_rate": 0.0020791857121500406, + "loss": 0.8037, + "num_input_tokens_seen": 66061872, + "step": 113785 + }, + { + "epoch": 16.948168007149242, + "grad_norm": 0.07763671875, + "learning_rate": 0.002078195498635729, + "loss": 0.7972, + "num_input_tokens_seen": 66065040, + "step": 113790 + }, + { + "epoch": 16.9489127196902, + "grad_norm": 0.04345703125, + "learning_rate": 0.00207720550342217, + "loss": 0.7943, + "num_input_tokens_seen": 66067856, + "step": 113795 + }, + { + "epoch": 16.94965743223116, + "grad_norm": 0.038818359375, + "learning_rate": 0.002076215726526092, + "loss": 0.7889, + "num_input_tokens_seen": 66070512, + "step": 113800 + }, + { + "epoch": 16.950402144772116, + "grad_norm": 0.0947265625, + "learning_rate": 0.0020752261679642097, + "loss": 0.8017, + "num_input_tokens_seen": 66073424, + "step": 113805 + }, + { + "epoch": 16.951146857313077, + "grad_norm": 0.03369140625, + "learning_rate": 0.0020742368277532498, + "loss": 0.7801, + "num_input_tokens_seen": 66076528, + "step": 113810 + }, + { + "epoch": 16.951891569854038, + "grad_norm": 0.07080078125, + "learning_rate": 0.0020732477059099185, + "loss": 0.8047, + "num_input_tokens_seen": 66079760, + "step": 113815 + }, + { + "epoch": 16.952636282394995, + "grad_norm": 0.07275390625, + "learning_rate": 0.0020722588024509312, + "loss": 0.8244, + "num_input_tokens_seen": 66082896, + "step": 113820 + }, + { + "epoch": 16.953380994935955, + "grad_norm": 0.06787109375, + "learning_rate": 0.00207127011739299, + "loss": 0.7763, + "num_input_tokens_seen": 66086000, + "step": 113825 + }, + { + "epoch": 16.954125707476916, + "grad_norm": 0.10400390625, + "learning_rate": 0.002070281650752804, + "loss": 0.8083, + "num_input_tokens_seen": 66088976, + "step": 113830 + }, + { + "epoch": 16.954870420017873, + "grad_norm": 0.07568359375, + "learning_rate": 0.0020692934025470667, + "loss": 0.8016, + "num_input_tokens_seen": 66091856, + "step": 113835 + }, + { + "epoch": 16.955615132558833, + "grad_norm": 0.07666015625, + "learning_rate": 0.0020683053727924747, + "loss": 0.7958, + "num_input_tokens_seen": 66094704, + "step": 113840 + }, + { + "epoch": 16.95635984509979, + "grad_norm": 0.08349609375, + "learning_rate": 0.0020673175615057186, + "loss": 0.8084, + "num_input_tokens_seen": 66097840, + "step": 113845 + }, + { + "epoch": 16.95710455764075, + "grad_norm": 0.044677734375, + "learning_rate": 0.002066329968703491, + "loss": 0.801, + "num_input_tokens_seen": 66100560, + "step": 113850 + }, + { + "epoch": 16.95784927018171, + "grad_norm": 0.0546875, + "learning_rate": 0.002065342594402473, + "loss": 0.7907, + "num_input_tokens_seen": 66103568, + "step": 113855 + }, + { + "epoch": 16.958593982722668, + "grad_norm": 0.09375, + "learning_rate": 0.002064355438619343, + "loss": 0.8038, + "num_input_tokens_seen": 66106672, + "step": 113860 + }, + { + "epoch": 16.95933869526363, + "grad_norm": 0.042724609375, + "learning_rate": 0.0020633685013707838, + "loss": 0.804, + "num_input_tokens_seen": 66109904, + "step": 113865 + }, + { + "epoch": 16.96008340780459, + "grad_norm": 0.0751953125, + "learning_rate": 0.0020623817826734615, + "loss": 0.7981, + "num_input_tokens_seen": 66112656, + "step": 113870 + }, + { + "epoch": 16.960828120345546, + "grad_norm": 0.08740234375, + "learning_rate": 0.002061395282544053, + "loss": 0.7951, + "num_input_tokens_seen": 66115504, + "step": 113875 + }, + { + "epoch": 16.961572832886507, + "grad_norm": 0.044921875, + "learning_rate": 0.0020604090009992216, + "loss": 0.7878, + "num_input_tokens_seen": 66118608, + "step": 113880 + }, + { + "epoch": 16.962317545427464, + "grad_norm": 0.050537109375, + "learning_rate": 0.0020594229380556254, + "loss": 0.8088, + "num_input_tokens_seen": 66121648, + "step": 113885 + }, + { + "epoch": 16.963062257968424, + "grad_norm": 0.06640625, + "learning_rate": 0.00205843709372993, + "loss": 0.8426, + "num_input_tokens_seen": 66124432, + "step": 113890 + }, + { + "epoch": 16.963806970509385, + "grad_norm": 0.08349609375, + "learning_rate": 0.002057451468038783, + "loss": 0.7945, + "num_input_tokens_seen": 66127184, + "step": 113895 + }, + { + "epoch": 16.964551683050342, + "grad_norm": 0.10205078125, + "learning_rate": 0.002056466060998844, + "loss": 0.7999, + "num_input_tokens_seen": 66130064, + "step": 113900 + }, + { + "epoch": 16.965296395591302, + "grad_norm": 0.0595703125, + "learning_rate": 0.002055480872626753, + "loss": 0.7797, + "num_input_tokens_seen": 66132848, + "step": 113905 + }, + { + "epoch": 16.96604110813226, + "grad_norm": 0.04443359375, + "learning_rate": 0.002054495902939158, + "loss": 0.7975, + "num_input_tokens_seen": 66135856, + "step": 113910 + }, + { + "epoch": 16.96678582067322, + "grad_norm": 0.083984375, + "learning_rate": 0.0020535111519526983, + "loss": 0.7803, + "num_input_tokens_seen": 66138608, + "step": 113915 + }, + { + "epoch": 16.96753053321418, + "grad_norm": 0.06103515625, + "learning_rate": 0.002052526619684004, + "loss": 0.8005, + "num_input_tokens_seen": 66141232, + "step": 113920 + }, + { + "epoch": 16.968275245755137, + "grad_norm": 0.0771484375, + "learning_rate": 0.002051542306149719, + "loss": 0.8002, + "num_input_tokens_seen": 66144208, + "step": 113925 + }, + { + "epoch": 16.969019958296098, + "grad_norm": 0.06494140625, + "learning_rate": 0.002050558211366461, + "loss": 0.767, + "num_input_tokens_seen": 66146896, + "step": 113930 + }, + { + "epoch": 16.96976467083706, + "grad_norm": 0.046630859375, + "learning_rate": 0.0020495743353508653, + "loss": 0.8021, + "num_input_tokens_seen": 66150000, + "step": 113935 + }, + { + "epoch": 16.970509383378015, + "grad_norm": 0.06787109375, + "learning_rate": 0.0020485906781195456, + "loss": 0.7945, + "num_input_tokens_seen": 66152880, + "step": 113940 + }, + { + "epoch": 16.971254095918976, + "grad_norm": 0.0927734375, + "learning_rate": 0.0020476072396891265, + "loss": 0.7694, + "num_input_tokens_seen": 66155504, + "step": 113945 + }, + { + "epoch": 16.971998808459933, + "grad_norm": 0.126953125, + "learning_rate": 0.0020466240200762162, + "loss": 0.8008, + "num_input_tokens_seen": 66158896, + "step": 113950 + }, + { + "epoch": 16.972743521000893, + "grad_norm": 0.166015625, + "learning_rate": 0.0020456410192974304, + "loss": 0.8045, + "num_input_tokens_seen": 66161968, + "step": 113955 + }, + { + "epoch": 16.973488233541854, + "grad_norm": 0.0693359375, + "learning_rate": 0.002044658237369372, + "loss": 0.7982, + "num_input_tokens_seen": 66164592, + "step": 113960 + }, + { + "epoch": 16.97423294608281, + "grad_norm": 0.07080078125, + "learning_rate": 0.002043675674308645, + "loss": 0.7759, + "num_input_tokens_seen": 66167152, + "step": 113965 + }, + { + "epoch": 16.97497765862377, + "grad_norm": 0.060546875, + "learning_rate": 0.00204269333013185, + "loss": 0.8036, + "num_input_tokens_seen": 66169968, + "step": 113970 + }, + { + "epoch": 16.975722371164732, + "grad_norm": 0.07568359375, + "learning_rate": 0.002041711204855583, + "loss": 0.7814, + "num_input_tokens_seen": 66173136, + "step": 113975 + }, + { + "epoch": 16.97646708370569, + "grad_norm": 0.134765625, + "learning_rate": 0.0020407292984964353, + "loss": 0.8332, + "num_input_tokens_seen": 66176368, + "step": 113980 + }, + { + "epoch": 16.97721179624665, + "grad_norm": 0.04931640625, + "learning_rate": 0.0020397476110709904, + "loss": 0.797, + "num_input_tokens_seen": 66179344, + "step": 113985 + }, + { + "epoch": 16.977956508787607, + "grad_norm": 0.052978515625, + "learning_rate": 0.002038766142595842, + "loss": 0.806, + "num_input_tokens_seen": 66182512, + "step": 113990 + }, + { + "epoch": 16.978701221328567, + "grad_norm": 0.06396484375, + "learning_rate": 0.0020377848930875665, + "loss": 0.8022, + "num_input_tokens_seen": 66185296, + "step": 113995 + }, + { + "epoch": 16.979445933869528, + "grad_norm": 0.06005859375, + "learning_rate": 0.002036803862562737, + "loss": 0.779, + "num_input_tokens_seen": 66188240, + "step": 114000 + }, + { + "epoch": 16.980190646410485, + "grad_norm": 0.06689453125, + "learning_rate": 0.0020358230510379343, + "loss": 0.8235, + "num_input_tokens_seen": 66190960, + "step": 114005 + }, + { + "epoch": 16.980935358951445, + "grad_norm": 0.056884765625, + "learning_rate": 0.002034842458529723, + "loss": 0.7966, + "num_input_tokens_seen": 66193648, + "step": 114010 + }, + { + "epoch": 16.981680071492406, + "grad_norm": 0.0791015625, + "learning_rate": 0.0020338620850546734, + "loss": 0.7788, + "num_input_tokens_seen": 66196656, + "step": 114015 + }, + { + "epoch": 16.982424784033363, + "grad_norm": 0.083984375, + "learning_rate": 0.002032881930629342, + "loss": 0.772, + "num_input_tokens_seen": 66199568, + "step": 114020 + }, + { + "epoch": 16.983169496574323, + "grad_norm": 0.06640625, + "learning_rate": 0.002031901995270294, + "loss": 0.7821, + "num_input_tokens_seen": 66202384, + "step": 114025 + }, + { + "epoch": 16.98391420911528, + "grad_norm": 0.1298828125, + "learning_rate": 0.00203092227899408, + "loss": 0.8181, + "num_input_tokens_seen": 66205104, + "step": 114030 + }, + { + "epoch": 16.98465892165624, + "grad_norm": 0.045166015625, + "learning_rate": 0.0020299427818172545, + "loss": 0.7855, + "num_input_tokens_seen": 66207824, + "step": 114035 + }, + { + "epoch": 16.9854036341972, + "grad_norm": 0.09326171875, + "learning_rate": 0.0020289635037563656, + "loss": 0.8073, + "num_input_tokens_seen": 66210512, + "step": 114040 + }, + { + "epoch": 16.986148346738158, + "grad_norm": 0.1376953125, + "learning_rate": 0.0020279844448279527, + "loss": 0.7727, + "num_input_tokens_seen": 66213552, + "step": 114045 + }, + { + "epoch": 16.98689305927912, + "grad_norm": 0.052001953125, + "learning_rate": 0.00202700560504856, + "loss": 0.8012, + "num_input_tokens_seen": 66216304, + "step": 114050 + }, + { + "epoch": 16.987637771820076, + "grad_norm": 0.055419921875, + "learning_rate": 0.0020260269844347184, + "loss": 0.7921, + "num_input_tokens_seen": 66219312, + "step": 114055 + }, + { + "epoch": 16.988382484361036, + "grad_norm": 0.09716796875, + "learning_rate": 0.002025048583002969, + "loss": 0.7944, + "num_input_tokens_seen": 66222320, + "step": 114060 + }, + { + "epoch": 16.989127196901997, + "grad_norm": 0.068359375, + "learning_rate": 0.0020240704007698325, + "loss": 0.7719, + "num_input_tokens_seen": 66225488, + "step": 114065 + }, + { + "epoch": 16.989871909442954, + "grad_norm": 0.072265625, + "learning_rate": 0.0020230924377518415, + "loss": 0.7866, + "num_input_tokens_seen": 66228560, + "step": 114070 + }, + { + "epoch": 16.990616621983914, + "grad_norm": 0.05078125, + "learning_rate": 0.0020221146939655155, + "loss": 0.7955, + "num_input_tokens_seen": 66232112, + "step": 114075 + }, + { + "epoch": 16.991361334524875, + "grad_norm": 0.080078125, + "learning_rate": 0.002021137169427369, + "loss": 0.7871, + "num_input_tokens_seen": 66234896, + "step": 114080 + }, + { + "epoch": 16.992106047065832, + "grad_norm": 0.166015625, + "learning_rate": 0.0020201598641539204, + "loss": 0.7902, + "num_input_tokens_seen": 66237488, + "step": 114085 + }, + { + "epoch": 16.992850759606792, + "grad_norm": 0.054443359375, + "learning_rate": 0.0020191827781616766, + "loss": 0.7846, + "num_input_tokens_seen": 66240496, + "step": 114090 + }, + { + "epoch": 16.99359547214775, + "grad_norm": 0.12890625, + "learning_rate": 0.0020182059114671495, + "loss": 0.8211, + "num_input_tokens_seen": 66243728, + "step": 114095 + }, + { + "epoch": 16.99434018468871, + "grad_norm": 0.0732421875, + "learning_rate": 0.002017229264086836, + "loss": 0.8125, + "num_input_tokens_seen": 66246896, + "step": 114100 + }, + { + "epoch": 16.99508489722967, + "grad_norm": 0.0615234375, + "learning_rate": 0.0020162528360372425, + "loss": 0.781, + "num_input_tokens_seen": 66250000, + "step": 114105 + }, + { + "epoch": 16.995829609770627, + "grad_norm": 0.0927734375, + "learning_rate": 0.0020152766273348614, + "loss": 0.8166, + "num_input_tokens_seen": 66252816, + "step": 114110 + }, + { + "epoch": 16.996574322311588, + "grad_norm": 0.11669921875, + "learning_rate": 0.002014300637996184, + "loss": 0.8052, + "num_input_tokens_seen": 66255696, + "step": 114115 + }, + { + "epoch": 16.99731903485255, + "grad_norm": 0.039794921875, + "learning_rate": 0.0020133248680377007, + "loss": 0.7966, + "num_input_tokens_seen": 66258736, + "step": 114120 + }, + { + "epoch": 16.998063747393505, + "grad_norm": 0.040283203125, + "learning_rate": 0.00201234931747589, + "loss": 0.7915, + "num_input_tokens_seen": 66261424, + "step": 114125 + }, + { + "epoch": 16.998808459934466, + "grad_norm": 0.0830078125, + "learning_rate": 0.00201137398632724, + "loss": 0.7969, + "num_input_tokens_seen": 66264176, + "step": 114130 + }, + { + "epoch": 16.999553172475423, + "grad_norm": 0.07861328125, + "learning_rate": 0.0020103988746082244, + "loss": 0.7752, + "num_input_tokens_seen": 66267120, + "step": 114135 + }, + { + "epoch": 17.0, + "eval_loss": 0.8002009987831116, + "eval_runtime": 70.4795, + "eval_samples_per_second": 42.339, + "eval_steps_per_second": 10.585, + "num_input_tokens_seen": 66268336, + "step": 114138 + }, + { + "epoch": 17.000297885016384, + "grad_norm": 0.052490234375, + "learning_rate": 0.0020094239823353205, + "loss": 0.7933, + "num_input_tokens_seen": 66269776, + "step": 114140 + }, + { + "epoch": 17.001042597557344, + "grad_norm": 0.04296875, + "learning_rate": 0.002008449309524991, + "loss": 0.7942, + "num_input_tokens_seen": 66272496, + "step": 114145 + }, + { + "epoch": 17.0017873100983, + "grad_norm": 0.037109375, + "learning_rate": 0.00200747485619371, + "loss": 0.7792, + "num_input_tokens_seen": 66275248, + "step": 114150 + }, + { + "epoch": 17.00253202263926, + "grad_norm": 0.042236328125, + "learning_rate": 0.0020065006223579374, + "loss": 0.8194, + "num_input_tokens_seen": 66277776, + "step": 114155 + }, + { + "epoch": 17.003276735180222, + "grad_norm": 0.0810546875, + "learning_rate": 0.0020055266080341266, + "loss": 0.8288, + "num_input_tokens_seen": 66280656, + "step": 114160 + }, + { + "epoch": 17.00402144772118, + "grad_norm": 0.04052734375, + "learning_rate": 0.0020045528132387395, + "loss": 0.8041, + "num_input_tokens_seen": 66283888, + "step": 114165 + }, + { + "epoch": 17.00476616026214, + "grad_norm": 0.036865234375, + "learning_rate": 0.0020035792379882247, + "loss": 0.7977, + "num_input_tokens_seen": 66286864, + "step": 114170 + }, + { + "epoch": 17.005510872803097, + "grad_norm": 0.03759765625, + "learning_rate": 0.002002605882299028, + "loss": 0.7872, + "num_input_tokens_seen": 66289808, + "step": 114175 + }, + { + "epoch": 17.006255585344057, + "grad_norm": 0.060546875, + "learning_rate": 0.0020016327461875967, + "loss": 0.7784, + "num_input_tokens_seen": 66292848, + "step": 114180 + }, + { + "epoch": 17.007000297885018, + "grad_norm": 0.056640625, + "learning_rate": 0.00200065982967037, + "loss": 0.7896, + "num_input_tokens_seen": 66295760, + "step": 114185 + }, + { + "epoch": 17.007745010425975, + "grad_norm": 0.038818359375, + "learning_rate": 0.001999687132763779, + "loss": 0.8064, + "num_input_tokens_seen": 66298768, + "step": 114190 + }, + { + "epoch": 17.008489722966935, + "grad_norm": 0.0712890625, + "learning_rate": 0.0019987146554842653, + "loss": 0.8092, + "num_input_tokens_seen": 66301680, + "step": 114195 + }, + { + "epoch": 17.009234435507896, + "grad_norm": 0.10107421875, + "learning_rate": 0.0019977423978482533, + "loss": 0.7695, + "num_input_tokens_seen": 66304688, + "step": 114200 + }, + { + "epoch": 17.009979148048853, + "grad_norm": 0.0517578125, + "learning_rate": 0.0019967703598721665, + "loss": 0.7921, + "num_input_tokens_seen": 66307536, + "step": 114205 + }, + { + "epoch": 17.010723860589813, + "grad_norm": 0.0634765625, + "learning_rate": 0.0019957985415724297, + "loss": 0.7871, + "num_input_tokens_seen": 66310352, + "step": 114210 + }, + { + "epoch": 17.01146857313077, + "grad_norm": 0.05859375, + "learning_rate": 0.001994826942965459, + "loss": 0.7809, + "num_input_tokens_seen": 66313360, + "step": 114215 + }, + { + "epoch": 17.01221328567173, + "grad_norm": 0.0712890625, + "learning_rate": 0.0019938555640676707, + "loss": 0.7971, + "num_input_tokens_seen": 66316176, + "step": 114220 + }, + { + "epoch": 17.01295799821269, + "grad_norm": 0.049560546875, + "learning_rate": 0.0019928844048954703, + "loss": 0.7983, + "num_input_tokens_seen": 66318928, + "step": 114225 + }, + { + "epoch": 17.01370271075365, + "grad_norm": 0.037109375, + "learning_rate": 0.0019919134654652723, + "loss": 0.7883, + "num_input_tokens_seen": 66321680, + "step": 114230 + }, + { + "epoch": 17.01444742329461, + "grad_norm": 0.0235595703125, + "learning_rate": 0.0019909427457934746, + "loss": 0.7925, + "num_input_tokens_seen": 66324336, + "step": 114235 + }, + { + "epoch": 17.015192135835566, + "grad_norm": 0.072265625, + "learning_rate": 0.0019899722458964785, + "loss": 0.797, + "num_input_tokens_seen": 66327056, + "step": 114240 + }, + { + "epoch": 17.015936848376526, + "grad_norm": 0.052978515625, + "learning_rate": 0.0019890019657906777, + "loss": 0.7949, + "num_input_tokens_seen": 66330128, + "step": 114245 + }, + { + "epoch": 17.016681560917487, + "grad_norm": 0.04296875, + "learning_rate": 0.001988031905492462, + "loss": 0.7846, + "num_input_tokens_seen": 66333136, + "step": 114250 + }, + { + "epoch": 17.017426273458444, + "grad_norm": 0.04150390625, + "learning_rate": 0.0019870620650182256, + "loss": 0.7726, + "num_input_tokens_seen": 66336016, + "step": 114255 + }, + { + "epoch": 17.018170985999404, + "grad_norm": 0.058349609375, + "learning_rate": 0.0019860924443843454, + "loss": 0.7971, + "num_input_tokens_seen": 66339120, + "step": 114260 + }, + { + "epoch": 17.018915698540365, + "grad_norm": 0.0299072265625, + "learning_rate": 0.001985123043607211, + "loss": 0.7785, + "num_input_tokens_seen": 66341936, + "step": 114265 + }, + { + "epoch": 17.019660411081322, + "grad_norm": 0.044189453125, + "learning_rate": 0.001984153862703193, + "loss": 0.7965, + "num_input_tokens_seen": 66344944, + "step": 114270 + }, + { + "epoch": 17.020405123622282, + "grad_norm": 0.0556640625, + "learning_rate": 0.001983184901688668, + "loss": 0.7958, + "num_input_tokens_seen": 66347696, + "step": 114275 + }, + { + "epoch": 17.02114983616324, + "grad_norm": 0.04541015625, + "learning_rate": 0.001982216160580005, + "loss": 0.8007, + "num_input_tokens_seen": 66350736, + "step": 114280 + }, + { + "epoch": 17.0218945487042, + "grad_norm": 0.087890625, + "learning_rate": 0.001981247639393566, + "loss": 0.7865, + "num_input_tokens_seen": 66353776, + "step": 114285 + }, + { + "epoch": 17.02263926124516, + "grad_norm": 0.046142578125, + "learning_rate": 0.0019802793381457195, + "loss": 0.8071, + "num_input_tokens_seen": 66356592, + "step": 114290 + }, + { + "epoch": 17.023383973786117, + "grad_norm": 0.052978515625, + "learning_rate": 0.0019793112568528186, + "loss": 0.7852, + "num_input_tokens_seen": 66359344, + "step": 114295 + }, + { + "epoch": 17.024128686327078, + "grad_norm": 0.04150390625, + "learning_rate": 0.001978343395531224, + "loss": 0.7765, + "num_input_tokens_seen": 66362096, + "step": 114300 + }, + { + "epoch": 17.02487339886804, + "grad_norm": 0.040771484375, + "learning_rate": 0.0019773757541972823, + "loss": 0.7926, + "num_input_tokens_seen": 66364944, + "step": 114305 + }, + { + "epoch": 17.025618111408996, + "grad_norm": 0.04833984375, + "learning_rate": 0.0019764083328673407, + "loss": 0.839, + "num_input_tokens_seen": 66367696, + "step": 114310 + }, + { + "epoch": 17.026362823949956, + "grad_norm": 0.03857421875, + "learning_rate": 0.001975441131557745, + "loss": 0.7788, + "num_input_tokens_seen": 66370832, + "step": 114315 + }, + { + "epoch": 17.027107536490913, + "grad_norm": 0.061279296875, + "learning_rate": 0.0019744741502848305, + "loss": 0.8149, + "num_input_tokens_seen": 66373808, + "step": 114320 + }, + { + "epoch": 17.027852249031874, + "grad_norm": 0.049072265625, + "learning_rate": 0.001973507389064939, + "loss": 0.7817, + "num_input_tokens_seen": 66376560, + "step": 114325 + }, + { + "epoch": 17.028596961572834, + "grad_norm": 0.0703125, + "learning_rate": 0.0019725408479143985, + "loss": 0.7626, + "num_input_tokens_seen": 66379312, + "step": 114330 + }, + { + "epoch": 17.02934167411379, + "grad_norm": 0.0576171875, + "learning_rate": 0.0019715745268495425, + "loss": 0.8002, + "num_input_tokens_seen": 66382224, + "step": 114335 + }, + { + "epoch": 17.03008638665475, + "grad_norm": 0.033935546875, + "learning_rate": 0.00197060842588669, + "loss": 0.8153, + "num_input_tokens_seen": 66385200, + "step": 114340 + }, + { + "epoch": 17.030831099195712, + "grad_norm": 0.0478515625, + "learning_rate": 0.0019696425450421696, + "loss": 0.8174, + "num_input_tokens_seen": 66388272, + "step": 114345 + }, + { + "epoch": 17.03157581173667, + "grad_norm": 0.09423828125, + "learning_rate": 0.001968676884332291, + "loss": 0.7913, + "num_input_tokens_seen": 66391216, + "step": 114350 + }, + { + "epoch": 17.03232052427763, + "grad_norm": 0.05224609375, + "learning_rate": 0.001967711443773375, + "loss": 0.8022, + "num_input_tokens_seen": 66394320, + "step": 114355 + }, + { + "epoch": 17.033065236818587, + "grad_norm": 0.034423828125, + "learning_rate": 0.001966746223381728, + "loss": 0.8034, + "num_input_tokens_seen": 66397264, + "step": 114360 + }, + { + "epoch": 17.033809949359547, + "grad_norm": 0.041259765625, + "learning_rate": 0.0019657812231736564, + "loss": 0.7838, + "num_input_tokens_seen": 66400208, + "step": 114365 + }, + { + "epoch": 17.034554661900508, + "grad_norm": 0.0498046875, + "learning_rate": 0.0019648164431654647, + "loss": 0.8097, + "num_input_tokens_seen": 66403056, + "step": 114370 + }, + { + "epoch": 17.035299374441465, + "grad_norm": 0.040283203125, + "learning_rate": 0.001963851883373453, + "loss": 0.7951, + "num_input_tokens_seen": 66405936, + "step": 114375 + }, + { + "epoch": 17.036044086982425, + "grad_norm": 0.048095703125, + "learning_rate": 0.001962887543813912, + "loss": 0.7833, + "num_input_tokens_seen": 66408912, + "step": 114380 + }, + { + "epoch": 17.036788799523382, + "grad_norm": 0.0654296875, + "learning_rate": 0.0019619234245031336, + "loss": 0.7833, + "num_input_tokens_seen": 66411856, + "step": 114385 + }, + { + "epoch": 17.037533512064343, + "grad_norm": 0.0595703125, + "learning_rate": 0.0019609595254574115, + "loss": 0.7999, + "num_input_tokens_seen": 66414576, + "step": 114390 + }, + { + "epoch": 17.038278224605303, + "grad_norm": 0.09521484375, + "learning_rate": 0.001959995846693024, + "loss": 0.7706, + "num_input_tokens_seen": 66417328, + "step": 114395 + }, + { + "epoch": 17.03902293714626, + "grad_norm": 0.0576171875, + "learning_rate": 0.001959032388226252, + "loss": 0.7963, + "num_input_tokens_seen": 66420048, + "step": 114400 + }, + { + "epoch": 17.03976764968722, + "grad_norm": 0.1025390625, + "learning_rate": 0.0019580691500733767, + "loss": 0.7802, + "num_input_tokens_seen": 66422672, + "step": 114405 + }, + { + "epoch": 17.04051236222818, + "grad_norm": 0.1142578125, + "learning_rate": 0.0019571061322506637, + "loss": 0.7678, + "num_input_tokens_seen": 66425520, + "step": 114410 + }, + { + "epoch": 17.04125707476914, + "grad_norm": 0.0625, + "learning_rate": 0.001956143334774389, + "loss": 0.8168, + "num_input_tokens_seen": 66428400, + "step": 114415 + }, + { + "epoch": 17.0420017873101, + "grad_norm": 0.060302734375, + "learning_rate": 0.001955180757660814, + "loss": 0.8018, + "num_input_tokens_seen": 66431760, + "step": 114420 + }, + { + "epoch": 17.042746499851056, + "grad_norm": 0.0673828125, + "learning_rate": 0.0019542184009262034, + "loss": 0.8045, + "num_input_tokens_seen": 66434992, + "step": 114425 + }, + { + "epoch": 17.043491212392016, + "grad_norm": 0.11181640625, + "learning_rate": 0.001953256264586812, + "loss": 0.761, + "num_input_tokens_seen": 66437904, + "step": 114430 + }, + { + "epoch": 17.044235924932977, + "grad_norm": 0.142578125, + "learning_rate": 0.0019522943486588982, + "loss": 0.8035, + "num_input_tokens_seen": 66440688, + "step": 114435 + }, + { + "epoch": 17.044980637473934, + "grad_norm": 0.068359375, + "learning_rate": 0.001951332653158712, + "loss": 0.7734, + "num_input_tokens_seen": 66443696, + "step": 114440 + }, + { + "epoch": 17.045725350014894, + "grad_norm": 0.04052734375, + "learning_rate": 0.0019503711781024974, + "loss": 0.8035, + "num_input_tokens_seen": 66446256, + "step": 114445 + }, + { + "epoch": 17.046470062555855, + "grad_norm": 0.0712890625, + "learning_rate": 0.0019494099235064977, + "loss": 0.7975, + "num_input_tokens_seen": 66449552, + "step": 114450 + }, + { + "epoch": 17.047214775096812, + "grad_norm": 0.06689453125, + "learning_rate": 0.0019484488893869516, + "loss": 0.7951, + "num_input_tokens_seen": 66452688, + "step": 114455 + }, + { + "epoch": 17.047959487637772, + "grad_norm": 0.0458984375, + "learning_rate": 0.001947488075760098, + "loss": 0.8009, + "num_input_tokens_seen": 66455696, + "step": 114460 + }, + { + "epoch": 17.04870420017873, + "grad_norm": 0.1123046875, + "learning_rate": 0.0019465274826421658, + "loss": 0.8014, + "num_input_tokens_seen": 66458416, + "step": 114465 + }, + { + "epoch": 17.04944891271969, + "grad_norm": 0.0546875, + "learning_rate": 0.001945567110049388, + "loss": 0.8101, + "num_input_tokens_seen": 66461328, + "step": 114470 + }, + { + "epoch": 17.05019362526065, + "grad_norm": 0.038330078125, + "learning_rate": 0.0019446069579979824, + "loss": 0.7941, + "num_input_tokens_seen": 66464112, + "step": 114475 + }, + { + "epoch": 17.050938337801608, + "grad_norm": 0.031982421875, + "learning_rate": 0.0019436470265041755, + "loss": 0.8175, + "num_input_tokens_seen": 66467088, + "step": 114480 + }, + { + "epoch": 17.051683050342568, + "grad_norm": 0.0703125, + "learning_rate": 0.0019426873155841845, + "loss": 0.803, + "num_input_tokens_seen": 66470160, + "step": 114485 + }, + { + "epoch": 17.05242776288353, + "grad_norm": 0.05419921875, + "learning_rate": 0.0019417278252542147, + "loss": 0.7946, + "num_input_tokens_seen": 66473104, + "step": 114490 + }, + { + "epoch": 17.053172475424486, + "grad_norm": 0.08056640625, + "learning_rate": 0.0019407685555304865, + "loss": 0.7695, + "num_input_tokens_seen": 66476208, + "step": 114495 + }, + { + "epoch": 17.053917187965446, + "grad_norm": 0.045166015625, + "learning_rate": 0.0019398095064292002, + "loss": 0.7985, + "num_input_tokens_seen": 66479344, + "step": 114500 + }, + { + "epoch": 17.054661900506403, + "grad_norm": 0.0255126953125, + "learning_rate": 0.0019388506779665548, + "loss": 0.7926, + "num_input_tokens_seen": 66482256, + "step": 114505 + }, + { + "epoch": 17.055406613047364, + "grad_norm": 0.045654296875, + "learning_rate": 0.0019378920701587554, + "loss": 0.8035, + "num_input_tokens_seen": 66485616, + "step": 114510 + }, + { + "epoch": 17.056151325588324, + "grad_norm": 0.04345703125, + "learning_rate": 0.001936933683021994, + "loss": 0.7888, + "num_input_tokens_seen": 66488720, + "step": 114515 + }, + { + "epoch": 17.05689603812928, + "grad_norm": 0.09619140625, + "learning_rate": 0.001935975516572463, + "loss": 0.7773, + "num_input_tokens_seen": 66491568, + "step": 114520 + }, + { + "epoch": 17.05764075067024, + "grad_norm": 0.07275390625, + "learning_rate": 0.0019350175708263422, + "loss": 0.7882, + "num_input_tokens_seen": 66494416, + "step": 114525 + }, + { + "epoch": 17.058385463211202, + "grad_norm": 0.1025390625, + "learning_rate": 0.001934059845799826, + "loss": 0.7955, + "num_input_tokens_seen": 66497520, + "step": 114530 + }, + { + "epoch": 17.05913017575216, + "grad_norm": 0.4140625, + "learning_rate": 0.0019331023415090842, + "loss": 0.8048, + "num_input_tokens_seen": 66500592, + "step": 114535 + }, + { + "epoch": 17.05987488829312, + "grad_norm": 0.052001953125, + "learning_rate": 0.0019321450579703025, + "loss": 0.7825, + "num_input_tokens_seen": 66503760, + "step": 114540 + }, + { + "epoch": 17.060619600834077, + "grad_norm": 0.0966796875, + "learning_rate": 0.0019311879951996447, + "loss": 0.7948, + "num_input_tokens_seen": 66506704, + "step": 114545 + }, + { + "epoch": 17.061364313375037, + "grad_norm": 0.050048828125, + "learning_rate": 0.001930231153213286, + "loss": 0.7824, + "num_input_tokens_seen": 66509744, + "step": 114550 + }, + { + "epoch": 17.062109025915998, + "grad_norm": 0.0498046875, + "learning_rate": 0.0019292745320273852, + "loss": 0.7996, + "num_input_tokens_seen": 66512944, + "step": 114555 + }, + { + "epoch": 17.062853738456955, + "grad_norm": 0.043212890625, + "learning_rate": 0.001928318131658111, + "loss": 0.8044, + "num_input_tokens_seen": 66516080, + "step": 114560 + }, + { + "epoch": 17.063598450997915, + "grad_norm": 0.060546875, + "learning_rate": 0.0019273619521216155, + "loss": 0.8073, + "num_input_tokens_seen": 66518896, + "step": 114565 + }, + { + "epoch": 17.064343163538872, + "grad_norm": 0.037841796875, + "learning_rate": 0.0019264059934340544, + "loss": 0.8039, + "num_input_tokens_seen": 66521648, + "step": 114570 + }, + { + "epoch": 17.065087876079833, + "grad_norm": 0.11083984375, + "learning_rate": 0.001925450255611576, + "loss": 0.7846, + "num_input_tokens_seen": 66524528, + "step": 114575 + }, + { + "epoch": 17.065832588620793, + "grad_norm": 0.037109375, + "learning_rate": 0.0019244947386703225, + "loss": 0.7881, + "num_input_tokens_seen": 66527856, + "step": 114580 + }, + { + "epoch": 17.06657730116175, + "grad_norm": 0.041015625, + "learning_rate": 0.0019235394426264463, + "loss": 0.7938, + "num_input_tokens_seen": 66530512, + "step": 114585 + }, + { + "epoch": 17.06732201370271, + "grad_norm": 0.0517578125, + "learning_rate": 0.0019225843674960774, + "loss": 0.8004, + "num_input_tokens_seen": 66533360, + "step": 114590 + }, + { + "epoch": 17.06806672624367, + "grad_norm": 0.0751953125, + "learning_rate": 0.0019216295132953563, + "loss": 0.7923, + "num_input_tokens_seen": 66536208, + "step": 114595 + }, + { + "epoch": 17.06881143878463, + "grad_norm": 0.057861328125, + "learning_rate": 0.0019206748800404138, + "loss": 0.7846, + "num_input_tokens_seen": 66538864, + "step": 114600 + }, + { + "epoch": 17.06955615132559, + "grad_norm": 0.0791015625, + "learning_rate": 0.0019197204677473716, + "loss": 0.7868, + "num_input_tokens_seen": 66541584, + "step": 114605 + }, + { + "epoch": 17.070300863866546, + "grad_norm": 0.05419921875, + "learning_rate": 0.001918766276432362, + "loss": 0.7918, + "num_input_tokens_seen": 66544400, + "step": 114610 + }, + { + "epoch": 17.071045576407506, + "grad_norm": 0.12451171875, + "learning_rate": 0.0019178123061114971, + "loss": 0.8141, + "num_input_tokens_seen": 66547056, + "step": 114615 + }, + { + "epoch": 17.071790288948467, + "grad_norm": 0.0341796875, + "learning_rate": 0.0019168585568009005, + "loss": 0.7842, + "num_input_tokens_seen": 66550192, + "step": 114620 + }, + { + "epoch": 17.072535001489424, + "grad_norm": 0.044189453125, + "learning_rate": 0.0019159050285166778, + "loss": 0.8052, + "num_input_tokens_seen": 66553264, + "step": 114625 + }, + { + "epoch": 17.073279714030384, + "grad_norm": 0.08544921875, + "learning_rate": 0.001914951721274946, + "loss": 0.8119, + "num_input_tokens_seen": 66556336, + "step": 114630 + }, + { + "epoch": 17.074024426571345, + "grad_norm": 0.0859375, + "learning_rate": 0.0019139986350918058, + "loss": 0.7708, + "num_input_tokens_seen": 66559440, + "step": 114635 + }, + { + "epoch": 17.074769139112302, + "grad_norm": 0.03125, + "learning_rate": 0.0019130457699833574, + "loss": 0.8296, + "num_input_tokens_seen": 66562416, + "step": 114640 + }, + { + "epoch": 17.075513851653263, + "grad_norm": 0.69140625, + "learning_rate": 0.0019120931259656998, + "loss": 0.8609, + "num_input_tokens_seen": 66565136, + "step": 114645 + }, + { + "epoch": 17.07625856419422, + "grad_norm": 0.054931640625, + "learning_rate": 0.0019111407030549248, + "loss": 0.8032, + "num_input_tokens_seen": 66567984, + "step": 114650 + }, + { + "epoch": 17.07700327673518, + "grad_norm": 0.052978515625, + "learning_rate": 0.0019101885012671282, + "loss": 0.8023, + "num_input_tokens_seen": 66570768, + "step": 114655 + }, + { + "epoch": 17.07774798927614, + "grad_norm": 0.039306640625, + "learning_rate": 0.0019092365206183886, + "loss": 0.8009, + "num_input_tokens_seen": 66573648, + "step": 114660 + }, + { + "epoch": 17.078492701817098, + "grad_norm": 0.03271484375, + "learning_rate": 0.0019082847611247965, + "loss": 0.7964, + "num_input_tokens_seen": 66576752, + "step": 114665 + }, + { + "epoch": 17.079237414358058, + "grad_norm": 0.0299072265625, + "learning_rate": 0.001907333222802424, + "loss": 0.7994, + "num_input_tokens_seen": 66579856, + "step": 114670 + }, + { + "epoch": 17.07998212689902, + "grad_norm": 0.06591796875, + "learning_rate": 0.0019063819056673536, + "loss": 0.7915, + "num_input_tokens_seen": 66582608, + "step": 114675 + }, + { + "epoch": 17.080726839439976, + "grad_norm": 0.042724609375, + "learning_rate": 0.0019054308097356536, + "loss": 0.8296, + "num_input_tokens_seen": 66585840, + "step": 114680 + }, + { + "epoch": 17.081471551980936, + "grad_norm": 0.05224609375, + "learning_rate": 0.0019044799350233866, + "loss": 0.7936, + "num_input_tokens_seen": 66588944, + "step": 114685 + }, + { + "epoch": 17.082216264521893, + "grad_norm": 0.058837890625, + "learning_rate": 0.0019035292815466258, + "loss": 0.8119, + "num_input_tokens_seen": 66591696, + "step": 114690 + }, + { + "epoch": 17.082960977062854, + "grad_norm": 0.08935546875, + "learning_rate": 0.001902578849321424, + "loss": 0.795, + "num_input_tokens_seen": 66595088, + "step": 114695 + }, + { + "epoch": 17.083705689603814, + "grad_norm": 0.04541015625, + "learning_rate": 0.0019016286383638447, + "loss": 0.7881, + "num_input_tokens_seen": 66597872, + "step": 114700 + }, + { + "epoch": 17.08445040214477, + "grad_norm": 0.052490234375, + "learning_rate": 0.0019006786486899351, + "loss": 0.8068, + "num_input_tokens_seen": 66600752, + "step": 114705 + }, + { + "epoch": 17.08519511468573, + "grad_norm": 0.06591796875, + "learning_rate": 0.0018997288803157474, + "loss": 0.7855, + "num_input_tokens_seen": 66603632, + "step": 114710 + }, + { + "epoch": 17.085939827226692, + "grad_norm": 0.0546875, + "learning_rate": 0.001898779333257322, + "loss": 0.7918, + "num_input_tokens_seen": 66606608, + "step": 114715 + }, + { + "epoch": 17.08668453976765, + "grad_norm": 0.05419921875, + "learning_rate": 0.001897830007530708, + "loss": 0.7956, + "num_input_tokens_seen": 66609488, + "step": 114720 + }, + { + "epoch": 17.08742925230861, + "grad_norm": 0.04150390625, + "learning_rate": 0.0018968809031519407, + "loss": 0.7848, + "num_input_tokens_seen": 66612560, + "step": 114725 + }, + { + "epoch": 17.088173964849567, + "grad_norm": 0.11767578125, + "learning_rate": 0.0018959320201370488, + "loss": 0.8069, + "num_input_tokens_seen": 66615440, + "step": 114730 + }, + { + "epoch": 17.088918677390527, + "grad_norm": 0.039794921875, + "learning_rate": 0.0018949833585020697, + "loss": 0.7831, + "num_input_tokens_seen": 66618224, + "step": 114735 + }, + { + "epoch": 17.089663389931488, + "grad_norm": 0.09228515625, + "learning_rate": 0.001894034918263024, + "loss": 0.8186, + "num_input_tokens_seen": 66621008, + "step": 114740 + }, + { + "epoch": 17.090408102472445, + "grad_norm": 0.04931640625, + "learning_rate": 0.0018930866994359419, + "loss": 0.805, + "num_input_tokens_seen": 66623856, + "step": 114745 + }, + { + "epoch": 17.091152815013405, + "grad_norm": 0.07568359375, + "learning_rate": 0.0018921387020368356, + "loss": 0.802, + "num_input_tokens_seen": 66626512, + "step": 114750 + }, + { + "epoch": 17.091897527554362, + "grad_norm": 0.1240234375, + "learning_rate": 0.0018911909260817276, + "loss": 0.7952, + "num_input_tokens_seen": 66629456, + "step": 114755 + }, + { + "epoch": 17.092642240095323, + "grad_norm": 0.04443359375, + "learning_rate": 0.001890243371586625, + "loss": 0.7743, + "num_input_tokens_seen": 66632368, + "step": 114760 + }, + { + "epoch": 17.093386952636283, + "grad_norm": 0.0634765625, + "learning_rate": 0.0018892960385675332, + "loss": 0.8067, + "num_input_tokens_seen": 66635440, + "step": 114765 + }, + { + "epoch": 17.09413166517724, + "grad_norm": 0.05908203125, + "learning_rate": 0.0018883489270404628, + "loss": 0.8075, + "num_input_tokens_seen": 66638480, + "step": 114770 + }, + { + "epoch": 17.0948763777182, + "grad_norm": 0.05615234375, + "learning_rate": 0.0018874020370214112, + "loss": 0.808, + "num_input_tokens_seen": 66641488, + "step": 114775 + }, + { + "epoch": 17.09562109025916, + "grad_norm": 0.07861328125, + "learning_rate": 0.0018864553685263752, + "loss": 0.7927, + "num_input_tokens_seen": 66644432, + "step": 114780 + }, + { + "epoch": 17.09636580280012, + "grad_norm": 0.388671875, + "learning_rate": 0.0018855089215713437, + "loss": 0.8289, + "num_input_tokens_seen": 66647248, + "step": 114785 + }, + { + "epoch": 17.09711051534108, + "grad_norm": 0.09716796875, + "learning_rate": 0.0018845626961723145, + "loss": 0.8051, + "num_input_tokens_seen": 66650064, + "step": 114790 + }, + { + "epoch": 17.097855227882036, + "grad_norm": 0.0400390625, + "learning_rate": 0.0018836166923452656, + "loss": 0.7848, + "num_input_tokens_seen": 66653040, + "step": 114795 + }, + { + "epoch": 17.098599940422996, + "grad_norm": 0.040771484375, + "learning_rate": 0.0018826709101061833, + "loss": 0.7807, + "num_input_tokens_seen": 66655760, + "step": 114800 + }, + { + "epoch": 17.099344652963957, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0018817253494710462, + "loss": 0.7966, + "num_input_tokens_seen": 66658448, + "step": 114805 + }, + { + "epoch": 17.100089365504914, + "grad_norm": 0.09228515625, + "learning_rate": 0.0018807800104558214, + "loss": 0.8024, + "num_input_tokens_seen": 66661456, + "step": 114810 + }, + { + "epoch": 17.100834078045875, + "grad_norm": 0.043212890625, + "learning_rate": 0.0018798348930764896, + "loss": 0.8022, + "num_input_tokens_seen": 66664272, + "step": 114815 + }, + { + "epoch": 17.101578790586835, + "grad_norm": 0.05810546875, + "learning_rate": 0.001878889997349008, + "loss": 0.8042, + "num_input_tokens_seen": 66666992, + "step": 114820 + }, + { + "epoch": 17.102323503127792, + "grad_norm": 0.0595703125, + "learning_rate": 0.0018779453232893471, + "loss": 0.8127, + "num_input_tokens_seen": 66669968, + "step": 114825 + }, + { + "epoch": 17.103068215668753, + "grad_norm": 0.0478515625, + "learning_rate": 0.0018770008709134644, + "loss": 0.7774, + "num_input_tokens_seen": 66672912, + "step": 114830 + }, + { + "epoch": 17.10381292820971, + "grad_norm": 0.0810546875, + "learning_rate": 0.00187605664023731, + "loss": 0.8142, + "num_input_tokens_seen": 66675728, + "step": 114835 + }, + { + "epoch": 17.10455764075067, + "grad_norm": 0.050048828125, + "learning_rate": 0.001875112631276845, + "loss": 0.7983, + "num_input_tokens_seen": 66678832, + "step": 114840 + }, + { + "epoch": 17.10530235329163, + "grad_norm": 0.031494140625, + "learning_rate": 0.0018741688440480114, + "loss": 0.7921, + "num_input_tokens_seen": 66681552, + "step": 114845 + }, + { + "epoch": 17.106047065832588, + "grad_norm": 0.05078125, + "learning_rate": 0.0018732252785667546, + "loss": 0.7878, + "num_input_tokens_seen": 66684240, + "step": 114850 + }, + { + "epoch": 17.106791778373548, + "grad_norm": 0.0693359375, + "learning_rate": 0.0018722819348490122, + "loss": 0.7944, + "num_input_tokens_seen": 66686992, + "step": 114855 + }, + { + "epoch": 17.10753649091451, + "grad_norm": 0.0458984375, + "learning_rate": 0.0018713388129107276, + "loss": 0.8079, + "num_input_tokens_seen": 66690000, + "step": 114860 + }, + { + "epoch": 17.108281203455466, + "grad_norm": 0.03466796875, + "learning_rate": 0.001870395912767827, + "loss": 0.7991, + "num_input_tokens_seen": 66692688, + "step": 114865 + }, + { + "epoch": 17.109025915996426, + "grad_norm": 0.048828125, + "learning_rate": 0.0018694532344362457, + "loss": 0.7828, + "num_input_tokens_seen": 66695632, + "step": 114870 + }, + { + "epoch": 17.109770628537383, + "grad_norm": 0.06494140625, + "learning_rate": 0.0018685107779319043, + "loss": 0.7921, + "num_input_tokens_seen": 66698416, + "step": 114875 + }, + { + "epoch": 17.110515341078344, + "grad_norm": 0.07373046875, + "learning_rate": 0.00186756854327073, + "loss": 0.8098, + "num_input_tokens_seen": 66701520, + "step": 114880 + }, + { + "epoch": 17.111260053619304, + "grad_norm": 0.061767578125, + "learning_rate": 0.0018666265304686386, + "loss": 0.7736, + "num_input_tokens_seen": 66704560, + "step": 114885 + }, + { + "epoch": 17.11200476616026, + "grad_norm": 0.050537109375, + "learning_rate": 0.0018656847395415405, + "loss": 0.8099, + "num_input_tokens_seen": 66707408, + "step": 114890 + }, + { + "epoch": 17.11274947870122, + "grad_norm": 0.0306396484375, + "learning_rate": 0.0018647431705053512, + "loss": 0.7917, + "num_input_tokens_seen": 66710320, + "step": 114895 + }, + { + "epoch": 17.113494191242182, + "grad_norm": 0.068359375, + "learning_rate": 0.0018638018233759783, + "loss": 0.7907, + "num_input_tokens_seen": 66713072, + "step": 114900 + }, + { + "epoch": 17.11423890378314, + "grad_norm": 0.044189453125, + "learning_rate": 0.0018628606981693208, + "loss": 0.7887, + "num_input_tokens_seen": 66716080, + "step": 114905 + }, + { + "epoch": 17.1149836163241, + "grad_norm": 0.04638671875, + "learning_rate": 0.0018619197949012773, + "loss": 0.8045, + "num_input_tokens_seen": 66718992, + "step": 114910 + }, + { + "epoch": 17.115728328865057, + "grad_norm": 0.05224609375, + "learning_rate": 0.0018609791135877484, + "loss": 0.811, + "num_input_tokens_seen": 66721936, + "step": 114915 + }, + { + "epoch": 17.116473041406017, + "grad_norm": 0.03564453125, + "learning_rate": 0.0018600386542446251, + "loss": 0.8197, + "num_input_tokens_seen": 66724496, + "step": 114920 + }, + { + "epoch": 17.117217753946978, + "grad_norm": 0.06494140625, + "learning_rate": 0.0018590984168877893, + "loss": 0.7872, + "num_input_tokens_seen": 66727504, + "step": 114925 + }, + { + "epoch": 17.117962466487935, + "grad_norm": 0.038818359375, + "learning_rate": 0.0018581584015331337, + "loss": 0.7661, + "num_input_tokens_seen": 66730352, + "step": 114930 + }, + { + "epoch": 17.118707179028895, + "grad_norm": 0.037109375, + "learning_rate": 0.0018572186081965301, + "loss": 0.813, + "num_input_tokens_seen": 66733648, + "step": 114935 + }, + { + "epoch": 17.119451891569852, + "grad_norm": 0.10791015625, + "learning_rate": 0.0018562790368938664, + "loss": 0.8012, + "num_input_tokens_seen": 66736784, + "step": 114940 + }, + { + "epoch": 17.120196604110813, + "grad_norm": 0.0546875, + "learning_rate": 0.0018553396876410044, + "loss": 0.7883, + "num_input_tokens_seen": 66739792, + "step": 114945 + }, + { + "epoch": 17.120941316651773, + "grad_norm": 0.07470703125, + "learning_rate": 0.0018544005604538233, + "loss": 0.7914, + "num_input_tokens_seen": 66743088, + "step": 114950 + }, + { + "epoch": 17.12168602919273, + "grad_norm": 0.041015625, + "learning_rate": 0.0018534616553481803, + "loss": 0.809, + "num_input_tokens_seen": 66745840, + "step": 114955 + }, + { + "epoch": 17.12243074173369, + "grad_norm": 0.0771484375, + "learning_rate": 0.0018525229723399443, + "loss": 0.7842, + "num_input_tokens_seen": 66748624, + "step": 114960 + }, + { + "epoch": 17.12317545427465, + "grad_norm": 0.083984375, + "learning_rate": 0.0018515845114449713, + "loss": 0.7933, + "num_input_tokens_seen": 66751536, + "step": 114965 + }, + { + "epoch": 17.12392016681561, + "grad_norm": 0.0576171875, + "learning_rate": 0.0018506462726791133, + "loss": 0.8021, + "num_input_tokens_seen": 66754576, + "step": 114970 + }, + { + "epoch": 17.12466487935657, + "grad_norm": 0.0458984375, + "learning_rate": 0.001849708256058221, + "loss": 0.778, + "num_input_tokens_seen": 66757840, + "step": 114975 + }, + { + "epoch": 17.125409591897526, + "grad_norm": 0.038330078125, + "learning_rate": 0.001848770461598142, + "loss": 0.7815, + "num_input_tokens_seen": 66760720, + "step": 114980 + }, + { + "epoch": 17.126154304438487, + "grad_norm": 0.04638671875, + "learning_rate": 0.0018478328893147217, + "loss": 0.7779, + "num_input_tokens_seen": 66763504, + "step": 114985 + }, + { + "epoch": 17.126899016979447, + "grad_norm": 0.11474609375, + "learning_rate": 0.001846895539223794, + "loss": 0.7895, + "num_input_tokens_seen": 66766256, + "step": 114990 + }, + { + "epoch": 17.127643729520404, + "grad_norm": 0.044677734375, + "learning_rate": 0.0018459584113411998, + "loss": 0.7915, + "num_input_tokens_seen": 66769136, + "step": 114995 + }, + { + "epoch": 17.128388442061365, + "grad_norm": 0.07958984375, + "learning_rate": 0.0018450215056827711, + "loss": 0.8092, + "num_input_tokens_seen": 66772112, + "step": 115000 + }, + { + "epoch": 17.129133154602325, + "grad_norm": 0.032958984375, + "learning_rate": 0.0018440848222643308, + "loss": 0.7874, + "num_input_tokens_seen": 66775024, + "step": 115005 + }, + { + "epoch": 17.129877867143282, + "grad_norm": 0.0732421875, + "learning_rate": 0.0018431483611017073, + "loss": 0.8041, + "num_input_tokens_seen": 66777840, + "step": 115010 + }, + { + "epoch": 17.130622579684243, + "grad_norm": 0.06396484375, + "learning_rate": 0.00184221212221072, + "loss": 0.803, + "num_input_tokens_seen": 66781104, + "step": 115015 + }, + { + "epoch": 17.1313672922252, + "grad_norm": 0.07275390625, + "learning_rate": 0.0018412761056071864, + "loss": 0.8131, + "num_input_tokens_seen": 66783760, + "step": 115020 + }, + { + "epoch": 17.13211200476616, + "grad_norm": 0.024169921875, + "learning_rate": 0.0018403403113069182, + "loss": 0.8062, + "num_input_tokens_seen": 66786544, + "step": 115025 + }, + { + "epoch": 17.13285671730712, + "grad_norm": 0.057373046875, + "learning_rate": 0.0018394047393257267, + "loss": 0.8031, + "num_input_tokens_seen": 66789360, + "step": 115030 + }, + { + "epoch": 17.133601429848078, + "grad_norm": 0.07421875, + "learning_rate": 0.0018384693896794173, + "loss": 0.776, + "num_input_tokens_seen": 66792368, + "step": 115035 + }, + { + "epoch": 17.134346142389038, + "grad_norm": 0.06982421875, + "learning_rate": 0.001837534262383791, + "loss": 0.7841, + "num_input_tokens_seen": 66795216, + "step": 115040 + }, + { + "epoch": 17.13509085493, + "grad_norm": 0.0517578125, + "learning_rate": 0.0018365993574546461, + "loss": 0.7743, + "num_input_tokens_seen": 66798160, + "step": 115045 + }, + { + "epoch": 17.135835567470956, + "grad_norm": 0.046875, + "learning_rate": 0.0018356646749077726, + "loss": 0.8073, + "num_input_tokens_seen": 66800880, + "step": 115050 + }, + { + "epoch": 17.136580280011916, + "grad_norm": 0.0810546875, + "learning_rate": 0.0018347302147589672, + "loss": 0.7954, + "num_input_tokens_seen": 66803952, + "step": 115055 + }, + { + "epoch": 17.137324992552873, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0018337959770240108, + "loss": 0.7978, + "num_input_tokens_seen": 66806672, + "step": 115060 + }, + { + "epoch": 17.138069705093834, + "grad_norm": 0.041748046875, + "learning_rate": 0.0018328619617186941, + "loss": 0.7826, + "num_input_tokens_seen": 66809648, + "step": 115065 + }, + { + "epoch": 17.138814417634794, + "grad_norm": 0.042236328125, + "learning_rate": 0.0018319281688587896, + "loss": 0.782, + "num_input_tokens_seen": 66812656, + "step": 115070 + }, + { + "epoch": 17.13955913017575, + "grad_norm": 0.0615234375, + "learning_rate": 0.0018309945984600778, + "loss": 0.8106, + "num_input_tokens_seen": 66815280, + "step": 115075 + }, + { + "epoch": 17.140303842716712, + "grad_norm": 0.041015625, + "learning_rate": 0.0018300612505383245, + "loss": 0.8012, + "num_input_tokens_seen": 66818384, + "step": 115080 + }, + { + "epoch": 17.14104855525767, + "grad_norm": 0.049560546875, + "learning_rate": 0.0018291281251093037, + "loss": 0.7977, + "num_input_tokens_seen": 66821584, + "step": 115085 + }, + { + "epoch": 17.14179326779863, + "grad_norm": 0.04052734375, + "learning_rate": 0.0018281952221887764, + "loss": 0.7869, + "num_input_tokens_seen": 66824528, + "step": 115090 + }, + { + "epoch": 17.14253798033959, + "grad_norm": 0.0673828125, + "learning_rate": 0.0018272625417925013, + "loss": 0.785, + "num_input_tokens_seen": 66827248, + "step": 115095 + }, + { + "epoch": 17.143282692880547, + "grad_norm": 0.0390625, + "learning_rate": 0.0018263300839362411, + "loss": 0.8161, + "num_input_tokens_seen": 66830224, + "step": 115100 + }, + { + "epoch": 17.144027405421507, + "grad_norm": 0.038330078125, + "learning_rate": 0.0018253978486357447, + "loss": 0.7941, + "num_input_tokens_seen": 66833232, + "step": 115105 + }, + { + "epoch": 17.144772117962468, + "grad_norm": 0.04443359375, + "learning_rate": 0.001824465835906761, + "loss": 0.7973, + "num_input_tokens_seen": 66836432, + "step": 115110 + }, + { + "epoch": 17.145516830503425, + "grad_norm": 0.05712890625, + "learning_rate": 0.0018235340457650345, + "loss": 0.8037, + "num_input_tokens_seen": 66839184, + "step": 115115 + }, + { + "epoch": 17.146261543044385, + "grad_norm": 0.05615234375, + "learning_rate": 0.0018226024782263105, + "loss": 0.7986, + "num_input_tokens_seen": 66841968, + "step": 115120 + }, + { + "epoch": 17.147006255585342, + "grad_norm": 0.0283203125, + "learning_rate": 0.0018216711333063234, + "loss": 0.7814, + "num_input_tokens_seen": 66845072, + "step": 115125 + }, + { + "epoch": 17.147750968126303, + "grad_norm": 0.052978515625, + "learning_rate": 0.001820740011020807, + "loss": 0.8075, + "num_input_tokens_seen": 66847888, + "step": 115130 + }, + { + "epoch": 17.148495680667263, + "grad_norm": 0.0927734375, + "learning_rate": 0.0018198091113854958, + "loss": 0.8193, + "num_input_tokens_seen": 66850768, + "step": 115135 + }, + { + "epoch": 17.14924039320822, + "grad_norm": 0.083984375, + "learning_rate": 0.0018188784344161117, + "loss": 0.8023, + "num_input_tokens_seen": 66853968, + "step": 115140 + }, + { + "epoch": 17.14998510574918, + "grad_norm": 0.06298828125, + "learning_rate": 0.0018179479801283826, + "loss": 0.8138, + "num_input_tokens_seen": 66856816, + "step": 115145 + }, + { + "epoch": 17.15072981829014, + "grad_norm": 0.05224609375, + "learning_rate": 0.0018170177485380206, + "loss": 0.7762, + "num_input_tokens_seen": 66859312, + "step": 115150 + }, + { + "epoch": 17.1514745308311, + "grad_norm": 0.058349609375, + "learning_rate": 0.0018160877396607483, + "loss": 0.8124, + "num_input_tokens_seen": 66862352, + "step": 115155 + }, + { + "epoch": 17.15221924337206, + "grad_norm": 0.04541015625, + "learning_rate": 0.0018151579535122747, + "loss": 0.8029, + "num_input_tokens_seen": 66865104, + "step": 115160 + }, + { + "epoch": 17.152963955913016, + "grad_norm": 0.046630859375, + "learning_rate": 0.001814228390108304, + "loss": 0.7764, + "num_input_tokens_seen": 66868208, + "step": 115165 + }, + { + "epoch": 17.153708668453977, + "grad_norm": 0.0634765625, + "learning_rate": 0.0018132990494645468, + "loss": 0.8087, + "num_input_tokens_seen": 66871248, + "step": 115170 + }, + { + "epoch": 17.154453380994937, + "grad_norm": 0.1162109375, + "learning_rate": 0.001812369931596699, + "loss": 0.8019, + "num_input_tokens_seen": 66874192, + "step": 115175 + }, + { + "epoch": 17.155198093535894, + "grad_norm": 0.04296875, + "learning_rate": 0.001811441036520458, + "loss": 0.7825, + "num_input_tokens_seen": 66877200, + "step": 115180 + }, + { + "epoch": 17.155942806076855, + "grad_norm": 0.058349609375, + "learning_rate": 0.0018105123642515146, + "loss": 0.7976, + "num_input_tokens_seen": 66880176, + "step": 115185 + }, + { + "epoch": 17.156687518617815, + "grad_norm": 0.048828125, + "learning_rate": 0.0018095839148055614, + "loss": 0.7989, + "num_input_tokens_seen": 66882960, + "step": 115190 + }, + { + "epoch": 17.157432231158772, + "grad_norm": 0.0458984375, + "learning_rate": 0.001808655688198279, + "loss": 0.7912, + "num_input_tokens_seen": 66885936, + "step": 115195 + }, + { + "epoch": 17.158176943699733, + "grad_norm": 0.042724609375, + "learning_rate": 0.0018077276844453549, + "loss": 0.7827, + "num_input_tokens_seen": 66888688, + "step": 115200 + }, + { + "epoch": 17.15892165624069, + "grad_norm": 0.080078125, + "learning_rate": 0.0018067999035624631, + "loss": 0.8006, + "num_input_tokens_seen": 66891792, + "step": 115205 + }, + { + "epoch": 17.15966636878165, + "grad_norm": 0.05615234375, + "learning_rate": 0.0018058723455652748, + "loss": 0.7948, + "num_input_tokens_seen": 66894832, + "step": 115210 + }, + { + "epoch": 17.16041108132261, + "grad_norm": 0.05615234375, + "learning_rate": 0.001804945010469467, + "loss": 0.8306, + "num_input_tokens_seen": 66897520, + "step": 115215 + }, + { + "epoch": 17.161155793863568, + "grad_norm": 0.08251953125, + "learning_rate": 0.001804017898290699, + "loss": 0.7991, + "num_input_tokens_seen": 66900240, + "step": 115220 + }, + { + "epoch": 17.16190050640453, + "grad_norm": 0.055419921875, + "learning_rate": 0.00180309100904464, + "loss": 0.8022, + "num_input_tokens_seen": 66903152, + "step": 115225 + }, + { + "epoch": 17.16264521894549, + "grad_norm": 0.0654296875, + "learning_rate": 0.001802164342746944, + "loss": 0.7991, + "num_input_tokens_seen": 66906224, + "step": 115230 + }, + { + "epoch": 17.163389931486446, + "grad_norm": 0.08984375, + "learning_rate": 0.0018012378994132704, + "loss": 0.8014, + "num_input_tokens_seen": 66908816, + "step": 115235 + }, + { + "epoch": 17.164134644027406, + "grad_norm": 0.045166015625, + "learning_rate": 0.0018003116790592632, + "loss": 0.8073, + "num_input_tokens_seen": 66912048, + "step": 115240 + }, + { + "epoch": 17.164879356568363, + "grad_norm": 0.044677734375, + "learning_rate": 0.001799385681700578, + "loss": 0.8111, + "num_input_tokens_seen": 66914864, + "step": 115245 + }, + { + "epoch": 17.165624069109324, + "grad_norm": 0.055419921875, + "learning_rate": 0.0017984599073528561, + "loss": 0.7845, + "num_input_tokens_seen": 66917520, + "step": 115250 + }, + { + "epoch": 17.166368781650284, + "grad_norm": 0.035400390625, + "learning_rate": 0.001797534356031733, + "loss": 0.785, + "num_input_tokens_seen": 66920272, + "step": 115255 + }, + { + "epoch": 17.16711349419124, + "grad_norm": 0.0478515625, + "learning_rate": 0.0017966090277528512, + "loss": 0.7706, + "num_input_tokens_seen": 66923056, + "step": 115260 + }, + { + "epoch": 17.167858206732202, + "grad_norm": 0.06884765625, + "learning_rate": 0.0017956839225318382, + "loss": 0.7881, + "num_input_tokens_seen": 66926064, + "step": 115265 + }, + { + "epoch": 17.16860291927316, + "grad_norm": 0.05908203125, + "learning_rate": 0.0017947590403843267, + "loss": 0.7859, + "num_input_tokens_seen": 66930096, + "step": 115270 + }, + { + "epoch": 17.16934763181412, + "grad_norm": 0.07080078125, + "learning_rate": 0.0017938343813259387, + "loss": 0.7764, + "num_input_tokens_seen": 66932752, + "step": 115275 + }, + { + "epoch": 17.17009234435508, + "grad_norm": 0.078125, + "learning_rate": 0.001792909945372299, + "loss": 0.792, + "num_input_tokens_seen": 66936144, + "step": 115280 + }, + { + "epoch": 17.170837056896037, + "grad_norm": 0.08984375, + "learning_rate": 0.0017919857325390215, + "loss": 0.7947, + "num_input_tokens_seen": 66939120, + "step": 115285 + }, + { + "epoch": 17.171581769436997, + "grad_norm": 0.049072265625, + "learning_rate": 0.0017910617428417203, + "loss": 0.7766, + "num_input_tokens_seen": 66942032, + "step": 115290 + }, + { + "epoch": 17.172326481977958, + "grad_norm": 0.0439453125, + "learning_rate": 0.0017901379762960078, + "loss": 0.8001, + "num_input_tokens_seen": 66944880, + "step": 115295 + }, + { + "epoch": 17.173071194518915, + "grad_norm": 0.07861328125, + "learning_rate": 0.0017892144329174884, + "loss": 0.8438, + "num_input_tokens_seen": 66947760, + "step": 115300 + }, + { + "epoch": 17.173815907059875, + "grad_norm": 0.05615234375, + "learning_rate": 0.0017882911127217631, + "loss": 0.7905, + "num_input_tokens_seen": 66950608, + "step": 115305 + }, + { + "epoch": 17.174560619600832, + "grad_norm": 0.064453125, + "learning_rate": 0.0017873680157244308, + "loss": 0.7984, + "num_input_tokens_seen": 66953424, + "step": 115310 + }, + { + "epoch": 17.175305332141793, + "grad_norm": 0.041748046875, + "learning_rate": 0.0017864451419410893, + "loss": 0.8267, + "num_input_tokens_seen": 66956208, + "step": 115315 + }, + { + "epoch": 17.176050044682754, + "grad_norm": 0.07080078125, + "learning_rate": 0.001785522491387324, + "loss": 0.8021, + "num_input_tokens_seen": 66958928, + "step": 115320 + }, + { + "epoch": 17.17679475722371, + "grad_norm": 0.045654296875, + "learning_rate": 0.001784600064078728, + "loss": 0.8131, + "num_input_tokens_seen": 66961840, + "step": 115325 + }, + { + "epoch": 17.17753946976467, + "grad_norm": 0.043212890625, + "learning_rate": 0.0017836778600308833, + "loss": 0.7843, + "num_input_tokens_seen": 66964816, + "step": 115330 + }, + { + "epoch": 17.17828418230563, + "grad_norm": 0.04248046875, + "learning_rate": 0.001782755879259366, + "loss": 0.802, + "num_input_tokens_seen": 66967760, + "step": 115335 + }, + { + "epoch": 17.17902889484659, + "grad_norm": 0.0439453125, + "learning_rate": 0.0017818341217797572, + "loss": 0.7883, + "num_input_tokens_seen": 66970544, + "step": 115340 + }, + { + "epoch": 17.17977360738755, + "grad_norm": 0.04931640625, + "learning_rate": 0.0017809125876076242, + "loss": 0.8007, + "num_input_tokens_seen": 66973488, + "step": 115345 + }, + { + "epoch": 17.180518319928506, + "grad_norm": 0.07568359375, + "learning_rate": 0.0017799912767585413, + "loss": 0.7846, + "num_input_tokens_seen": 66976240, + "step": 115350 + }, + { + "epoch": 17.181263032469467, + "grad_norm": 0.031005859375, + "learning_rate": 0.0017790701892480643, + "loss": 0.7631, + "num_input_tokens_seen": 66979088, + "step": 115355 + }, + { + "epoch": 17.182007745010427, + "grad_norm": 0.06103515625, + "learning_rate": 0.001778149325091764, + "loss": 0.7913, + "num_input_tokens_seen": 66981488, + "step": 115360 + }, + { + "epoch": 17.182752457551384, + "grad_norm": 0.07763671875, + "learning_rate": 0.001777228684305193, + "loss": 0.8025, + "num_input_tokens_seen": 66984144, + "step": 115365 + }, + { + "epoch": 17.183497170092345, + "grad_norm": 0.052001953125, + "learning_rate": 0.0017763082669039025, + "loss": 0.7852, + "num_input_tokens_seen": 66986960, + "step": 115370 + }, + { + "epoch": 17.184241882633305, + "grad_norm": 0.080078125, + "learning_rate": 0.0017753880729034464, + "loss": 0.7817, + "num_input_tokens_seen": 66989808, + "step": 115375 + }, + { + "epoch": 17.184986595174262, + "grad_norm": 0.056640625, + "learning_rate": 0.0017744681023193626, + "loss": 0.7977, + "num_input_tokens_seen": 66992560, + "step": 115380 + }, + { + "epoch": 17.185731307715223, + "grad_norm": 0.078125, + "learning_rate": 0.0017735483551672032, + "loss": 0.7891, + "num_input_tokens_seen": 66995728, + "step": 115385 + }, + { + "epoch": 17.18647602025618, + "grad_norm": 0.057373046875, + "learning_rate": 0.0017726288314624976, + "loss": 0.8092, + "num_input_tokens_seen": 66998640, + "step": 115390 + }, + { + "epoch": 17.18722073279714, + "grad_norm": 0.043701171875, + "learning_rate": 0.001771709531220787, + "loss": 0.7925, + "num_input_tokens_seen": 67001264, + "step": 115395 + }, + { + "epoch": 17.1879654453381, + "grad_norm": 0.06884765625, + "learning_rate": 0.001770790454457597, + "loss": 0.8155, + "num_input_tokens_seen": 67003920, + "step": 115400 + }, + { + "epoch": 17.188710157879058, + "grad_norm": 0.0791015625, + "learning_rate": 0.0017698716011884585, + "loss": 0.8058, + "num_input_tokens_seen": 67006704, + "step": 115405 + }, + { + "epoch": 17.18945487042002, + "grad_norm": 0.0419921875, + "learning_rate": 0.0017689529714288942, + "loss": 0.8118, + "num_input_tokens_seen": 67009424, + "step": 115410 + }, + { + "epoch": 17.19019958296098, + "grad_norm": 0.064453125, + "learning_rate": 0.0017680345651944183, + "loss": 0.7716, + "num_input_tokens_seen": 67012208, + "step": 115415 + }, + { + "epoch": 17.190944295501936, + "grad_norm": 0.06640625, + "learning_rate": 0.0017671163825005536, + "loss": 0.7721, + "num_input_tokens_seen": 67015088, + "step": 115420 + }, + { + "epoch": 17.191689008042896, + "grad_norm": 0.07763671875, + "learning_rate": 0.001766198423362804, + "loss": 0.7796, + "num_input_tokens_seen": 67018064, + "step": 115425 + }, + { + "epoch": 17.192433720583853, + "grad_norm": 0.042236328125, + "learning_rate": 0.0017652806877966859, + "loss": 0.7896, + "num_input_tokens_seen": 67021008, + "step": 115430 + }, + { + "epoch": 17.193178433124814, + "grad_norm": 0.06103515625, + "learning_rate": 0.0017643631758176997, + "loss": 0.8105, + "num_input_tokens_seen": 67024080, + "step": 115435 + }, + { + "epoch": 17.193923145665774, + "grad_norm": 0.11083984375, + "learning_rate": 0.001763445887441345, + "loss": 0.8233, + "num_input_tokens_seen": 67027056, + "step": 115440 + }, + { + "epoch": 17.19466785820673, + "grad_norm": 0.0869140625, + "learning_rate": 0.0017625288226831193, + "loss": 0.7853, + "num_input_tokens_seen": 67029936, + "step": 115445 + }, + { + "epoch": 17.195412570747692, + "grad_norm": 0.07666015625, + "learning_rate": 0.0017616119815585117, + "loss": 0.7908, + "num_input_tokens_seen": 67032784, + "step": 115450 + }, + { + "epoch": 17.19615728328865, + "grad_norm": 0.06787109375, + "learning_rate": 0.0017606953640830186, + "loss": 0.8217, + "num_input_tokens_seen": 67035408, + "step": 115455 + }, + { + "epoch": 17.19690199582961, + "grad_norm": 0.045654296875, + "learning_rate": 0.0017597789702721172, + "loss": 0.7936, + "num_input_tokens_seen": 67038352, + "step": 115460 + }, + { + "epoch": 17.19764670837057, + "grad_norm": 0.0771484375, + "learning_rate": 0.001758862800141297, + "loss": 0.7831, + "num_input_tokens_seen": 67041264, + "step": 115465 + }, + { + "epoch": 17.198391420911527, + "grad_norm": 0.158203125, + "learning_rate": 0.0017579468537060289, + "loss": 0.8139, + "num_input_tokens_seen": 67044176, + "step": 115470 + }, + { + "epoch": 17.199136133452487, + "grad_norm": 0.036865234375, + "learning_rate": 0.0017570311309817936, + "loss": 0.8063, + "num_input_tokens_seen": 67046768, + "step": 115475 + }, + { + "epoch": 17.199880845993448, + "grad_norm": 0.1416015625, + "learning_rate": 0.0017561156319840542, + "loss": 0.7739, + "num_input_tokens_seen": 67049648, + "step": 115480 + }, + { + "epoch": 17.200625558534405, + "grad_norm": 0.055419921875, + "learning_rate": 0.0017552003567282847, + "loss": 0.7989, + "num_input_tokens_seen": 67052720, + "step": 115485 + }, + { + "epoch": 17.201370271075366, + "grad_norm": 0.05224609375, + "learning_rate": 0.0017542853052299428, + "loss": 0.8179, + "num_input_tokens_seen": 67055344, + "step": 115490 + }, + { + "epoch": 17.202114983616323, + "grad_norm": 0.0693359375, + "learning_rate": 0.0017533704775044845, + "loss": 0.7922, + "num_input_tokens_seen": 67058160, + "step": 115495 + }, + { + "epoch": 17.202859696157283, + "grad_norm": 0.046875, + "learning_rate": 0.001752455873567374, + "loss": 0.796, + "num_input_tokens_seen": 67061072, + "step": 115500 + }, + { + "epoch": 17.203604408698244, + "grad_norm": 0.052001953125, + "learning_rate": 0.0017515414934340557, + "loss": 0.7757, + "num_input_tokens_seen": 67063952, + "step": 115505 + }, + { + "epoch": 17.2043491212392, + "grad_norm": 0.103515625, + "learning_rate": 0.0017506273371199787, + "loss": 0.7685, + "num_input_tokens_seen": 67066960, + "step": 115510 + }, + { + "epoch": 17.20509383378016, + "grad_norm": 0.05517578125, + "learning_rate": 0.0017497134046405842, + "loss": 0.7963, + "num_input_tokens_seen": 67069840, + "step": 115515 + }, + { + "epoch": 17.20583854632112, + "grad_norm": 0.05615234375, + "learning_rate": 0.0017487996960113183, + "loss": 0.8012, + "num_input_tokens_seen": 67072528, + "step": 115520 + }, + { + "epoch": 17.20658325886208, + "grad_norm": 0.07421875, + "learning_rate": 0.0017478862112476133, + "loss": 0.7979, + "num_input_tokens_seen": 67075696, + "step": 115525 + }, + { + "epoch": 17.20732797140304, + "grad_norm": 0.07666015625, + "learning_rate": 0.0017469729503648988, + "loss": 0.821, + "num_input_tokens_seen": 67078352, + "step": 115530 + }, + { + "epoch": 17.208072683943996, + "grad_norm": 0.0517578125, + "learning_rate": 0.0017460599133786092, + "loss": 0.7836, + "num_input_tokens_seen": 67081424, + "step": 115535 + }, + { + "epoch": 17.208817396484957, + "grad_norm": 0.06640625, + "learning_rate": 0.0017451471003041635, + "loss": 0.8169, + "num_input_tokens_seen": 67084304, + "step": 115540 + }, + { + "epoch": 17.209562109025917, + "grad_norm": 0.10009765625, + "learning_rate": 0.0017442345111569895, + "loss": 0.783, + "num_input_tokens_seen": 67087344, + "step": 115545 + }, + { + "epoch": 17.210306821566874, + "grad_norm": 0.051025390625, + "learning_rate": 0.0017433221459524982, + "loss": 0.7882, + "num_input_tokens_seen": 67090448, + "step": 115550 + }, + { + "epoch": 17.211051534107835, + "grad_norm": 0.06982421875, + "learning_rate": 0.0017424100047061075, + "loss": 0.7958, + "num_input_tokens_seen": 67093392, + "step": 115555 + }, + { + "epoch": 17.211796246648795, + "grad_norm": 0.07666015625, + "learning_rate": 0.0017414980874332264, + "loss": 0.8092, + "num_input_tokens_seen": 67096560, + "step": 115560 + }, + { + "epoch": 17.212540959189752, + "grad_norm": 0.123046875, + "learning_rate": 0.0017405863941492594, + "loss": 0.7949, + "num_input_tokens_seen": 67099376, + "step": 115565 + }, + { + "epoch": 17.213285671730713, + "grad_norm": 0.095703125, + "learning_rate": 0.0017396749248696091, + "loss": 0.7889, + "num_input_tokens_seen": 67102096, + "step": 115570 + }, + { + "epoch": 17.21403038427167, + "grad_norm": 0.09521484375, + "learning_rate": 0.00173876367960967, + "loss": 0.8002, + "num_input_tokens_seen": 67104848, + "step": 115575 + }, + { + "epoch": 17.21477509681263, + "grad_norm": 0.068359375, + "learning_rate": 0.0017378526583848447, + "loss": 0.785, + "num_input_tokens_seen": 67107824, + "step": 115580 + }, + { + "epoch": 17.21551980935359, + "grad_norm": 0.04931640625, + "learning_rate": 0.0017369418612105158, + "loss": 0.7953, + "num_input_tokens_seen": 67110640, + "step": 115585 + }, + { + "epoch": 17.216264521894548, + "grad_norm": 0.061767578125, + "learning_rate": 0.0017360312881020779, + "loss": 0.7772, + "num_input_tokens_seen": 67113520, + "step": 115590 + }, + { + "epoch": 17.21700923443551, + "grad_norm": 0.10107421875, + "learning_rate": 0.0017351209390749067, + "loss": 0.7681, + "num_input_tokens_seen": 67116400, + "step": 115595 + }, + { + "epoch": 17.217753946976465, + "grad_norm": 0.330078125, + "learning_rate": 0.0017342108141443883, + "loss": 0.8243, + "num_input_tokens_seen": 67119440, + "step": 115600 + }, + { + "epoch": 17.218498659517426, + "grad_norm": 0.1064453125, + "learning_rate": 0.0017333009133258975, + "loss": 0.8136, + "num_input_tokens_seen": 67122512, + "step": 115605 + }, + { + "epoch": 17.219243372058386, + "grad_norm": 0.080078125, + "learning_rate": 0.0017323912366347981, + "loss": 0.8073, + "num_input_tokens_seen": 67125264, + "step": 115610 + }, + { + "epoch": 17.219988084599343, + "grad_norm": 0.055419921875, + "learning_rate": 0.0017314817840864698, + "loss": 0.8061, + "num_input_tokens_seen": 67128144, + "step": 115615 + }, + { + "epoch": 17.220732797140304, + "grad_norm": 0.06689453125, + "learning_rate": 0.0017305725556962669, + "loss": 0.7782, + "num_input_tokens_seen": 67131248, + "step": 115620 + }, + { + "epoch": 17.221477509681264, + "grad_norm": 0.0703125, + "learning_rate": 0.001729663551479557, + "loss": 0.7809, + "num_input_tokens_seen": 67134256, + "step": 115625 + }, + { + "epoch": 17.22222222222222, + "grad_norm": 0.2158203125, + "learning_rate": 0.0017287547714516947, + "loss": 0.8011, + "num_input_tokens_seen": 67137264, + "step": 115630 + }, + { + "epoch": 17.222966934763182, + "grad_norm": 0.09130859375, + "learning_rate": 0.0017278462156280327, + "loss": 0.807, + "num_input_tokens_seen": 67139984, + "step": 115635 + }, + { + "epoch": 17.22371164730414, + "grad_norm": 0.08935546875, + "learning_rate": 0.0017269378840239153, + "loss": 0.8041, + "num_input_tokens_seen": 67142736, + "step": 115640 + }, + { + "epoch": 17.2244563598451, + "grad_norm": 0.04150390625, + "learning_rate": 0.001726029776654695, + "loss": 0.7818, + "num_input_tokens_seen": 67145648, + "step": 115645 + }, + { + "epoch": 17.22520107238606, + "grad_norm": 0.0771484375, + "learning_rate": 0.0017251218935357114, + "loss": 0.7975, + "num_input_tokens_seen": 67148496, + "step": 115650 + }, + { + "epoch": 17.225945784927017, + "grad_norm": 0.06689453125, + "learning_rate": 0.0017242142346822992, + "loss": 0.7934, + "num_input_tokens_seen": 67151568, + "step": 115655 + }, + { + "epoch": 17.226690497467978, + "grad_norm": 0.07568359375, + "learning_rate": 0.0017233068001097972, + "loss": 0.803, + "num_input_tokens_seen": 67154352, + "step": 115660 + }, + { + "epoch": 17.227435210008938, + "grad_norm": 0.053955078125, + "learning_rate": 0.0017223995898335287, + "loss": 0.7852, + "num_input_tokens_seen": 67157264, + "step": 115665 + }, + { + "epoch": 17.228179922549895, + "grad_norm": 0.1337890625, + "learning_rate": 0.0017214926038688293, + "loss": 0.7733, + "num_input_tokens_seen": 67159824, + "step": 115670 + }, + { + "epoch": 17.228924635090856, + "grad_norm": 0.055908203125, + "learning_rate": 0.001720585842231012, + "loss": 0.8082, + "num_input_tokens_seen": 67162640, + "step": 115675 + }, + { + "epoch": 17.229669347631813, + "grad_norm": 0.0517578125, + "learning_rate": 0.0017196793049354046, + "loss": 0.7972, + "num_input_tokens_seen": 67165456, + "step": 115680 + }, + { + "epoch": 17.230414060172773, + "grad_norm": 0.064453125, + "learning_rate": 0.0017187729919973132, + "loss": 0.7699, + "num_input_tokens_seen": 67168176, + "step": 115685 + }, + { + "epoch": 17.231158772713734, + "grad_norm": 0.10009765625, + "learning_rate": 0.0017178669034320587, + "loss": 0.7767, + "num_input_tokens_seen": 67171184, + "step": 115690 + }, + { + "epoch": 17.23190348525469, + "grad_norm": 0.0654296875, + "learning_rate": 0.001716961039254942, + "loss": 0.797, + "num_input_tokens_seen": 67173808, + "step": 115695 + }, + { + "epoch": 17.23264819779565, + "grad_norm": 0.0947265625, + "learning_rate": 0.0017160553994812681, + "loss": 0.79, + "num_input_tokens_seen": 67176656, + "step": 115700 + }, + { + "epoch": 17.23339291033661, + "grad_norm": 0.057373046875, + "learning_rate": 0.001715149984126338, + "loss": 0.7936, + "num_input_tokens_seen": 67179664, + "step": 115705 + }, + { + "epoch": 17.23413762287757, + "grad_norm": 0.06884765625, + "learning_rate": 0.0017142447932054438, + "loss": 0.7823, + "num_input_tokens_seen": 67182384, + "step": 115710 + }, + { + "epoch": 17.23488233541853, + "grad_norm": 0.048095703125, + "learning_rate": 0.001713339826733884, + "loss": 0.8138, + "num_input_tokens_seen": 67185488, + "step": 115715 + }, + { + "epoch": 17.235627047959486, + "grad_norm": 0.068359375, + "learning_rate": 0.0017124350847269392, + "loss": 0.819, + "num_input_tokens_seen": 67188336, + "step": 115720 + }, + { + "epoch": 17.236371760500447, + "grad_norm": 0.0625, + "learning_rate": 0.0017115305671999042, + "loss": 0.7839, + "num_input_tokens_seen": 67191376, + "step": 115725 + }, + { + "epoch": 17.237116473041407, + "grad_norm": 0.06689453125, + "learning_rate": 0.0017106262741680533, + "loss": 0.8016, + "num_input_tokens_seen": 67194288, + "step": 115730 + }, + { + "epoch": 17.237861185582364, + "grad_norm": 0.0771484375, + "learning_rate": 0.0017097222056466608, + "loss": 0.8199, + "num_input_tokens_seen": 67197296, + "step": 115735 + }, + { + "epoch": 17.238605898123325, + "grad_norm": 0.06640625, + "learning_rate": 0.0017088183616510082, + "loss": 0.7915, + "num_input_tokens_seen": 67200656, + "step": 115740 + }, + { + "epoch": 17.239350610664285, + "grad_norm": 0.08251953125, + "learning_rate": 0.0017079147421963563, + "loss": 0.8178, + "num_input_tokens_seen": 67203568, + "step": 115745 + }, + { + "epoch": 17.240095323205242, + "grad_norm": 0.0419921875, + "learning_rate": 0.0017070113472979797, + "loss": 0.7987, + "num_input_tokens_seen": 67206608, + "step": 115750 + }, + { + "epoch": 17.240840035746203, + "grad_norm": 0.041259765625, + "learning_rate": 0.0017061081769711329, + "loss": 0.8127, + "num_input_tokens_seen": 67209520, + "step": 115755 + }, + { + "epoch": 17.24158474828716, + "grad_norm": 0.07080078125, + "learning_rate": 0.00170520523123108, + "loss": 0.8098, + "num_input_tokens_seen": 67212240, + "step": 115760 + }, + { + "epoch": 17.24232946082812, + "grad_norm": 0.4765625, + "learning_rate": 0.001704302510093074, + "loss": 0.8243, + "num_input_tokens_seen": 67215440, + "step": 115765 + }, + { + "epoch": 17.24307417336908, + "grad_norm": 0.038330078125, + "learning_rate": 0.0017034000135723614, + "loss": 0.7854, + "num_input_tokens_seen": 67218512, + "step": 115770 + }, + { + "epoch": 17.243818885910038, + "grad_norm": 0.048828125, + "learning_rate": 0.0017024977416841929, + "loss": 0.7833, + "num_input_tokens_seen": 67221392, + "step": 115775 + }, + { + "epoch": 17.244563598451, + "grad_norm": 0.0400390625, + "learning_rate": 0.0017015956944438065, + "loss": 0.7868, + "num_input_tokens_seen": 67224336, + "step": 115780 + }, + { + "epoch": 17.245308310991955, + "grad_norm": 0.03564453125, + "learning_rate": 0.0017006938718664483, + "loss": 0.7851, + "num_input_tokens_seen": 67227280, + "step": 115785 + }, + { + "epoch": 17.246053023532916, + "grad_norm": 0.062255859375, + "learning_rate": 0.0016997922739673476, + "loss": 0.8246, + "num_input_tokens_seen": 67230128, + "step": 115790 + }, + { + "epoch": 17.246797736073876, + "grad_norm": 0.038330078125, + "learning_rate": 0.0016988909007617408, + "loss": 0.7977, + "num_input_tokens_seen": 67233072, + "step": 115795 + }, + { + "epoch": 17.247542448614833, + "grad_norm": 0.033447265625, + "learning_rate": 0.0016979897522648524, + "loss": 0.7879, + "num_input_tokens_seen": 67235952, + "step": 115800 + }, + { + "epoch": 17.248287161155794, + "grad_norm": 0.0439453125, + "learning_rate": 0.0016970888284919083, + "loss": 0.7993, + "num_input_tokens_seen": 67238896, + "step": 115805 + }, + { + "epoch": 17.249031873696755, + "grad_norm": 0.08154296875, + "learning_rate": 0.00169618812945813, + "loss": 0.7712, + "num_input_tokens_seen": 67241392, + "step": 115810 + }, + { + "epoch": 17.24977658623771, + "grad_norm": 0.08837890625, + "learning_rate": 0.0016952876551787266, + "loss": 0.8151, + "num_input_tokens_seen": 67244144, + "step": 115815 + }, + { + "epoch": 17.250521298778672, + "grad_norm": 0.04052734375, + "learning_rate": 0.001694387405668921, + "loss": 0.7918, + "num_input_tokens_seen": 67247216, + "step": 115820 + }, + { + "epoch": 17.25126601131963, + "grad_norm": 0.05859375, + "learning_rate": 0.0016934873809439132, + "loss": 0.8221, + "num_input_tokens_seen": 67250256, + "step": 115825 + }, + { + "epoch": 17.25201072386059, + "grad_norm": 0.050537109375, + "learning_rate": 0.0016925875810189154, + "loss": 0.7656, + "num_input_tokens_seen": 67253040, + "step": 115830 + }, + { + "epoch": 17.25275543640155, + "grad_norm": 0.080078125, + "learning_rate": 0.0016916880059091243, + "loss": 0.7908, + "num_input_tokens_seen": 67255952, + "step": 115835 + }, + { + "epoch": 17.253500148942507, + "grad_norm": 0.037353515625, + "learning_rate": 0.001690788655629739, + "loss": 0.7971, + "num_input_tokens_seen": 67258640, + "step": 115840 + }, + { + "epoch": 17.254244861483468, + "grad_norm": 0.044921875, + "learning_rate": 0.0016898895301959492, + "loss": 0.7897, + "num_input_tokens_seen": 67261584, + "step": 115845 + }, + { + "epoch": 17.254989574024428, + "grad_norm": 0.046630859375, + "learning_rate": 0.0016889906296229512, + "loss": 0.8107, + "num_input_tokens_seen": 67264528, + "step": 115850 + }, + { + "epoch": 17.255734286565385, + "grad_norm": 0.07470703125, + "learning_rate": 0.0016880919539259292, + "loss": 0.7869, + "num_input_tokens_seen": 67267536, + "step": 115855 + }, + { + "epoch": 17.256478999106346, + "grad_norm": 0.09228515625, + "learning_rate": 0.0016871935031200595, + "loss": 0.7879, + "num_input_tokens_seen": 67270320, + "step": 115860 + }, + { + "epoch": 17.257223711647303, + "grad_norm": 0.052978515625, + "learning_rate": 0.001686295277220527, + "loss": 0.7855, + "num_input_tokens_seen": 67273104, + "step": 115865 + }, + { + "epoch": 17.257968424188263, + "grad_norm": 0.06591796875, + "learning_rate": 0.0016853972762425039, + "loss": 0.7862, + "num_input_tokens_seen": 67276080, + "step": 115870 + }, + { + "epoch": 17.258713136729224, + "grad_norm": 0.041015625, + "learning_rate": 0.0016844995002011618, + "loss": 0.7941, + "num_input_tokens_seen": 67279152, + "step": 115875 + }, + { + "epoch": 17.25945784927018, + "grad_norm": 0.05126953125, + "learning_rate": 0.0016836019491116671, + "loss": 0.8021, + "num_input_tokens_seen": 67282192, + "step": 115880 + }, + { + "epoch": 17.26020256181114, + "grad_norm": 0.064453125, + "learning_rate": 0.001682704622989184, + "loss": 0.7678, + "num_input_tokens_seen": 67284880, + "step": 115885 + }, + { + "epoch": 17.2609472743521, + "grad_norm": 0.08740234375, + "learning_rate": 0.001681807521848872, + "loss": 0.8194, + "num_input_tokens_seen": 67287728, + "step": 115890 + }, + { + "epoch": 17.26169198689306, + "grad_norm": 0.033447265625, + "learning_rate": 0.0016809106457058858, + "loss": 0.8089, + "num_input_tokens_seen": 67290576, + "step": 115895 + }, + { + "epoch": 17.26243669943402, + "grad_norm": 0.0751953125, + "learning_rate": 0.0016800139945753784, + "loss": 0.8225, + "num_input_tokens_seen": 67293808, + "step": 115900 + }, + { + "epoch": 17.263181411974976, + "grad_norm": 0.0771484375, + "learning_rate": 0.0016791175684724923, + "loss": 0.7962, + "num_input_tokens_seen": 67296432, + "step": 115905 + }, + { + "epoch": 17.263926124515937, + "grad_norm": 0.054931640625, + "learning_rate": 0.0016782213674123808, + "loss": 0.8026, + "num_input_tokens_seen": 67299344, + "step": 115910 + }, + { + "epoch": 17.264670837056897, + "grad_norm": 0.07080078125, + "learning_rate": 0.0016773253914101748, + "loss": 0.7739, + "num_input_tokens_seen": 67302384, + "step": 115915 + }, + { + "epoch": 17.265415549597854, + "grad_norm": 0.03955078125, + "learning_rate": 0.001676429640481019, + "loss": 0.809, + "num_input_tokens_seen": 67305552, + "step": 115920 + }, + { + "epoch": 17.266160262138815, + "grad_norm": 0.052978515625, + "learning_rate": 0.0016755341146400414, + "loss": 0.8269, + "num_input_tokens_seen": 67308336, + "step": 115925 + }, + { + "epoch": 17.266904974679775, + "grad_norm": 0.043212890625, + "learning_rate": 0.0016746388139023747, + "loss": 0.806, + "num_input_tokens_seen": 67311440, + "step": 115930 + }, + { + "epoch": 17.267649687220732, + "grad_norm": 0.10302734375, + "learning_rate": 0.0016737437382831422, + "loss": 0.7833, + "num_input_tokens_seen": 67314352, + "step": 115935 + }, + { + "epoch": 17.268394399761693, + "grad_norm": 0.0498046875, + "learning_rate": 0.0016728488877974612, + "loss": 0.7739, + "num_input_tokens_seen": 67317168, + "step": 115940 + }, + { + "epoch": 17.26913911230265, + "grad_norm": 0.04052734375, + "learning_rate": 0.0016719542624604566, + "loss": 0.7871, + "num_input_tokens_seen": 67319920, + "step": 115945 + }, + { + "epoch": 17.26988382484361, + "grad_norm": 0.06298828125, + "learning_rate": 0.0016710598622872346, + "loss": 0.775, + "num_input_tokens_seen": 67322960, + "step": 115950 + }, + { + "epoch": 17.27062853738457, + "grad_norm": 0.051025390625, + "learning_rate": 0.0016701656872929148, + "loss": 0.8023, + "num_input_tokens_seen": 67325776, + "step": 115955 + }, + { + "epoch": 17.271373249925528, + "grad_norm": 0.0693359375, + "learning_rate": 0.0016692717374925969, + "loss": 0.7967, + "num_input_tokens_seen": 67329008, + "step": 115960 + }, + { + "epoch": 17.27211796246649, + "grad_norm": 0.072265625, + "learning_rate": 0.0016683780129013836, + "loss": 0.7858, + "num_input_tokens_seen": 67331760, + "step": 115965 + }, + { + "epoch": 17.272862675007445, + "grad_norm": 0.09521484375, + "learning_rate": 0.0016674845135343746, + "loss": 0.7782, + "num_input_tokens_seen": 67334864, + "step": 115970 + }, + { + "epoch": 17.273607387548406, + "grad_norm": 0.053955078125, + "learning_rate": 0.0016665912394066595, + "loss": 0.7911, + "num_input_tokens_seen": 67337840, + "step": 115975 + }, + { + "epoch": 17.274352100089367, + "grad_norm": 0.03173828125, + "learning_rate": 0.0016656981905333395, + "loss": 0.7791, + "num_input_tokens_seen": 67340752, + "step": 115980 + }, + { + "epoch": 17.275096812630323, + "grad_norm": 0.04150390625, + "learning_rate": 0.001664805366929491, + "loss": 0.7901, + "num_input_tokens_seen": 67343856, + "step": 115985 + }, + { + "epoch": 17.275841525171284, + "grad_norm": 0.047119140625, + "learning_rate": 0.0016639127686102068, + "loss": 0.8028, + "num_input_tokens_seen": 67346704, + "step": 115990 + }, + { + "epoch": 17.276586237712245, + "grad_norm": 0.06591796875, + "learning_rate": 0.0016630203955905597, + "loss": 0.7945, + "num_input_tokens_seen": 67349360, + "step": 115995 + }, + { + "epoch": 17.2773309502532, + "grad_norm": 0.053466796875, + "learning_rate": 0.0016621282478856297, + "loss": 0.7999, + "num_input_tokens_seen": 67352752, + "step": 116000 + }, + { + "epoch": 17.278075662794162, + "grad_norm": 0.04150390625, + "learning_rate": 0.0016612363255104844, + "loss": 0.8026, + "num_input_tokens_seen": 67355440, + "step": 116005 + }, + { + "epoch": 17.27882037533512, + "grad_norm": 0.03857421875, + "learning_rate": 0.0016603446284801987, + "loss": 0.7852, + "num_input_tokens_seen": 67358256, + "step": 116010 + }, + { + "epoch": 17.27956508787608, + "grad_norm": 0.06103515625, + "learning_rate": 0.0016594531568098303, + "loss": 0.7849, + "num_input_tokens_seen": 67361072, + "step": 116015 + }, + { + "epoch": 17.28030980041704, + "grad_norm": 0.05078125, + "learning_rate": 0.0016585619105144406, + "loss": 0.7897, + "num_input_tokens_seen": 67363888, + "step": 116020 + }, + { + "epoch": 17.281054512957997, + "grad_norm": 0.026611328125, + "learning_rate": 0.0016576708896090908, + "loss": 0.7726, + "num_input_tokens_seen": 67366768, + "step": 116025 + }, + { + "epoch": 17.281799225498958, + "grad_norm": 0.0888671875, + "learning_rate": 0.0016567800941088306, + "loss": 0.7956, + "num_input_tokens_seen": 67369872, + "step": 116030 + }, + { + "epoch": 17.282543938039918, + "grad_norm": 0.0673828125, + "learning_rate": 0.0016558895240287095, + "loss": 0.807, + "num_input_tokens_seen": 67372944, + "step": 116035 + }, + { + "epoch": 17.283288650580875, + "grad_norm": 0.05810546875, + "learning_rate": 0.0016549991793837692, + "loss": 0.782, + "num_input_tokens_seen": 67376208, + "step": 116040 + }, + { + "epoch": 17.284033363121836, + "grad_norm": 0.0634765625, + "learning_rate": 0.0016541090601890573, + "loss": 0.7833, + "num_input_tokens_seen": 67378832, + "step": 116045 + }, + { + "epoch": 17.284778075662793, + "grad_norm": 0.05322265625, + "learning_rate": 0.0016532191664596085, + "loss": 0.7974, + "num_input_tokens_seen": 67381776, + "step": 116050 + }, + { + "epoch": 17.285522788203753, + "grad_norm": 0.07568359375, + "learning_rate": 0.0016523294982104541, + "loss": 0.7982, + "num_input_tokens_seen": 67384784, + "step": 116055 + }, + { + "epoch": 17.286267500744714, + "grad_norm": 0.07568359375, + "learning_rate": 0.0016514400554566305, + "loss": 0.7954, + "num_input_tokens_seen": 67387824, + "step": 116060 + }, + { + "epoch": 17.28701221328567, + "grad_norm": 0.04345703125, + "learning_rate": 0.0016505508382131555, + "loss": 0.8011, + "num_input_tokens_seen": 67390832, + "step": 116065 + }, + { + "epoch": 17.28775692582663, + "grad_norm": 0.058837890625, + "learning_rate": 0.0016496618464950606, + "loss": 0.7942, + "num_input_tokens_seen": 67393808, + "step": 116070 + }, + { + "epoch": 17.288501638367592, + "grad_norm": 0.080078125, + "learning_rate": 0.0016487730803173572, + "loss": 0.7947, + "num_input_tokens_seen": 67396944, + "step": 116075 + }, + { + "epoch": 17.28924635090855, + "grad_norm": 0.05322265625, + "learning_rate": 0.0016478845396950663, + "loss": 0.7932, + "num_input_tokens_seen": 67399696, + "step": 116080 + }, + { + "epoch": 17.28999106344951, + "grad_norm": 0.11328125, + "learning_rate": 0.001646996224643193, + "loss": 0.7903, + "num_input_tokens_seen": 67402288, + "step": 116085 + }, + { + "epoch": 17.290735775990466, + "grad_norm": 0.1025390625, + "learning_rate": 0.0016461081351767496, + "loss": 0.7874, + "num_input_tokens_seen": 67405136, + "step": 116090 + }, + { + "epoch": 17.291480488531427, + "grad_norm": 0.083984375, + "learning_rate": 0.0016452202713107367, + "loss": 0.7888, + "num_input_tokens_seen": 67407920, + "step": 116095 + }, + { + "epoch": 17.292225201072387, + "grad_norm": 0.0634765625, + "learning_rate": 0.0016443326330601532, + "loss": 0.7843, + "num_input_tokens_seen": 67410832, + "step": 116100 + }, + { + "epoch": 17.292969913613344, + "grad_norm": 0.2001953125, + "learning_rate": 0.0016434452204399973, + "loss": 0.8064, + "num_input_tokens_seen": 67413616, + "step": 116105 + }, + { + "epoch": 17.293714626154305, + "grad_norm": 0.072265625, + "learning_rate": 0.0016425580334652556, + "loss": 0.7958, + "num_input_tokens_seen": 67416528, + "step": 116110 + }, + { + "epoch": 17.294459338695262, + "grad_norm": 0.05126953125, + "learning_rate": 0.0016416710721509225, + "loss": 0.7898, + "num_input_tokens_seen": 67419504, + "step": 116115 + }, + { + "epoch": 17.295204051236222, + "grad_norm": 0.048583984375, + "learning_rate": 0.0016407843365119777, + "loss": 0.7828, + "num_input_tokens_seen": 67422384, + "step": 116120 + }, + { + "epoch": 17.295948763777183, + "grad_norm": 0.052734375, + "learning_rate": 0.0016398978265634077, + "loss": 0.8025, + "num_input_tokens_seen": 67425488, + "step": 116125 + }, + { + "epoch": 17.29669347631814, + "grad_norm": 0.04443359375, + "learning_rate": 0.0016390115423201839, + "loss": 0.7926, + "num_input_tokens_seen": 67428592, + "step": 116130 + }, + { + "epoch": 17.2974381888591, + "grad_norm": 0.053466796875, + "learning_rate": 0.0016381254837972775, + "loss": 0.8057, + "num_input_tokens_seen": 67431312, + "step": 116135 + }, + { + "epoch": 17.29818290140006, + "grad_norm": 0.06640625, + "learning_rate": 0.0016372396510096648, + "loss": 0.8132, + "num_input_tokens_seen": 67434128, + "step": 116140 + }, + { + "epoch": 17.298927613941018, + "grad_norm": 0.04736328125, + "learning_rate": 0.0016363540439723022, + "loss": 0.8063, + "num_input_tokens_seen": 67436912, + "step": 116145 + }, + { + "epoch": 17.29967232648198, + "grad_norm": 0.11865234375, + "learning_rate": 0.0016354686627001612, + "loss": 0.7752, + "num_input_tokens_seen": 67439760, + "step": 116150 + }, + { + "epoch": 17.300417039022935, + "grad_norm": 0.091796875, + "learning_rate": 0.00163458350720819, + "loss": 0.7826, + "num_input_tokens_seen": 67442672, + "step": 116155 + }, + { + "epoch": 17.301161751563896, + "grad_norm": 0.033935546875, + "learning_rate": 0.0016336985775113476, + "loss": 0.7788, + "num_input_tokens_seen": 67445328, + "step": 116160 + }, + { + "epoch": 17.301906464104857, + "grad_norm": 0.07080078125, + "learning_rate": 0.0016328138736245845, + "loss": 0.7993, + "num_input_tokens_seen": 67448176, + "step": 116165 + }, + { + "epoch": 17.302651176645814, + "grad_norm": 0.09130859375, + "learning_rate": 0.0016319293955628466, + "loss": 0.7925, + "num_input_tokens_seen": 67450960, + "step": 116170 + }, + { + "epoch": 17.303395889186774, + "grad_norm": 0.04541015625, + "learning_rate": 0.001631045143341072, + "loss": 0.7917, + "num_input_tokens_seen": 67454000, + "step": 116175 + }, + { + "epoch": 17.304140601727735, + "grad_norm": 0.1083984375, + "learning_rate": 0.0016301611169742007, + "loss": 0.7926, + "num_input_tokens_seen": 67456976, + "step": 116180 + }, + { + "epoch": 17.30488531426869, + "grad_norm": 0.0869140625, + "learning_rate": 0.0016292773164771723, + "loss": 0.7951, + "num_input_tokens_seen": 67459856, + "step": 116185 + }, + { + "epoch": 17.305630026809652, + "grad_norm": 0.056640625, + "learning_rate": 0.0016283937418649097, + "loss": 0.7949, + "num_input_tokens_seen": 67462640, + "step": 116190 + }, + { + "epoch": 17.30637473935061, + "grad_norm": 0.09375, + "learning_rate": 0.0016275103931523492, + "loss": 0.7921, + "num_input_tokens_seen": 67465840, + "step": 116195 + }, + { + "epoch": 17.30711945189157, + "grad_norm": 0.053466796875, + "learning_rate": 0.001626627270354406, + "loss": 0.8808, + "num_input_tokens_seen": 67468656, + "step": 116200 + }, + { + "epoch": 17.30786416443253, + "grad_norm": 0.0927734375, + "learning_rate": 0.0016257443734860044, + "loss": 0.7998, + "num_input_tokens_seen": 67471600, + "step": 116205 + }, + { + "epoch": 17.308608876973487, + "grad_norm": 0.047607421875, + "learning_rate": 0.0016248617025620608, + "loss": 0.8312, + "num_input_tokens_seen": 67474256, + "step": 116210 + }, + { + "epoch": 17.309353589514448, + "grad_norm": 0.044189453125, + "learning_rate": 0.0016239792575974802, + "loss": 0.7969, + "num_input_tokens_seen": 67477072, + "step": 116215 + }, + { + "epoch": 17.31009830205541, + "grad_norm": 0.0537109375, + "learning_rate": 0.001623097038607179, + "loss": 0.7768, + "num_input_tokens_seen": 67479920, + "step": 116220 + }, + { + "epoch": 17.310843014596365, + "grad_norm": 0.047607421875, + "learning_rate": 0.001622215045606057, + "loss": 0.7845, + "num_input_tokens_seen": 67482704, + "step": 116225 + }, + { + "epoch": 17.311587727137326, + "grad_norm": 0.040283203125, + "learning_rate": 0.0016213332786090167, + "loss": 0.795, + "num_input_tokens_seen": 67485584, + "step": 116230 + }, + { + "epoch": 17.312332439678283, + "grad_norm": 0.043212890625, + "learning_rate": 0.0016204517376309485, + "loss": 0.7948, + "num_input_tokens_seen": 67488432, + "step": 116235 + }, + { + "epoch": 17.313077152219243, + "grad_norm": 0.05029296875, + "learning_rate": 0.0016195704226867535, + "loss": 0.798, + "num_input_tokens_seen": 67491248, + "step": 116240 + }, + { + "epoch": 17.313821864760204, + "grad_norm": 0.0703125, + "learning_rate": 0.0016186893337913149, + "loss": 0.7837, + "num_input_tokens_seen": 67494256, + "step": 116245 + }, + { + "epoch": 17.31456657730116, + "grad_norm": 0.052734375, + "learning_rate": 0.0016178084709595226, + "loss": 0.8004, + "num_input_tokens_seen": 67496880, + "step": 116250 + }, + { + "epoch": 17.31531128984212, + "grad_norm": 0.05322265625, + "learning_rate": 0.0016169278342062542, + "loss": 0.7717, + "num_input_tokens_seen": 67499760, + "step": 116255 + }, + { + "epoch": 17.316056002383082, + "grad_norm": 0.042724609375, + "learning_rate": 0.001616047423546385, + "loss": 0.8096, + "num_input_tokens_seen": 67502544, + "step": 116260 + }, + { + "epoch": 17.31680071492404, + "grad_norm": 0.142578125, + "learning_rate": 0.0016151672389947945, + "loss": 0.7744, + "num_input_tokens_seen": 67505360, + "step": 116265 + }, + { + "epoch": 17.317545427465, + "grad_norm": 0.05712890625, + "learning_rate": 0.0016142872805663478, + "loss": 0.7795, + "num_input_tokens_seen": 67508336, + "step": 116270 + }, + { + "epoch": 17.318290140005956, + "grad_norm": 0.04833984375, + "learning_rate": 0.0016134075482759158, + "loss": 0.7771, + "num_input_tokens_seen": 67511376, + "step": 116275 + }, + { + "epoch": 17.319034852546917, + "grad_norm": 0.0869140625, + "learning_rate": 0.0016125280421383536, + "loss": 0.8228, + "num_input_tokens_seen": 67514448, + "step": 116280 + }, + { + "epoch": 17.319779565087877, + "grad_norm": 0.03759765625, + "learning_rate": 0.0016116487621685275, + "loss": 0.8, + "num_input_tokens_seen": 67517104, + "step": 116285 + }, + { + "epoch": 17.320524277628834, + "grad_norm": 0.04931640625, + "learning_rate": 0.0016107697083812894, + "loss": 0.8171, + "num_input_tokens_seen": 67519856, + "step": 116290 + }, + { + "epoch": 17.321268990169795, + "grad_norm": 0.05419921875, + "learning_rate": 0.001609890880791487, + "loss": 0.8041, + "num_input_tokens_seen": 67522832, + "step": 116295 + }, + { + "epoch": 17.322013702710752, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00160901227941397, + "loss": 0.8192, + "num_input_tokens_seen": 67525712, + "step": 116300 + }, + { + "epoch": 17.322758415251712, + "grad_norm": 0.0615234375, + "learning_rate": 0.0016081339042635784, + "loss": 0.8008, + "num_input_tokens_seen": 67528528, + "step": 116305 + }, + { + "epoch": 17.323503127792673, + "grad_norm": 0.04052734375, + "learning_rate": 0.0016072557553551553, + "loss": 0.7797, + "num_input_tokens_seen": 67531312, + "step": 116310 + }, + { + "epoch": 17.32424784033363, + "grad_norm": 0.04736328125, + "learning_rate": 0.0016063778327035305, + "loss": 0.7981, + "num_input_tokens_seen": 67534032, + "step": 116315 + }, + { + "epoch": 17.32499255287459, + "grad_norm": 0.049072265625, + "learning_rate": 0.0016055001363235438, + "loss": 0.8366, + "num_input_tokens_seen": 67536752, + "step": 116320 + }, + { + "epoch": 17.32573726541555, + "grad_norm": 0.0380859375, + "learning_rate": 0.0016046226662300167, + "loss": 0.7818, + "num_input_tokens_seen": 67539792, + "step": 116325 + }, + { + "epoch": 17.326481977956508, + "grad_norm": 0.07568359375, + "learning_rate": 0.0016037454224377772, + "loss": 0.7813, + "num_input_tokens_seen": 67542704, + "step": 116330 + }, + { + "epoch": 17.32722669049747, + "grad_norm": 0.09912109375, + "learning_rate": 0.0016028684049616438, + "loss": 0.797, + "num_input_tokens_seen": 67545712, + "step": 116335 + }, + { + "epoch": 17.327971403038426, + "grad_norm": 0.04150390625, + "learning_rate": 0.0016019916138164292, + "loss": 0.7926, + "num_input_tokens_seen": 67548592, + "step": 116340 + }, + { + "epoch": 17.328716115579386, + "grad_norm": 0.049560546875, + "learning_rate": 0.0016011150490169517, + "loss": 0.802, + "num_input_tokens_seen": 67551632, + "step": 116345 + }, + { + "epoch": 17.329460828120347, + "grad_norm": 0.057861328125, + "learning_rate": 0.0016002387105780145, + "loss": 0.7954, + "num_input_tokens_seen": 67554800, + "step": 116350 + }, + { + "epoch": 17.330205540661304, + "grad_norm": 0.06298828125, + "learning_rate": 0.0015993625985144294, + "loss": 0.8172, + "num_input_tokens_seen": 67557552, + "step": 116355 + }, + { + "epoch": 17.330950253202264, + "grad_norm": 0.049560546875, + "learning_rate": 0.0015984867128409923, + "loss": 0.7903, + "num_input_tokens_seen": 67560944, + "step": 116360 + }, + { + "epoch": 17.331694965743225, + "grad_norm": 0.0693359375, + "learning_rate": 0.0015976110535725018, + "loss": 0.8029, + "num_input_tokens_seen": 67563664, + "step": 116365 + }, + { + "epoch": 17.33243967828418, + "grad_norm": 0.056396484375, + "learning_rate": 0.0015967356207237475, + "loss": 0.8139, + "num_input_tokens_seen": 67566320, + "step": 116370 + }, + { + "epoch": 17.333184390825142, + "grad_norm": 0.06884765625, + "learning_rate": 0.001595860414309526, + "loss": 0.8179, + "num_input_tokens_seen": 67569136, + "step": 116375 + }, + { + "epoch": 17.3339291033661, + "grad_norm": 0.03076171875, + "learning_rate": 0.001594985434344619, + "loss": 0.8097, + "num_input_tokens_seen": 67571792, + "step": 116380 + }, + { + "epoch": 17.33467381590706, + "grad_norm": 0.0595703125, + "learning_rate": 0.0015941106808438077, + "loss": 0.8105, + "num_input_tokens_seen": 67574832, + "step": 116385 + }, + { + "epoch": 17.33541852844802, + "grad_norm": 0.043212890625, + "learning_rate": 0.001593236153821872, + "loss": 0.8065, + "num_input_tokens_seen": 67577904, + "step": 116390 + }, + { + "epoch": 17.336163240988977, + "grad_norm": 0.044677734375, + "learning_rate": 0.0015923618532935834, + "loss": 0.7985, + "num_input_tokens_seen": 67580816, + "step": 116395 + }, + { + "epoch": 17.336907953529938, + "grad_norm": 0.041259765625, + "learning_rate": 0.0015914877792737169, + "loss": 0.8031, + "num_input_tokens_seen": 67583760, + "step": 116400 + }, + { + "epoch": 17.3376526660709, + "grad_norm": 0.0771484375, + "learning_rate": 0.001590613931777034, + "loss": 0.8138, + "num_input_tokens_seen": 67586960, + "step": 116405 + }, + { + "epoch": 17.338397378611855, + "grad_norm": 0.0537109375, + "learning_rate": 0.0015897403108183028, + "loss": 0.7923, + "num_input_tokens_seen": 67589776, + "step": 116410 + }, + { + "epoch": 17.339142091152816, + "grad_norm": 0.103515625, + "learning_rate": 0.0015888669164122798, + "loss": 0.8061, + "num_input_tokens_seen": 67592592, + "step": 116415 + }, + { + "epoch": 17.339886803693773, + "grad_norm": 0.0390625, + "learning_rate": 0.0015879937485737167, + "loss": 0.7937, + "num_input_tokens_seen": 67595760, + "step": 116420 + }, + { + "epoch": 17.340631516234733, + "grad_norm": 0.042724609375, + "learning_rate": 0.0015871208073173714, + "loss": 0.7906, + "num_input_tokens_seen": 67598864, + "step": 116425 + }, + { + "epoch": 17.341376228775694, + "grad_norm": 0.0703125, + "learning_rate": 0.0015862480926579874, + "loss": 0.8041, + "num_input_tokens_seen": 67601744, + "step": 116430 + }, + { + "epoch": 17.34212094131665, + "grad_norm": 0.04345703125, + "learning_rate": 0.0015853756046103078, + "loss": 0.7888, + "num_input_tokens_seen": 67604592, + "step": 116435 + }, + { + "epoch": 17.34286565385761, + "grad_norm": 0.05615234375, + "learning_rate": 0.0015845033431890708, + "loss": 0.783, + "num_input_tokens_seen": 67607376, + "step": 116440 + }, + { + "epoch": 17.343610366398572, + "grad_norm": 0.0576171875, + "learning_rate": 0.001583631308409018, + "loss": 0.7892, + "num_input_tokens_seen": 67609872, + "step": 116445 + }, + { + "epoch": 17.34435507893953, + "grad_norm": 0.07421875, + "learning_rate": 0.001582759500284876, + "loss": 0.7867, + "num_input_tokens_seen": 67613456, + "step": 116450 + }, + { + "epoch": 17.34509979148049, + "grad_norm": 0.08349609375, + "learning_rate": 0.0015818879188313794, + "loss": 0.7977, + "num_input_tokens_seen": 67616336, + "step": 116455 + }, + { + "epoch": 17.345844504021446, + "grad_norm": 0.09912109375, + "learning_rate": 0.0015810165640632466, + "loss": 0.8121, + "num_input_tokens_seen": 67618992, + "step": 116460 + }, + { + "epoch": 17.346589216562407, + "grad_norm": 0.048828125, + "learning_rate": 0.0015801454359951994, + "loss": 0.8266, + "num_input_tokens_seen": 67622032, + "step": 116465 + }, + { + "epoch": 17.347333929103367, + "grad_norm": 0.1259765625, + "learning_rate": 0.0015792745346419573, + "loss": 0.7855, + "num_input_tokens_seen": 67624784, + "step": 116470 + }, + { + "epoch": 17.348078641644324, + "grad_norm": 0.044921875, + "learning_rate": 0.0015784038600182304, + "loss": 0.7708, + "num_input_tokens_seen": 67627856, + "step": 116475 + }, + { + "epoch": 17.348823354185285, + "grad_norm": 0.03759765625, + "learning_rate": 0.0015775334121387301, + "loss": 0.793, + "num_input_tokens_seen": 67630544, + "step": 116480 + }, + { + "epoch": 17.349568066726242, + "grad_norm": 0.03857421875, + "learning_rate": 0.0015766631910181582, + "loss": 0.7865, + "num_input_tokens_seen": 67633360, + "step": 116485 + }, + { + "epoch": 17.350312779267203, + "grad_norm": 0.0830078125, + "learning_rate": 0.001575793196671223, + "loss": 0.7835, + "num_input_tokens_seen": 67636240, + "step": 116490 + }, + { + "epoch": 17.351057491808163, + "grad_norm": 0.04443359375, + "learning_rate": 0.0015749234291126174, + "loss": 0.8409, + "num_input_tokens_seen": 67639376, + "step": 116495 + }, + { + "epoch": 17.35180220434912, + "grad_norm": 0.037841796875, + "learning_rate": 0.0015740538883570365, + "loss": 0.8168, + "num_input_tokens_seen": 67642352, + "step": 116500 + }, + { + "epoch": 17.35254691689008, + "grad_norm": 0.046875, + "learning_rate": 0.0015731845744191685, + "loss": 0.804, + "num_input_tokens_seen": 67645136, + "step": 116505 + }, + { + "epoch": 17.35329162943104, + "grad_norm": 0.0517578125, + "learning_rate": 0.0015723154873136984, + "loss": 0.8058, + "num_input_tokens_seen": 67647888, + "step": 116510 + }, + { + "epoch": 17.354036341971998, + "grad_norm": 0.064453125, + "learning_rate": 0.0015714466270553146, + "loss": 0.8062, + "num_input_tokens_seen": 67651184, + "step": 116515 + }, + { + "epoch": 17.35478105451296, + "grad_norm": 0.047119140625, + "learning_rate": 0.0015705779936586867, + "loss": 0.8089, + "num_input_tokens_seen": 67654448, + "step": 116520 + }, + { + "epoch": 17.355525767053916, + "grad_norm": 0.030517578125, + "learning_rate": 0.0015697095871384996, + "loss": 0.791, + "num_input_tokens_seen": 67657232, + "step": 116525 + }, + { + "epoch": 17.356270479594876, + "grad_norm": 0.044189453125, + "learning_rate": 0.0015688414075094154, + "loss": 0.7833, + "num_input_tokens_seen": 67659888, + "step": 116530 + }, + { + "epoch": 17.357015192135837, + "grad_norm": 0.05712890625, + "learning_rate": 0.0015679734547861068, + "loss": 0.7877, + "num_input_tokens_seen": 67662544, + "step": 116535 + }, + { + "epoch": 17.357759904676794, + "grad_norm": 0.050537109375, + "learning_rate": 0.0015671057289832356, + "loss": 0.8088, + "num_input_tokens_seen": 67665552, + "step": 116540 + }, + { + "epoch": 17.358504617217754, + "grad_norm": 0.0556640625, + "learning_rate": 0.0015662382301154586, + "loss": 0.8003, + "num_input_tokens_seen": 67668688, + "step": 116545 + }, + { + "epoch": 17.359249329758715, + "grad_norm": 0.045654296875, + "learning_rate": 0.0015653709581974356, + "loss": 0.8032, + "num_input_tokens_seen": 67671536, + "step": 116550 + }, + { + "epoch": 17.35999404229967, + "grad_norm": 0.064453125, + "learning_rate": 0.0015645039132438148, + "loss": 0.7938, + "num_input_tokens_seen": 67674288, + "step": 116555 + }, + { + "epoch": 17.360738754840632, + "grad_norm": 0.0478515625, + "learning_rate": 0.0015636370952692464, + "loss": 0.7915, + "num_input_tokens_seen": 67677008, + "step": 116560 + }, + { + "epoch": 17.36148346738159, + "grad_norm": 0.0732421875, + "learning_rate": 0.00156277050428837, + "loss": 0.7914, + "num_input_tokens_seen": 67679664, + "step": 116565 + }, + { + "epoch": 17.36222817992255, + "grad_norm": 0.047119140625, + "learning_rate": 0.0015619041403158323, + "loss": 0.7887, + "num_input_tokens_seen": 67682544, + "step": 116570 + }, + { + "epoch": 17.36297289246351, + "grad_norm": 0.04150390625, + "learning_rate": 0.0015610380033662652, + "loss": 0.7788, + "num_input_tokens_seen": 67685616, + "step": 116575 + }, + { + "epoch": 17.363717605004467, + "grad_norm": 0.0458984375, + "learning_rate": 0.0015601720934543, + "loss": 0.7986, + "num_input_tokens_seen": 67688688, + "step": 116580 + }, + { + "epoch": 17.364462317545428, + "grad_norm": 0.04150390625, + "learning_rate": 0.0015593064105945702, + "loss": 0.789, + "num_input_tokens_seen": 67691472, + "step": 116585 + }, + { + "epoch": 17.36520703008639, + "grad_norm": 0.043701171875, + "learning_rate": 0.0015584409548016957, + "loss": 0.794, + "num_input_tokens_seen": 67694352, + "step": 116590 + }, + { + "epoch": 17.365951742627345, + "grad_norm": 0.0654296875, + "learning_rate": 0.0015575757260903017, + "loss": 0.8001, + "num_input_tokens_seen": 67697200, + "step": 116595 + }, + { + "epoch": 17.366696455168306, + "grad_norm": 0.06494140625, + "learning_rate": 0.0015567107244750028, + "loss": 0.8117, + "num_input_tokens_seen": 67699952, + "step": 116600 + }, + { + "epoch": 17.367441167709263, + "grad_norm": 0.0625, + "learning_rate": 0.0015558459499704141, + "loss": 0.8052, + "num_input_tokens_seen": 67702928, + "step": 116605 + }, + { + "epoch": 17.368185880250223, + "grad_norm": 0.05859375, + "learning_rate": 0.0015549814025911423, + "loss": 0.798, + "num_input_tokens_seen": 67705872, + "step": 116610 + }, + { + "epoch": 17.368930592791184, + "grad_norm": 0.038330078125, + "learning_rate": 0.0015541170823517958, + "loss": 0.8028, + "num_input_tokens_seen": 67708528, + "step": 116615 + }, + { + "epoch": 17.36967530533214, + "grad_norm": 0.060546875, + "learning_rate": 0.0015532529892669778, + "loss": 0.8019, + "num_input_tokens_seen": 67711568, + "step": 116620 + }, + { + "epoch": 17.3704200178731, + "grad_norm": 0.054443359375, + "learning_rate": 0.0015523891233512814, + "loss": 0.823, + "num_input_tokens_seen": 67714480, + "step": 116625 + }, + { + "epoch": 17.37116473041406, + "grad_norm": 0.041015625, + "learning_rate": 0.0015515254846193054, + "loss": 0.7963, + "num_input_tokens_seen": 67717296, + "step": 116630 + }, + { + "epoch": 17.37190944295502, + "grad_norm": 0.0830078125, + "learning_rate": 0.0015506620730856345, + "loss": 0.7946, + "num_input_tokens_seen": 67720144, + "step": 116635 + }, + { + "epoch": 17.37265415549598, + "grad_norm": 0.04296875, + "learning_rate": 0.0015497988887648606, + "loss": 0.7944, + "num_input_tokens_seen": 67723056, + "step": 116640 + }, + { + "epoch": 17.373398868036936, + "grad_norm": 0.1552734375, + "learning_rate": 0.0015489359316715617, + "loss": 0.7898, + "num_input_tokens_seen": 67725936, + "step": 116645 + }, + { + "epoch": 17.374143580577897, + "grad_norm": 0.07177734375, + "learning_rate": 0.001548073201820323, + "loss": 0.7933, + "num_input_tokens_seen": 67728880, + "step": 116650 + }, + { + "epoch": 17.374888293118858, + "grad_norm": 0.03515625, + "learning_rate": 0.0015472106992257145, + "loss": 0.7857, + "num_input_tokens_seen": 67731856, + "step": 116655 + }, + { + "epoch": 17.375633005659815, + "grad_norm": 0.0478515625, + "learning_rate": 0.0015463484239023062, + "loss": 0.7827, + "num_input_tokens_seen": 67734448, + "step": 116660 + }, + { + "epoch": 17.376377718200775, + "grad_norm": 0.0732421875, + "learning_rate": 0.0015454863758646714, + "loss": 0.7967, + "num_input_tokens_seen": 67737360, + "step": 116665 + }, + { + "epoch": 17.377122430741732, + "grad_norm": 0.0869140625, + "learning_rate": 0.0015446245551273652, + "loss": 0.8116, + "num_input_tokens_seen": 67740432, + "step": 116670 + }, + { + "epoch": 17.377867143282693, + "grad_norm": 0.044677734375, + "learning_rate": 0.0015437629617049541, + "loss": 0.8042, + "num_input_tokens_seen": 67743344, + "step": 116675 + }, + { + "epoch": 17.378611855823653, + "grad_norm": 0.0498046875, + "learning_rate": 0.00154290159561199, + "loss": 0.793, + "num_input_tokens_seen": 67746128, + "step": 116680 + }, + { + "epoch": 17.37935656836461, + "grad_norm": 0.05810546875, + "learning_rate": 0.0015420404568630296, + "loss": 0.7872, + "num_input_tokens_seen": 67749200, + "step": 116685 + }, + { + "epoch": 17.38010128090557, + "grad_norm": 0.023681640625, + "learning_rate": 0.0015411795454726162, + "loss": 0.7796, + "num_input_tokens_seen": 67751920, + "step": 116690 + }, + { + "epoch": 17.38084599344653, + "grad_norm": 0.05322265625, + "learning_rate": 0.0015403188614552964, + "loss": 0.8077, + "num_input_tokens_seen": 67754864, + "step": 116695 + }, + { + "epoch": 17.381590705987488, + "grad_norm": 0.04833984375, + "learning_rate": 0.0015394584048256086, + "loss": 0.7921, + "num_input_tokens_seen": 67757744, + "step": 116700 + }, + { + "epoch": 17.38233541852845, + "grad_norm": 0.0927734375, + "learning_rate": 0.0015385981755980881, + "loss": 0.83, + "num_input_tokens_seen": 67760400, + "step": 116705 + }, + { + "epoch": 17.383080131069406, + "grad_norm": 0.17578125, + "learning_rate": 0.001537738173787273, + "loss": 0.8011, + "num_input_tokens_seen": 67763280, + "step": 116710 + }, + { + "epoch": 17.383824843610366, + "grad_norm": 0.0390625, + "learning_rate": 0.0015368783994076868, + "loss": 0.7813, + "num_input_tokens_seen": 67766064, + "step": 116715 + }, + { + "epoch": 17.384569556151327, + "grad_norm": 0.0279541015625, + "learning_rate": 0.0015360188524738577, + "loss": 0.7927, + "num_input_tokens_seen": 67768752, + "step": 116720 + }, + { + "epoch": 17.385314268692284, + "grad_norm": 0.047607421875, + "learning_rate": 0.0015351595330003042, + "loss": 0.8047, + "num_input_tokens_seen": 67771728, + "step": 116725 + }, + { + "epoch": 17.386058981233244, + "grad_norm": 0.056640625, + "learning_rate": 0.0015343004410015482, + "loss": 0.7813, + "num_input_tokens_seen": 67774864, + "step": 116730 + }, + { + "epoch": 17.386803693774205, + "grad_norm": 0.056884765625, + "learning_rate": 0.001533441576492101, + "loss": 0.7866, + "num_input_tokens_seen": 67777968, + "step": 116735 + }, + { + "epoch": 17.38754840631516, + "grad_norm": 0.068359375, + "learning_rate": 0.0015325829394864665, + "loss": 0.7907, + "num_input_tokens_seen": 67781232, + "step": 116740 + }, + { + "epoch": 17.388293118856122, + "grad_norm": 0.0458984375, + "learning_rate": 0.001531724529999161, + "loss": 0.7809, + "num_input_tokens_seen": 67784112, + "step": 116745 + }, + { + "epoch": 17.38903783139708, + "grad_norm": 0.049560546875, + "learning_rate": 0.0015308663480446765, + "loss": 0.7956, + "num_input_tokens_seen": 67787120, + "step": 116750 + }, + { + "epoch": 17.38978254393804, + "grad_norm": 0.042236328125, + "learning_rate": 0.0015300083936375197, + "loss": 0.7815, + "num_input_tokens_seen": 67790032, + "step": 116755 + }, + { + "epoch": 17.390527256479, + "grad_norm": 0.06689453125, + "learning_rate": 0.001529150666792179, + "loss": 0.8092, + "num_input_tokens_seen": 67793264, + "step": 116760 + }, + { + "epoch": 17.391271969019957, + "grad_norm": 0.048583984375, + "learning_rate": 0.0015282931675231475, + "loss": 0.785, + "num_input_tokens_seen": 67796016, + "step": 116765 + }, + { + "epoch": 17.392016681560918, + "grad_norm": 0.06640625, + "learning_rate": 0.001527435895844909, + "loss": 0.8036, + "num_input_tokens_seen": 67798896, + "step": 116770 + }, + { + "epoch": 17.39276139410188, + "grad_norm": 0.11767578125, + "learning_rate": 0.0015265788517719503, + "loss": 0.803, + "num_input_tokens_seen": 67801680, + "step": 116775 + }, + { + "epoch": 17.393506106642835, + "grad_norm": 0.054931640625, + "learning_rate": 0.0015257220353187477, + "loss": 0.7857, + "num_input_tokens_seen": 67804592, + "step": 116780 + }, + { + "epoch": 17.394250819183796, + "grad_norm": 0.045654296875, + "learning_rate": 0.0015248654464997751, + "loss": 0.7842, + "num_input_tokens_seen": 67807632, + "step": 116785 + }, + { + "epoch": 17.394995531724753, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0015240090853295073, + "loss": 0.789, + "num_input_tokens_seen": 67810512, + "step": 116790 + }, + { + "epoch": 17.395740244265713, + "grad_norm": 0.07666015625, + "learning_rate": 0.0015231529518224063, + "loss": 0.8029, + "num_input_tokens_seen": 67813168, + "step": 116795 + }, + { + "epoch": 17.396484956806674, + "grad_norm": 0.0615234375, + "learning_rate": 0.0015222970459929418, + "loss": 0.7896, + "num_input_tokens_seen": 67815856, + "step": 116800 + }, + { + "epoch": 17.39722966934763, + "grad_norm": 0.040771484375, + "learning_rate": 0.0015214413678555677, + "loss": 0.781, + "num_input_tokens_seen": 67818576, + "step": 116805 + }, + { + "epoch": 17.39797438188859, + "grad_norm": 0.06005859375, + "learning_rate": 0.0015205859174247471, + "loss": 0.7852, + "num_input_tokens_seen": 67821200, + "step": 116810 + }, + { + "epoch": 17.39871909442955, + "grad_norm": 0.0400390625, + "learning_rate": 0.0015197306947149253, + "loss": 0.7955, + "num_input_tokens_seen": 67823984, + "step": 116815 + }, + { + "epoch": 17.39946380697051, + "grad_norm": 0.042236328125, + "learning_rate": 0.0015188756997405505, + "loss": 0.7939, + "num_input_tokens_seen": 67826704, + "step": 116820 + }, + { + "epoch": 17.40020851951147, + "grad_norm": 0.053466796875, + "learning_rate": 0.0015180209325160732, + "loss": 0.8184, + "num_input_tokens_seen": 67829712, + "step": 116825 + }, + { + "epoch": 17.400953232052427, + "grad_norm": 0.058349609375, + "learning_rate": 0.0015171663930559297, + "loss": 0.7813, + "num_input_tokens_seen": 67832528, + "step": 116830 + }, + { + "epoch": 17.401697944593387, + "grad_norm": 0.0517578125, + "learning_rate": 0.0015163120813745557, + "loss": 0.786, + "num_input_tokens_seen": 67835440, + "step": 116835 + }, + { + "epoch": 17.402442657134348, + "grad_norm": 0.07763671875, + "learning_rate": 0.0015154579974863823, + "loss": 0.7801, + "num_input_tokens_seen": 67838768, + "step": 116840 + }, + { + "epoch": 17.403187369675305, + "grad_norm": 0.08251953125, + "learning_rate": 0.0015146041414058435, + "loss": 0.8036, + "num_input_tokens_seen": 67841776, + "step": 116845 + }, + { + "epoch": 17.403932082216265, + "grad_norm": 0.02783203125, + "learning_rate": 0.0015137505131473594, + "loss": 0.8003, + "num_input_tokens_seen": 67844368, + "step": 116850 + }, + { + "epoch": 17.404676794757222, + "grad_norm": 0.0634765625, + "learning_rate": 0.0015128971127253564, + "loss": 0.8077, + "num_input_tokens_seen": 67847376, + "step": 116855 + }, + { + "epoch": 17.405421507298183, + "grad_norm": 0.0869140625, + "learning_rate": 0.0015120439401542485, + "loss": 0.8252, + "num_input_tokens_seen": 67850096, + "step": 116860 + }, + { + "epoch": 17.406166219839143, + "grad_norm": 0.0634765625, + "learning_rate": 0.001511190995448447, + "loss": 0.7728, + "num_input_tokens_seen": 67852720, + "step": 116865 + }, + { + "epoch": 17.4069109323801, + "grad_norm": 0.04052734375, + "learning_rate": 0.0015103382786223673, + "loss": 0.8058, + "num_input_tokens_seen": 67855376, + "step": 116870 + }, + { + "epoch": 17.40765564492106, + "grad_norm": 0.04638671875, + "learning_rate": 0.0015094857896904095, + "loss": 0.8077, + "num_input_tokens_seen": 67858320, + "step": 116875 + }, + { + "epoch": 17.40840035746202, + "grad_norm": 0.0634765625, + "learning_rate": 0.0015086335286669805, + "loss": 0.7765, + "num_input_tokens_seen": 67861168, + "step": 116880 + }, + { + "epoch": 17.409145070002978, + "grad_norm": 0.07421875, + "learning_rate": 0.0015077814955664754, + "loss": 0.8007, + "num_input_tokens_seen": 67863920, + "step": 116885 + }, + { + "epoch": 17.40988978254394, + "grad_norm": 0.046630859375, + "learning_rate": 0.0015069296904032907, + "loss": 0.7951, + "num_input_tokens_seen": 67866608, + "step": 116890 + }, + { + "epoch": 17.410634495084896, + "grad_norm": 0.03759765625, + "learning_rate": 0.0015060781131918104, + "loss": 0.7945, + "num_input_tokens_seen": 67869616, + "step": 116895 + }, + { + "epoch": 17.411379207625856, + "grad_norm": 0.0771484375, + "learning_rate": 0.0015052267639464312, + "loss": 0.7984, + "num_input_tokens_seen": 67872464, + "step": 116900 + }, + { + "epoch": 17.412123920166817, + "grad_norm": 0.042236328125, + "learning_rate": 0.0015043756426815279, + "loss": 0.8183, + "num_input_tokens_seen": 67875088, + "step": 116905 + }, + { + "epoch": 17.412868632707774, + "grad_norm": 0.057373046875, + "learning_rate": 0.0015035247494114795, + "loss": 0.8032, + "num_input_tokens_seen": 67878320, + "step": 116910 + }, + { + "epoch": 17.413613345248734, + "grad_norm": 0.044677734375, + "learning_rate": 0.0015026740841506674, + "loss": 0.8009, + "num_input_tokens_seen": 67881488, + "step": 116915 + }, + { + "epoch": 17.414358057789695, + "grad_norm": 0.039794921875, + "learning_rate": 0.0015018236469134538, + "loss": 0.7832, + "num_input_tokens_seen": 67884400, + "step": 116920 + }, + { + "epoch": 17.415102770330652, + "grad_norm": 0.06201171875, + "learning_rate": 0.0015009734377142152, + "loss": 0.8053, + "num_input_tokens_seen": 67887504, + "step": 116925 + }, + { + "epoch": 17.415847482871612, + "grad_norm": 0.038818359375, + "learning_rate": 0.001500123456567307, + "loss": 0.7951, + "num_input_tokens_seen": 67890512, + "step": 116930 + }, + { + "epoch": 17.41659219541257, + "grad_norm": 0.0966796875, + "learning_rate": 0.0014992737034870945, + "loss": 0.8095, + "num_input_tokens_seen": 67893712, + "step": 116935 + }, + { + "epoch": 17.41733690795353, + "grad_norm": 0.10009765625, + "learning_rate": 0.0014984241784879325, + "loss": 0.7883, + "num_input_tokens_seen": 67896176, + "step": 116940 + }, + { + "epoch": 17.41808162049449, + "grad_norm": 0.0693359375, + "learning_rate": 0.0014975748815841683, + "loss": 0.7929, + "num_input_tokens_seen": 67899248, + "step": 116945 + }, + { + "epoch": 17.418826333035447, + "grad_norm": 0.07080078125, + "learning_rate": 0.0014967258127901568, + "loss": 0.8068, + "num_input_tokens_seen": 67902032, + "step": 116950 + }, + { + "epoch": 17.419571045576408, + "grad_norm": 0.1484375, + "learning_rate": 0.001495876972120238, + "loss": 0.8195, + "num_input_tokens_seen": 67905168, + "step": 116955 + }, + { + "epoch": 17.42031575811737, + "grad_norm": 0.044921875, + "learning_rate": 0.0014950283595887525, + "loss": 0.7786, + "num_input_tokens_seen": 67907856, + "step": 116960 + }, + { + "epoch": 17.421060470658325, + "grad_norm": 0.04296875, + "learning_rate": 0.0014941799752100338, + "loss": 0.8063, + "num_input_tokens_seen": 67910704, + "step": 116965 + }, + { + "epoch": 17.421805183199286, + "grad_norm": 0.07666015625, + "learning_rate": 0.0014933318189984217, + "loss": 0.8299, + "num_input_tokens_seen": 67913328, + "step": 116970 + }, + { + "epoch": 17.422549895740243, + "grad_norm": 0.07421875, + "learning_rate": 0.0014924838909682369, + "loss": 0.7963, + "num_input_tokens_seen": 67916144, + "step": 116975 + }, + { + "epoch": 17.423294608281203, + "grad_norm": 0.06298828125, + "learning_rate": 0.001491636191133811, + "loss": 0.8049, + "num_input_tokens_seen": 67918832, + "step": 116980 + }, + { + "epoch": 17.424039320822164, + "grad_norm": 0.0498046875, + "learning_rate": 0.0014907887195094642, + "loss": 0.8118, + "num_input_tokens_seen": 67921872, + "step": 116985 + }, + { + "epoch": 17.42478403336312, + "grad_norm": 0.060791015625, + "learning_rate": 0.001489941476109507, + "loss": 0.7944, + "num_input_tokens_seen": 67924688, + "step": 116990 + }, + { + "epoch": 17.42552874590408, + "grad_norm": 0.07470703125, + "learning_rate": 0.0014890944609482625, + "loss": 0.8044, + "num_input_tokens_seen": 67927472, + "step": 116995 + }, + { + "epoch": 17.42627345844504, + "grad_norm": 0.0693359375, + "learning_rate": 0.0014882476740400313, + "loss": 0.7863, + "num_input_tokens_seen": 67930512, + "step": 117000 + }, + { + "epoch": 17.427018170986, + "grad_norm": 0.083984375, + "learning_rate": 0.0014874011153991251, + "loss": 0.807, + "num_input_tokens_seen": 67933392, + "step": 117005 + }, + { + "epoch": 17.42776288352696, + "grad_norm": 0.04248046875, + "learning_rate": 0.0014865547850398408, + "loss": 0.8018, + "num_input_tokens_seen": 67936400, + "step": 117010 + }, + { + "epoch": 17.428507596067917, + "grad_norm": 0.08984375, + "learning_rate": 0.001485708682976482, + "loss": 0.8003, + "num_input_tokens_seen": 67939600, + "step": 117015 + }, + { + "epoch": 17.429252308608877, + "grad_norm": 0.049072265625, + "learning_rate": 0.0014848628092233407, + "loss": 0.7926, + "num_input_tokens_seen": 67942480, + "step": 117020 + }, + { + "epoch": 17.429997021149838, + "grad_norm": 0.08203125, + "learning_rate": 0.0014840171637947052, + "loss": 0.8092, + "num_input_tokens_seen": 67945328, + "step": 117025 + }, + { + "epoch": 17.430741733690795, + "grad_norm": 0.072265625, + "learning_rate": 0.0014831717467048626, + "loss": 0.7958, + "num_input_tokens_seen": 67948080, + "step": 117030 + }, + { + "epoch": 17.431486446231755, + "grad_norm": 0.3984375, + "learning_rate": 0.0014823265579680916, + "loss": 0.8404, + "num_input_tokens_seen": 67950672, + "step": 117035 + }, + { + "epoch": 17.432231158772712, + "grad_norm": 0.043212890625, + "learning_rate": 0.0014814815975986789, + "loss": 0.8183, + "num_input_tokens_seen": 67953648, + "step": 117040 + }, + { + "epoch": 17.432975871313673, + "grad_norm": 0.0947265625, + "learning_rate": 0.0014806368656108914, + "loss": 0.7787, + "num_input_tokens_seen": 67956592, + "step": 117045 + }, + { + "epoch": 17.433720583854633, + "grad_norm": 0.040771484375, + "learning_rate": 0.0014797923620190078, + "loss": 0.7962, + "num_input_tokens_seen": 67959504, + "step": 117050 + }, + { + "epoch": 17.43446529639559, + "grad_norm": 0.0615234375, + "learning_rate": 0.0014789480868372866, + "loss": 0.8199, + "num_input_tokens_seen": 67962320, + "step": 117055 + }, + { + "epoch": 17.43521000893655, + "grad_norm": 0.0693359375, + "learning_rate": 0.00147810404008, + "loss": 0.7941, + "num_input_tokens_seen": 67965040, + "step": 117060 + }, + { + "epoch": 17.43595472147751, + "grad_norm": 0.060302734375, + "learning_rate": 0.0014772602217614011, + "loss": 0.8074, + "num_input_tokens_seen": 67967760, + "step": 117065 + }, + { + "epoch": 17.43669943401847, + "grad_norm": 0.04150390625, + "learning_rate": 0.0014764166318957438, + "loss": 0.7952, + "num_input_tokens_seen": 67970736, + "step": 117070 + }, + { + "epoch": 17.43744414655943, + "grad_norm": 0.05810546875, + "learning_rate": 0.0014755732704972851, + "loss": 0.815, + "num_input_tokens_seen": 67973712, + "step": 117075 + }, + { + "epoch": 17.438188859100386, + "grad_norm": 0.06591796875, + "learning_rate": 0.0014747301375802684, + "loss": 0.7843, + "num_input_tokens_seen": 67976368, + "step": 117080 + }, + { + "epoch": 17.438933571641346, + "grad_norm": 0.02587890625, + "learning_rate": 0.0014738872331589426, + "loss": 0.7965, + "num_input_tokens_seen": 67978928, + "step": 117085 + }, + { + "epoch": 17.439678284182307, + "grad_norm": 0.13671875, + "learning_rate": 0.0014730445572475443, + "loss": 0.8082, + "num_input_tokens_seen": 67981584, + "step": 117090 + }, + { + "epoch": 17.440422996723264, + "grad_norm": 0.059814453125, + "learning_rate": 0.0014722021098603104, + "loss": 0.8094, + "num_input_tokens_seen": 67984464, + "step": 117095 + }, + { + "epoch": 17.441167709264224, + "grad_norm": 0.060546875, + "learning_rate": 0.0014713598910114734, + "loss": 0.8156, + "num_input_tokens_seen": 67987344, + "step": 117100 + }, + { + "epoch": 17.441912421805185, + "grad_norm": 0.09423828125, + "learning_rate": 0.001470517900715258, + "loss": 0.8831, + "num_input_tokens_seen": 67990448, + "step": 117105 + }, + { + "epoch": 17.442657134346142, + "grad_norm": 0.064453125, + "learning_rate": 0.0014696761389858932, + "loss": 0.8194, + "num_input_tokens_seen": 67993360, + "step": 117110 + }, + { + "epoch": 17.443401846887102, + "grad_norm": 0.058837890625, + "learning_rate": 0.0014688346058375971, + "loss": 0.8131, + "num_input_tokens_seen": 67996048, + "step": 117115 + }, + { + "epoch": 17.44414655942806, + "grad_norm": 0.09130859375, + "learning_rate": 0.0014679933012845907, + "loss": 0.7964, + "num_input_tokens_seen": 67998992, + "step": 117120 + }, + { + "epoch": 17.44489127196902, + "grad_norm": 0.06494140625, + "learning_rate": 0.0014671522253410806, + "loss": 0.7877, + "num_input_tokens_seen": 68002192, + "step": 117125 + }, + { + "epoch": 17.44563598450998, + "grad_norm": 0.06005859375, + "learning_rate": 0.0014663113780212821, + "loss": 0.7796, + "num_input_tokens_seen": 68005072, + "step": 117130 + }, + { + "epoch": 17.446380697050937, + "grad_norm": 0.048828125, + "learning_rate": 0.0014654707593393956, + "loss": 0.7864, + "num_input_tokens_seen": 68007984, + "step": 117135 + }, + { + "epoch": 17.447125409591898, + "grad_norm": 0.04638671875, + "learning_rate": 0.001464630369309628, + "loss": 0.7945, + "num_input_tokens_seen": 68010832, + "step": 117140 + }, + { + "epoch": 17.447870122132855, + "grad_norm": 0.060791015625, + "learning_rate": 0.0014637902079461728, + "loss": 0.7996, + "num_input_tokens_seen": 68013520, + "step": 117145 + }, + { + "epoch": 17.448614834673815, + "grad_norm": 0.0615234375, + "learning_rate": 0.0014629502752632223, + "loss": 0.7753, + "num_input_tokens_seen": 68016304, + "step": 117150 + }, + { + "epoch": 17.449359547214776, + "grad_norm": 0.050537109375, + "learning_rate": 0.0014621105712749715, + "loss": 0.7848, + "num_input_tokens_seen": 68019376, + "step": 117155 + }, + { + "epoch": 17.450104259755733, + "grad_norm": 0.040771484375, + "learning_rate": 0.0014612710959956026, + "loss": 0.8104, + "num_input_tokens_seen": 68022192, + "step": 117160 + }, + { + "epoch": 17.450848972296694, + "grad_norm": 0.07177734375, + "learning_rate": 0.0014604318494392976, + "loss": 0.7805, + "num_input_tokens_seen": 68025264, + "step": 117165 + }, + { + "epoch": 17.451593684837654, + "grad_norm": 0.0458984375, + "learning_rate": 0.0014595928316202332, + "loss": 0.79, + "num_input_tokens_seen": 68027984, + "step": 117170 + }, + { + "epoch": 17.45233839737861, + "grad_norm": 0.08056640625, + "learning_rate": 0.0014587540425525902, + "loss": 0.8191, + "num_input_tokens_seen": 68030672, + "step": 117175 + }, + { + "epoch": 17.45308310991957, + "grad_norm": 0.0830078125, + "learning_rate": 0.0014579154822505335, + "loss": 0.8073, + "num_input_tokens_seen": 68033392, + "step": 117180 + }, + { + "epoch": 17.45382782246053, + "grad_norm": 0.04541015625, + "learning_rate": 0.0014570771507282287, + "loss": 0.7964, + "num_input_tokens_seen": 68036272, + "step": 117185 + }, + { + "epoch": 17.45457253500149, + "grad_norm": 0.06494140625, + "learning_rate": 0.0014562390479998443, + "loss": 0.8053, + "num_input_tokens_seen": 68038960, + "step": 117190 + }, + { + "epoch": 17.45531724754245, + "grad_norm": 0.06396484375, + "learning_rate": 0.0014554011740795325, + "loss": 0.7909, + "num_input_tokens_seen": 68041904, + "step": 117195 + }, + { + "epoch": 17.456061960083407, + "grad_norm": 0.10791015625, + "learning_rate": 0.001454563528981455, + "loss": 0.7895, + "num_input_tokens_seen": 68044752, + "step": 117200 + }, + { + "epoch": 17.456806672624367, + "grad_norm": 0.053466796875, + "learning_rate": 0.0014537261127197559, + "loss": 0.8113, + "num_input_tokens_seen": 68047440, + "step": 117205 + }, + { + "epoch": 17.457551385165328, + "grad_norm": 0.08349609375, + "learning_rate": 0.0014528889253085902, + "loss": 0.8056, + "num_input_tokens_seen": 68050448, + "step": 117210 + }, + { + "epoch": 17.458296097706285, + "grad_norm": 0.06396484375, + "learning_rate": 0.0014520519667620983, + "loss": 0.7933, + "num_input_tokens_seen": 68053328, + "step": 117215 + }, + { + "epoch": 17.459040810247245, + "grad_norm": 0.051025390625, + "learning_rate": 0.0014512152370944175, + "loss": 0.8067, + "num_input_tokens_seen": 68056048, + "step": 117220 + }, + { + "epoch": 17.459785522788202, + "grad_norm": 0.03515625, + "learning_rate": 0.001450378736319683, + "loss": 0.7967, + "num_input_tokens_seen": 68058640, + "step": 117225 + }, + { + "epoch": 17.460530235329163, + "grad_norm": 0.0986328125, + "learning_rate": 0.0014495424644520298, + "loss": 0.791, + "num_input_tokens_seen": 68061584, + "step": 117230 + }, + { + "epoch": 17.461274947870123, + "grad_norm": 0.04248046875, + "learning_rate": 0.0014487064215055856, + "loss": 0.7925, + "num_input_tokens_seen": 68064432, + "step": 117235 + }, + { + "epoch": 17.46201966041108, + "grad_norm": 0.028564453125, + "learning_rate": 0.0014478706074944703, + "loss": 0.7843, + "num_input_tokens_seen": 68067312, + "step": 117240 + }, + { + "epoch": 17.46276437295204, + "grad_norm": 0.048095703125, + "learning_rate": 0.001447035022432811, + "loss": 0.7962, + "num_input_tokens_seen": 68070064, + "step": 117245 + }, + { + "epoch": 17.463509085493, + "grad_norm": 0.0869140625, + "learning_rate": 0.0014461996663347166, + "loss": 0.7915, + "num_input_tokens_seen": 68073136, + "step": 117250 + }, + { + "epoch": 17.46425379803396, + "grad_norm": 0.05029296875, + "learning_rate": 0.001445364539214307, + "loss": 0.8006, + "num_input_tokens_seen": 68076304, + "step": 117255 + }, + { + "epoch": 17.46499851057492, + "grad_norm": 0.052001953125, + "learning_rate": 0.0014445296410856867, + "loss": 0.795, + "num_input_tokens_seen": 68079184, + "step": 117260 + }, + { + "epoch": 17.465743223115876, + "grad_norm": 0.13671875, + "learning_rate": 0.001443694971962957, + "loss": 0.7985, + "num_input_tokens_seen": 68082064, + "step": 117265 + }, + { + "epoch": 17.466487935656836, + "grad_norm": 0.042724609375, + "learning_rate": 0.001442860531860227, + "loss": 0.7952, + "num_input_tokens_seen": 68085232, + "step": 117270 + }, + { + "epoch": 17.467232648197797, + "grad_norm": 0.0634765625, + "learning_rate": 0.0014420263207915872, + "loss": 0.7813, + "num_input_tokens_seen": 68088176, + "step": 117275 + }, + { + "epoch": 17.467977360738754, + "grad_norm": 0.04443359375, + "learning_rate": 0.0014411923387711344, + "loss": 0.7943, + "num_input_tokens_seen": 68091504, + "step": 117280 + }, + { + "epoch": 17.468722073279714, + "grad_norm": 0.052978515625, + "learning_rate": 0.0014403585858129575, + "loss": 0.8162, + "num_input_tokens_seen": 68094224, + "step": 117285 + }, + { + "epoch": 17.469466785820675, + "grad_norm": 0.05810546875, + "learning_rate": 0.00143952506193114, + "loss": 0.7805, + "num_input_tokens_seen": 68097456, + "step": 117290 + }, + { + "epoch": 17.470211498361632, + "grad_norm": 0.0537109375, + "learning_rate": 0.0014386917671397625, + "loss": 0.8035, + "num_input_tokens_seen": 68100336, + "step": 117295 + }, + { + "epoch": 17.470956210902592, + "grad_norm": 0.049072265625, + "learning_rate": 0.001437858701452907, + "loss": 0.8047, + "num_input_tokens_seen": 68103184, + "step": 117300 + }, + { + "epoch": 17.47170092344355, + "grad_norm": 0.0673828125, + "learning_rate": 0.001437025864884644, + "loss": 0.7768, + "num_input_tokens_seen": 68105872, + "step": 117305 + }, + { + "epoch": 17.47244563598451, + "grad_norm": 0.04345703125, + "learning_rate": 0.001436193257449042, + "loss": 0.7995, + "num_input_tokens_seen": 68108656, + "step": 117310 + }, + { + "epoch": 17.47319034852547, + "grad_norm": 0.09033203125, + "learning_rate": 0.0014353608791601719, + "loss": 0.8169, + "num_input_tokens_seen": 68111376, + "step": 117315 + }, + { + "epoch": 17.473935061066427, + "grad_norm": 0.031494140625, + "learning_rate": 0.0014345287300320919, + "loss": 0.7955, + "num_input_tokens_seen": 68114384, + "step": 117320 + }, + { + "epoch": 17.474679773607388, + "grad_norm": 0.07958984375, + "learning_rate": 0.0014336968100788626, + "loss": 0.8084, + "num_input_tokens_seen": 68117072, + "step": 117325 + }, + { + "epoch": 17.47542448614835, + "grad_norm": 0.06640625, + "learning_rate": 0.0014328651193145348, + "loss": 0.7797, + "num_input_tokens_seen": 68120080, + "step": 117330 + }, + { + "epoch": 17.476169198689306, + "grad_norm": 0.05859375, + "learning_rate": 0.001432033657753165, + "loss": 0.8118, + "num_input_tokens_seen": 68122992, + "step": 117335 + }, + { + "epoch": 17.476913911230266, + "grad_norm": 0.0703125, + "learning_rate": 0.0014312024254087973, + "loss": 0.7774, + "num_input_tokens_seen": 68125680, + "step": 117340 + }, + { + "epoch": 17.477658623771223, + "grad_norm": 0.06103515625, + "learning_rate": 0.0014303714222954705, + "loss": 0.791, + "num_input_tokens_seen": 68128944, + "step": 117345 + }, + { + "epoch": 17.478403336312184, + "grad_norm": 0.08544921875, + "learning_rate": 0.0014295406484272299, + "loss": 0.7987, + "num_input_tokens_seen": 68131824, + "step": 117350 + }, + { + "epoch": 17.479148048853144, + "grad_norm": 0.043212890625, + "learning_rate": 0.0014287101038181077, + "loss": 0.8006, + "num_input_tokens_seen": 68134832, + "step": 117355 + }, + { + "epoch": 17.4798927613941, + "grad_norm": 0.06787109375, + "learning_rate": 0.0014278797884821359, + "loss": 0.7867, + "num_input_tokens_seen": 68137648, + "step": 117360 + }, + { + "epoch": 17.48063747393506, + "grad_norm": 0.049072265625, + "learning_rate": 0.0014270497024333367, + "loss": 0.8, + "num_input_tokens_seen": 68140368, + "step": 117365 + }, + { + "epoch": 17.48138218647602, + "grad_norm": 0.06494140625, + "learning_rate": 0.0014262198456857405, + "loss": 0.78, + "num_input_tokens_seen": 68143440, + "step": 117370 + }, + { + "epoch": 17.48212689901698, + "grad_norm": 0.036376953125, + "learning_rate": 0.0014253902182533628, + "loss": 0.8063, + "num_input_tokens_seen": 68146064, + "step": 117375 + }, + { + "epoch": 17.48287161155794, + "grad_norm": 0.0556640625, + "learning_rate": 0.0014245608201502223, + "loss": 0.7945, + "num_input_tokens_seen": 68148944, + "step": 117380 + }, + { + "epoch": 17.483616324098897, + "grad_norm": 0.03955078125, + "learning_rate": 0.0014237316513903313, + "loss": 0.7927, + "num_input_tokens_seen": 68151984, + "step": 117385 + }, + { + "epoch": 17.484361036639857, + "grad_norm": 0.0849609375, + "learning_rate": 0.0014229027119876901, + "loss": 0.7984, + "num_input_tokens_seen": 68155120, + "step": 117390 + }, + { + "epoch": 17.485105749180818, + "grad_norm": 0.08203125, + "learning_rate": 0.0014220740019563127, + "loss": 0.7833, + "num_input_tokens_seen": 68157904, + "step": 117395 + }, + { + "epoch": 17.485850461721775, + "grad_norm": 0.056884765625, + "learning_rate": 0.0014212455213101925, + "loss": 0.7854, + "num_input_tokens_seen": 68160528, + "step": 117400 + }, + { + "epoch": 17.486595174262735, + "grad_norm": 0.03857421875, + "learning_rate": 0.0014204172700633321, + "loss": 0.8182, + "num_input_tokens_seen": 68163152, + "step": 117405 + }, + { + "epoch": 17.487339886803692, + "grad_norm": 0.04052734375, + "learning_rate": 0.001419589248229715, + "loss": 0.7969, + "num_input_tokens_seen": 68166064, + "step": 117410 + }, + { + "epoch": 17.488084599344653, + "grad_norm": 0.0517578125, + "learning_rate": 0.0014187614558233402, + "loss": 0.7883, + "num_input_tokens_seen": 68168848, + "step": 117415 + }, + { + "epoch": 17.488829311885613, + "grad_norm": 0.053955078125, + "learning_rate": 0.0014179338928581864, + "loss": 0.8138, + "num_input_tokens_seen": 68172112, + "step": 117420 + }, + { + "epoch": 17.48957402442657, + "grad_norm": 0.064453125, + "learning_rate": 0.001417106559348234, + "loss": 0.7769, + "num_input_tokens_seen": 68174768, + "step": 117425 + }, + { + "epoch": 17.49031873696753, + "grad_norm": 0.052490234375, + "learning_rate": 0.001416279455307462, + "loss": 0.7976, + "num_input_tokens_seen": 68177680, + "step": 117430 + }, + { + "epoch": 17.49106344950849, + "grad_norm": 0.03173828125, + "learning_rate": 0.0014154525807498408, + "loss": 0.7964, + "num_input_tokens_seen": 68180656, + "step": 117435 + }, + { + "epoch": 17.49180816204945, + "grad_norm": 0.053955078125, + "learning_rate": 0.0014146259356893442, + "loss": 0.8026, + "num_input_tokens_seen": 68183504, + "step": 117440 + }, + { + "epoch": 17.49255287459041, + "grad_norm": 0.043212890625, + "learning_rate": 0.001413799520139931, + "loss": 0.8094, + "num_input_tokens_seen": 68186256, + "step": 117445 + }, + { + "epoch": 17.493297587131366, + "grad_norm": 0.04052734375, + "learning_rate": 0.0014129733341155704, + "loss": 0.793, + "num_input_tokens_seen": 68189200, + "step": 117450 + }, + { + "epoch": 17.494042299672326, + "grad_norm": 0.04638671875, + "learning_rate": 0.0014121473776302124, + "loss": 0.79, + "num_input_tokens_seen": 68192304, + "step": 117455 + }, + { + "epoch": 17.494787012213287, + "grad_norm": 0.037109375, + "learning_rate": 0.0014113216506978176, + "loss": 0.793, + "num_input_tokens_seen": 68195440, + "step": 117460 + }, + { + "epoch": 17.495531724754244, + "grad_norm": 0.0693359375, + "learning_rate": 0.0014104961533323317, + "loss": 0.8082, + "num_input_tokens_seen": 68198192, + "step": 117465 + }, + { + "epoch": 17.496276437295204, + "grad_norm": 0.05224609375, + "learning_rate": 0.0014096708855477003, + "loss": 0.7927, + "num_input_tokens_seen": 68200944, + "step": 117470 + }, + { + "epoch": 17.497021149836165, + "grad_norm": 0.0478515625, + "learning_rate": 0.0014088458473578685, + "loss": 0.7915, + "num_input_tokens_seen": 68203856, + "step": 117475 + }, + { + "epoch": 17.497765862377122, + "grad_norm": 0.0751953125, + "learning_rate": 0.0014080210387767704, + "loss": 0.8076, + "num_input_tokens_seen": 68206608, + "step": 117480 + }, + { + "epoch": 17.498510574918082, + "grad_norm": 0.0634765625, + "learning_rate": 0.001407196459818345, + "loss": 0.8173, + "num_input_tokens_seen": 68209456, + "step": 117485 + }, + { + "epoch": 17.49925528745904, + "grad_norm": 0.05810546875, + "learning_rate": 0.001406372110496521, + "loss": 0.8246, + "num_input_tokens_seen": 68212400, + "step": 117490 + }, + { + "epoch": 17.5, + "grad_norm": 0.0400390625, + "learning_rate": 0.0014055479908252238, + "loss": 0.8032, + "num_input_tokens_seen": 68215440, + "step": 117495 + }, + { + "epoch": 17.50074471254096, + "grad_norm": 0.06201171875, + "learning_rate": 0.0014047241008183725, + "loss": 0.8185, + "num_input_tokens_seen": 68218288, + "step": 117500 + }, + { + "epoch": 17.501489425081918, + "grad_norm": 0.049560546875, + "learning_rate": 0.0014039004404898942, + "loss": 0.7743, + "num_input_tokens_seen": 68221520, + "step": 117505 + }, + { + "epoch": 17.502234137622878, + "grad_norm": 0.07080078125, + "learning_rate": 0.0014030770098536992, + "loss": 0.7885, + "num_input_tokens_seen": 68224432, + "step": 117510 + }, + { + "epoch": 17.502978850163835, + "grad_norm": 0.08447265625, + "learning_rate": 0.001402253808923697, + "loss": 0.7877, + "num_input_tokens_seen": 68227408, + "step": 117515 + }, + { + "epoch": 17.503723562704796, + "grad_norm": 0.05224609375, + "learning_rate": 0.0014014308377137974, + "loss": 0.808, + "num_input_tokens_seen": 68230288, + "step": 117520 + }, + { + "epoch": 17.504468275245756, + "grad_norm": 0.0576171875, + "learning_rate": 0.0014006080962379013, + "loss": 0.8259, + "num_input_tokens_seen": 68233168, + "step": 117525 + }, + { + "epoch": 17.505212987786713, + "grad_norm": 0.0576171875, + "learning_rate": 0.0013997855845099111, + "loss": 0.7951, + "num_input_tokens_seen": 68236176, + "step": 117530 + }, + { + "epoch": 17.505957700327674, + "grad_norm": 0.033203125, + "learning_rate": 0.0013989633025437204, + "loss": 0.7841, + "num_input_tokens_seen": 68238992, + "step": 117535 + }, + { + "epoch": 17.506702412868634, + "grad_norm": 0.056640625, + "learning_rate": 0.0013981412503532213, + "loss": 0.7747, + "num_input_tokens_seen": 68242032, + "step": 117540 + }, + { + "epoch": 17.50744712540959, + "grad_norm": 0.0546875, + "learning_rate": 0.001397319427952303, + "loss": 0.8112, + "num_input_tokens_seen": 68245168, + "step": 117545 + }, + { + "epoch": 17.50819183795055, + "grad_norm": 0.04052734375, + "learning_rate": 0.0013964978353548478, + "loss": 0.7922, + "num_input_tokens_seen": 68248048, + "step": 117550 + }, + { + "epoch": 17.50893655049151, + "grad_norm": 0.047119140625, + "learning_rate": 0.0013956764725747328, + "loss": 0.8303, + "num_input_tokens_seen": 68251088, + "step": 117555 + }, + { + "epoch": 17.50968126303247, + "grad_norm": 0.05810546875, + "learning_rate": 0.00139485533962584, + "loss": 0.8085, + "num_input_tokens_seen": 68253872, + "step": 117560 + }, + { + "epoch": 17.51042597557343, + "grad_norm": 0.0244140625, + "learning_rate": 0.001394034436522037, + "loss": 0.7986, + "num_input_tokens_seen": 68256944, + "step": 117565 + }, + { + "epoch": 17.511170688114387, + "grad_norm": 0.039794921875, + "learning_rate": 0.0013932137632771922, + "loss": 0.7975, + "num_input_tokens_seen": 68260112, + "step": 117570 + }, + { + "epoch": 17.511915400655347, + "grad_norm": 0.043701171875, + "learning_rate": 0.0013923933199051753, + "loss": 0.8021, + "num_input_tokens_seen": 68262704, + "step": 117575 + }, + { + "epoch": 17.512660113196308, + "grad_norm": 0.0361328125, + "learning_rate": 0.0013915731064198394, + "loss": 0.799, + "num_input_tokens_seen": 68265552, + "step": 117580 + }, + { + "epoch": 17.513404825737265, + "grad_norm": 0.0546875, + "learning_rate": 0.0013907531228350473, + "loss": 0.7908, + "num_input_tokens_seen": 68268432, + "step": 117585 + }, + { + "epoch": 17.514149538278225, + "grad_norm": 0.115234375, + "learning_rate": 0.0013899333691646493, + "loss": 0.7957, + "num_input_tokens_seen": 68271376, + "step": 117590 + }, + { + "epoch": 17.514894250819182, + "grad_norm": 0.0849609375, + "learning_rate": 0.0013891138454224926, + "loss": 0.8028, + "num_input_tokens_seen": 68274192, + "step": 117595 + }, + { + "epoch": 17.515638963360143, + "grad_norm": 0.0654296875, + "learning_rate": 0.0013882945516224248, + "loss": 0.7701, + "num_input_tokens_seen": 68277328, + "step": 117600 + }, + { + "epoch": 17.516383675901103, + "grad_norm": 0.076171875, + "learning_rate": 0.0013874754877782846, + "loss": 0.7852, + "num_input_tokens_seen": 68280048, + "step": 117605 + }, + { + "epoch": 17.51712838844206, + "grad_norm": 0.05908203125, + "learning_rate": 0.0013866566539039143, + "loss": 0.8079, + "num_input_tokens_seen": 68282896, + "step": 117610 + }, + { + "epoch": 17.51787310098302, + "grad_norm": 0.06201171875, + "learning_rate": 0.0013858380500131428, + "loss": 0.8035, + "num_input_tokens_seen": 68285488, + "step": 117615 + }, + { + "epoch": 17.51861781352398, + "grad_norm": 0.04248046875, + "learning_rate": 0.0013850196761198024, + "loss": 0.811, + "num_input_tokens_seen": 68288240, + "step": 117620 + }, + { + "epoch": 17.51936252606494, + "grad_norm": 0.07275390625, + "learning_rate": 0.0013842015322377153, + "loss": 0.8083, + "num_input_tokens_seen": 68290992, + "step": 117625 + }, + { + "epoch": 17.5201072386059, + "grad_norm": 0.06884765625, + "learning_rate": 0.0013833836183807023, + "loss": 0.8149, + "num_input_tokens_seen": 68293872, + "step": 117630 + }, + { + "epoch": 17.520851951146856, + "grad_norm": 0.06005859375, + "learning_rate": 0.0013825659345625874, + "loss": 0.8003, + "num_input_tokens_seen": 68296816, + "step": 117635 + }, + { + "epoch": 17.521596663687816, + "grad_norm": 0.0654296875, + "learning_rate": 0.0013817484807971774, + "loss": 0.8033, + "num_input_tokens_seen": 68299792, + "step": 117640 + }, + { + "epoch": 17.522341376228777, + "grad_norm": 0.10791015625, + "learning_rate": 0.0013809312570982884, + "loss": 0.7681, + "num_input_tokens_seen": 68302736, + "step": 117645 + }, + { + "epoch": 17.523086088769734, + "grad_norm": 0.0595703125, + "learning_rate": 0.0013801142634797209, + "loss": 0.7673, + "num_input_tokens_seen": 68305680, + "step": 117650 + }, + { + "epoch": 17.523830801310694, + "grad_norm": 0.03369140625, + "learning_rate": 0.0013792974999552836, + "loss": 0.8163, + "num_input_tokens_seen": 68308496, + "step": 117655 + }, + { + "epoch": 17.52457551385165, + "grad_norm": 0.041259765625, + "learning_rate": 0.0013784809665387692, + "loss": 0.8012, + "num_input_tokens_seen": 68311376, + "step": 117660 + }, + { + "epoch": 17.525320226392612, + "grad_norm": 0.059814453125, + "learning_rate": 0.0013776646632439765, + "loss": 0.8353, + "num_input_tokens_seen": 68314096, + "step": 117665 + }, + { + "epoch": 17.526064938933573, + "grad_norm": 0.052001953125, + "learning_rate": 0.0013768485900846944, + "loss": 0.8067, + "num_input_tokens_seen": 68317328, + "step": 117670 + }, + { + "epoch": 17.52680965147453, + "grad_norm": 0.049072265625, + "learning_rate": 0.001376032747074707, + "loss": 0.7901, + "num_input_tokens_seen": 68320208, + "step": 117675 + }, + { + "epoch": 17.52755436401549, + "grad_norm": 0.053955078125, + "learning_rate": 0.001375217134227803, + "loss": 0.8151, + "num_input_tokens_seen": 68323504, + "step": 117680 + }, + { + "epoch": 17.52829907655645, + "grad_norm": 0.0634765625, + "learning_rate": 0.0013744017515577567, + "loss": 0.783, + "num_input_tokens_seen": 68326576, + "step": 117685 + }, + { + "epoch": 17.529043789097408, + "grad_norm": 0.0703125, + "learning_rate": 0.001373586599078345, + "loss": 0.7881, + "num_input_tokens_seen": 68329232, + "step": 117690 + }, + { + "epoch": 17.529788501638368, + "grad_norm": 0.039306640625, + "learning_rate": 0.0013727716768033359, + "loss": 0.8017, + "num_input_tokens_seen": 68332176, + "step": 117695 + }, + { + "epoch": 17.530533214179325, + "grad_norm": 0.052490234375, + "learning_rate": 0.0013719569847465013, + "loss": 0.7955, + "num_input_tokens_seen": 68335472, + "step": 117700 + }, + { + "epoch": 17.531277926720286, + "grad_norm": 0.046875, + "learning_rate": 0.0013711425229216035, + "loss": 0.7867, + "num_input_tokens_seen": 68338384, + "step": 117705 + }, + { + "epoch": 17.532022639261246, + "grad_norm": 0.0673828125, + "learning_rate": 0.0013703282913423965, + "loss": 0.8107, + "num_input_tokens_seen": 68341328, + "step": 117710 + }, + { + "epoch": 17.532767351802203, + "grad_norm": 0.057861328125, + "learning_rate": 0.0013695142900226442, + "loss": 0.8089, + "num_input_tokens_seen": 68344176, + "step": 117715 + }, + { + "epoch": 17.533512064343164, + "grad_norm": 0.0537109375, + "learning_rate": 0.0013687005189760926, + "loss": 0.7962, + "num_input_tokens_seen": 68346832, + "step": 117720 + }, + { + "epoch": 17.534256776884124, + "grad_norm": 0.05029296875, + "learning_rate": 0.0013678869782164937, + "loss": 0.7907, + "num_input_tokens_seen": 68349936, + "step": 117725 + }, + { + "epoch": 17.53500148942508, + "grad_norm": 0.05419921875, + "learning_rate": 0.001367073667757585, + "loss": 0.7955, + "num_input_tokens_seen": 68353168, + "step": 117730 + }, + { + "epoch": 17.53574620196604, + "grad_norm": 0.032958984375, + "learning_rate": 0.0013662605876131155, + "loss": 0.7921, + "num_input_tokens_seen": 68355856, + "step": 117735 + }, + { + "epoch": 17.536490914507, + "grad_norm": 0.06298828125, + "learning_rate": 0.0013654477377968127, + "loss": 0.7957, + "num_input_tokens_seen": 68359120, + "step": 117740 + }, + { + "epoch": 17.53723562704796, + "grad_norm": 0.056396484375, + "learning_rate": 0.0013646351183224169, + "loss": 0.8023, + "num_input_tokens_seen": 68362352, + "step": 117745 + }, + { + "epoch": 17.53798033958892, + "grad_norm": 0.0654296875, + "learning_rate": 0.0013638227292036508, + "loss": 0.7828, + "num_input_tokens_seen": 68365296, + "step": 117750 + }, + { + "epoch": 17.538725052129877, + "grad_norm": 0.04931640625, + "learning_rate": 0.0013630105704542416, + "loss": 0.7938, + "num_input_tokens_seen": 68368240, + "step": 117755 + }, + { + "epoch": 17.539469764670837, + "grad_norm": 0.040283203125, + "learning_rate": 0.0013621986420879085, + "loss": 0.7773, + "num_input_tokens_seen": 68371152, + "step": 117760 + }, + { + "epoch": 17.540214477211798, + "grad_norm": 0.037841796875, + "learning_rate": 0.0013613869441183668, + "loss": 0.8435, + "num_input_tokens_seen": 68373936, + "step": 117765 + }, + { + "epoch": 17.540959189752755, + "grad_norm": 0.061279296875, + "learning_rate": 0.0013605754765593325, + "loss": 0.8021, + "num_input_tokens_seen": 68376848, + "step": 117770 + }, + { + "epoch": 17.541703902293715, + "grad_norm": 0.026123046875, + "learning_rate": 0.0013597642394245096, + "loss": 0.791, + "num_input_tokens_seen": 68379664, + "step": 117775 + }, + { + "epoch": 17.542448614834672, + "grad_norm": 0.07861328125, + "learning_rate": 0.0013589532327276104, + "loss": 0.8073, + "num_input_tokens_seen": 68382480, + "step": 117780 + }, + { + "epoch": 17.543193327375633, + "grad_norm": 0.039794921875, + "learning_rate": 0.0013581424564823323, + "loss": 0.8097, + "num_input_tokens_seen": 68385136, + "step": 117785 + }, + { + "epoch": 17.543938039916593, + "grad_norm": 0.03759765625, + "learning_rate": 0.0013573319107023695, + "loss": 0.7956, + "num_input_tokens_seen": 68387856, + "step": 117790 + }, + { + "epoch": 17.54468275245755, + "grad_norm": 0.051513671875, + "learning_rate": 0.001356521595401421, + "loss": 0.8098, + "num_input_tokens_seen": 68390736, + "step": 117795 + }, + { + "epoch": 17.54542746499851, + "grad_norm": 0.05224609375, + "learning_rate": 0.0013557115105931705, + "loss": 0.8101, + "num_input_tokens_seen": 68393680, + "step": 117800 + }, + { + "epoch": 17.54617217753947, + "grad_norm": 0.056396484375, + "learning_rate": 0.001354901656291309, + "loss": 0.7857, + "num_input_tokens_seen": 68396592, + "step": 117805 + }, + { + "epoch": 17.54691689008043, + "grad_norm": 0.06396484375, + "learning_rate": 0.001354092032509514, + "loss": 0.7877, + "num_input_tokens_seen": 68399472, + "step": 117810 + }, + { + "epoch": 17.54766160262139, + "grad_norm": 0.040283203125, + "learning_rate": 0.0013532826392614677, + "loss": 0.7848, + "num_input_tokens_seen": 68402832, + "step": 117815 + }, + { + "epoch": 17.548406315162346, + "grad_norm": 0.052978515625, + "learning_rate": 0.0013524734765608408, + "loss": 0.7826, + "num_input_tokens_seen": 68405648, + "step": 117820 + }, + { + "epoch": 17.549151027703306, + "grad_norm": 0.04345703125, + "learning_rate": 0.0013516645444213043, + "loss": 0.7995, + "num_input_tokens_seen": 68408528, + "step": 117825 + }, + { + "epoch": 17.549895740244267, + "grad_norm": 0.0732421875, + "learning_rate": 0.0013508558428565237, + "loss": 0.8007, + "num_input_tokens_seen": 68411440, + "step": 117830 + }, + { + "epoch": 17.550640452785224, + "grad_norm": 0.11279296875, + "learning_rate": 0.001350047371880158, + "loss": 0.7929, + "num_input_tokens_seen": 68414480, + "step": 117835 + }, + { + "epoch": 17.551385165326185, + "grad_norm": 0.04443359375, + "learning_rate": 0.0013492391315058716, + "loss": 0.8067, + "num_input_tokens_seen": 68417264, + "step": 117840 + }, + { + "epoch": 17.552129877867145, + "grad_norm": 0.059326171875, + "learning_rate": 0.0013484311217473132, + "loss": 0.803, + "num_input_tokens_seen": 68420272, + "step": 117845 + }, + { + "epoch": 17.552874590408102, + "grad_norm": 0.05029296875, + "learning_rate": 0.0013476233426181388, + "loss": 0.7967, + "num_input_tokens_seen": 68423312, + "step": 117850 + }, + { + "epoch": 17.553619302949063, + "grad_norm": 0.0341796875, + "learning_rate": 0.0013468157941319908, + "loss": 0.7917, + "num_input_tokens_seen": 68426352, + "step": 117855 + }, + { + "epoch": 17.55436401549002, + "grad_norm": 0.07666015625, + "learning_rate": 0.0013460084763025149, + "loss": 0.8373, + "num_input_tokens_seen": 68429456, + "step": 117860 + }, + { + "epoch": 17.55510872803098, + "grad_norm": 0.052978515625, + "learning_rate": 0.00134520138914335, + "loss": 0.8065, + "num_input_tokens_seen": 68431952, + "step": 117865 + }, + { + "epoch": 17.55585344057194, + "grad_norm": 0.057861328125, + "learning_rate": 0.0013443945326681256, + "loss": 0.7898, + "num_input_tokens_seen": 68435120, + "step": 117870 + }, + { + "epoch": 17.556598153112898, + "grad_norm": 0.04833984375, + "learning_rate": 0.0013435879068904788, + "loss": 0.7902, + "num_input_tokens_seen": 68438064, + "step": 117875 + }, + { + "epoch": 17.557342865653858, + "grad_norm": 0.07080078125, + "learning_rate": 0.0013427815118240353, + "loss": 0.8097, + "num_input_tokens_seen": 68440880, + "step": 117880 + }, + { + "epoch": 17.558087578194815, + "grad_norm": 0.056884765625, + "learning_rate": 0.0013419753474824146, + "loss": 0.7828, + "num_input_tokens_seen": 68444080, + "step": 117885 + }, + { + "epoch": 17.558832290735776, + "grad_norm": 0.04052734375, + "learning_rate": 0.0013411694138792424, + "loss": 0.7817, + "num_input_tokens_seen": 68447088, + "step": 117890 + }, + { + "epoch": 17.559577003276736, + "grad_norm": 0.043701171875, + "learning_rate": 0.0013403637110281307, + "loss": 0.8059, + "num_input_tokens_seen": 68449936, + "step": 117895 + }, + { + "epoch": 17.560321715817693, + "grad_norm": 0.0322265625, + "learning_rate": 0.0013395582389426875, + "loss": 0.7954, + "num_input_tokens_seen": 68453168, + "step": 117900 + }, + { + "epoch": 17.561066428358654, + "grad_norm": 0.031494140625, + "learning_rate": 0.001338752997636528, + "loss": 0.8025, + "num_input_tokens_seen": 68456016, + "step": 117905 + }, + { + "epoch": 17.561811140899614, + "grad_norm": 0.037353515625, + "learning_rate": 0.0013379479871232502, + "loss": 0.792, + "num_input_tokens_seen": 68459088, + "step": 117910 + }, + { + "epoch": 17.56255585344057, + "grad_norm": 0.068359375, + "learning_rate": 0.0013371432074164545, + "loss": 0.792, + "num_input_tokens_seen": 68462000, + "step": 117915 + }, + { + "epoch": 17.56330056598153, + "grad_norm": 0.0673828125, + "learning_rate": 0.0013363386585297404, + "loss": 0.818, + "num_input_tokens_seen": 68465040, + "step": 117920 + }, + { + "epoch": 17.56404527852249, + "grad_norm": 0.041015625, + "learning_rate": 0.0013355343404766933, + "loss": 0.7965, + "num_input_tokens_seen": 68468240, + "step": 117925 + }, + { + "epoch": 17.56478999106345, + "grad_norm": 0.039306640625, + "learning_rate": 0.0013347302532709109, + "loss": 0.7856, + "num_input_tokens_seen": 68471120, + "step": 117930 + }, + { + "epoch": 17.56553470360441, + "grad_norm": 0.0400390625, + "learning_rate": 0.0013339263969259674, + "loss": 0.7865, + "num_input_tokens_seen": 68474064, + "step": 117935 + }, + { + "epoch": 17.566279416145367, + "grad_norm": 0.1083984375, + "learning_rate": 0.0013331227714554516, + "loss": 0.7989, + "num_input_tokens_seen": 68476848, + "step": 117940 + }, + { + "epoch": 17.567024128686327, + "grad_norm": 0.059814453125, + "learning_rate": 0.0013323193768729345, + "loss": 0.8112, + "num_input_tokens_seen": 68479920, + "step": 117945 + }, + { + "epoch": 17.567768841227288, + "grad_norm": 0.0537109375, + "learning_rate": 0.001331516213191992, + "loss": 0.7907, + "num_input_tokens_seen": 68482512, + "step": 117950 + }, + { + "epoch": 17.568513553768245, + "grad_norm": 0.0625, + "learning_rate": 0.0013307132804261917, + "loss": 0.8048, + "num_input_tokens_seen": 68485392, + "step": 117955 + }, + { + "epoch": 17.569258266309205, + "grad_norm": 0.037109375, + "learning_rate": 0.0013299105785890923, + "loss": 0.8011, + "num_input_tokens_seen": 68488240, + "step": 117960 + }, + { + "epoch": 17.570002978850162, + "grad_norm": 0.041748046875, + "learning_rate": 0.001329108107694265, + "loss": 0.7762, + "num_input_tokens_seen": 68491120, + "step": 117965 + }, + { + "epoch": 17.570747691391123, + "grad_norm": 0.0830078125, + "learning_rate": 0.0013283058677552588, + "loss": 0.7992, + "num_input_tokens_seen": 68493968, + "step": 117970 + }, + { + "epoch": 17.571492403932083, + "grad_norm": 0.044921875, + "learning_rate": 0.0013275038587856314, + "loss": 0.8032, + "num_input_tokens_seen": 68496848, + "step": 117975 + }, + { + "epoch": 17.57223711647304, + "grad_norm": 0.04443359375, + "learning_rate": 0.0013267020807989282, + "loss": 0.7762, + "num_input_tokens_seen": 68499664, + "step": 117980 + }, + { + "epoch": 17.572981829014, + "grad_norm": 0.039306640625, + "learning_rate": 0.0013259005338086987, + "loss": 0.785, + "num_input_tokens_seen": 68502608, + "step": 117985 + }, + { + "epoch": 17.57372654155496, + "grad_norm": 0.034423828125, + "learning_rate": 0.001325099217828482, + "loss": 0.8165, + "num_input_tokens_seen": 68505296, + "step": 117990 + }, + { + "epoch": 17.57447125409592, + "grad_norm": 0.04443359375, + "learning_rate": 0.0013242981328718138, + "loss": 0.7832, + "num_input_tokens_seen": 68508336, + "step": 117995 + }, + { + "epoch": 17.57521596663688, + "grad_norm": 0.051025390625, + "learning_rate": 0.0013234972789522303, + "loss": 0.7918, + "num_input_tokens_seen": 68512208, + "step": 118000 + }, + { + "epoch": 17.575960679177836, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0013226966560832586, + "loss": 0.8023, + "num_input_tokens_seen": 68515216, + "step": 118005 + }, + { + "epoch": 17.576705391718797, + "grad_norm": 0.055419921875, + "learning_rate": 0.001321896264278428, + "loss": 0.7743, + "num_input_tokens_seen": 68518224, + "step": 118010 + }, + { + "epoch": 17.577450104259757, + "grad_norm": 0.046875, + "learning_rate": 0.0013210961035512597, + "loss": 0.8346, + "num_input_tokens_seen": 68521200, + "step": 118015 + }, + { + "epoch": 17.578194816800714, + "grad_norm": 0.0673828125, + "learning_rate": 0.001320296173915269, + "loss": 0.8056, + "num_input_tokens_seen": 68524272, + "step": 118020 + }, + { + "epoch": 17.578939529341675, + "grad_norm": 0.048828125, + "learning_rate": 0.0013194964753839706, + "loss": 0.7801, + "num_input_tokens_seen": 68527120, + "step": 118025 + }, + { + "epoch": 17.57968424188263, + "grad_norm": 0.05419921875, + "learning_rate": 0.001318697007970873, + "loss": 0.821, + "num_input_tokens_seen": 68529808, + "step": 118030 + }, + { + "epoch": 17.580428954423592, + "grad_norm": 0.052490234375, + "learning_rate": 0.0013178977716894863, + "loss": 0.7968, + "num_input_tokens_seen": 68532912, + "step": 118035 + }, + { + "epoch": 17.581173666964553, + "grad_norm": 0.027587890625, + "learning_rate": 0.001317098766553309, + "loss": 0.8154, + "num_input_tokens_seen": 68535696, + "step": 118040 + }, + { + "epoch": 17.58191837950551, + "grad_norm": 0.05078125, + "learning_rate": 0.0013162999925758422, + "loss": 0.7922, + "num_input_tokens_seen": 68538416, + "step": 118045 + }, + { + "epoch": 17.58266309204647, + "grad_norm": 0.06689453125, + "learning_rate": 0.0013155014497705786, + "loss": 0.7955, + "num_input_tokens_seen": 68541264, + "step": 118050 + }, + { + "epoch": 17.58340780458743, + "grad_norm": 0.072265625, + "learning_rate": 0.001314703138151012, + "loss": 0.7791, + "num_input_tokens_seen": 68544272, + "step": 118055 + }, + { + "epoch": 17.584152517128388, + "grad_norm": 0.049072265625, + "learning_rate": 0.0013139050577306254, + "loss": 0.813, + "num_input_tokens_seen": 68547376, + "step": 118060 + }, + { + "epoch": 17.584897229669348, + "grad_norm": 0.0537109375, + "learning_rate": 0.0013131072085229044, + "loss": 0.8171, + "num_input_tokens_seen": 68550928, + "step": 118065 + }, + { + "epoch": 17.585641942210305, + "grad_norm": 0.09228515625, + "learning_rate": 0.001312309590541325, + "loss": 0.8005, + "num_input_tokens_seen": 68553680, + "step": 118070 + }, + { + "epoch": 17.586386654751266, + "grad_norm": 0.055419921875, + "learning_rate": 0.0013115122037993632, + "loss": 0.7945, + "num_input_tokens_seen": 68556592, + "step": 118075 + }, + { + "epoch": 17.587131367292226, + "grad_norm": 0.0400390625, + "learning_rate": 0.001310715048310491, + "loss": 0.7997, + "num_input_tokens_seen": 68559568, + "step": 118080 + }, + { + "epoch": 17.587876079833183, + "grad_norm": 0.0771484375, + "learning_rate": 0.0013099181240881768, + "loss": 0.8, + "num_input_tokens_seen": 68562096, + "step": 118085 + }, + { + "epoch": 17.588620792374144, + "grad_norm": 0.038330078125, + "learning_rate": 0.0013091214311458808, + "loss": 0.8039, + "num_input_tokens_seen": 68565104, + "step": 118090 + }, + { + "epoch": 17.589365504915104, + "grad_norm": 0.0380859375, + "learning_rate": 0.001308324969497061, + "loss": 0.7976, + "num_input_tokens_seen": 68567920, + "step": 118095 + }, + { + "epoch": 17.59011021745606, + "grad_norm": 0.07470703125, + "learning_rate": 0.0013075287391551781, + "loss": 0.8037, + "num_input_tokens_seen": 68570800, + "step": 118100 + }, + { + "epoch": 17.590854929997022, + "grad_norm": 0.09912109375, + "learning_rate": 0.001306732740133678, + "loss": 0.7844, + "num_input_tokens_seen": 68573616, + "step": 118105 + }, + { + "epoch": 17.59159964253798, + "grad_norm": 0.0654296875, + "learning_rate": 0.0013059369724460134, + "loss": 0.7899, + "num_input_tokens_seen": 68576528, + "step": 118110 + }, + { + "epoch": 17.59234435507894, + "grad_norm": 0.033447265625, + "learning_rate": 0.001305141436105625, + "loss": 0.8136, + "num_input_tokens_seen": 68579408, + "step": 118115 + }, + { + "epoch": 17.5930890676199, + "grad_norm": 0.06005859375, + "learning_rate": 0.0013043461311259524, + "loss": 0.7872, + "num_input_tokens_seen": 68582128, + "step": 118120 + }, + { + "epoch": 17.593833780160857, + "grad_norm": 0.045654296875, + "learning_rate": 0.0013035510575204344, + "loss": 0.838, + "num_input_tokens_seen": 68584752, + "step": 118125 + }, + { + "epoch": 17.594578492701817, + "grad_norm": 0.04833984375, + "learning_rate": 0.0013027562153024974, + "loss": 0.771, + "num_input_tokens_seen": 68587600, + "step": 118130 + }, + { + "epoch": 17.595323205242778, + "grad_norm": 0.046875, + "learning_rate": 0.0013019616044855752, + "loss": 0.7935, + "num_input_tokens_seen": 68590800, + "step": 118135 + }, + { + "epoch": 17.596067917783735, + "grad_norm": 0.045166015625, + "learning_rate": 0.0013011672250830874, + "loss": 0.8117, + "num_input_tokens_seen": 68593712, + "step": 118140 + }, + { + "epoch": 17.596812630324695, + "grad_norm": 0.03515625, + "learning_rate": 0.0013003730771084599, + "loss": 0.7881, + "num_input_tokens_seen": 68596624, + "step": 118145 + }, + { + "epoch": 17.597557342865652, + "grad_norm": 0.038818359375, + "learning_rate": 0.001299579160575105, + "loss": 0.8173, + "num_input_tokens_seen": 68599728, + "step": 118150 + }, + { + "epoch": 17.598302055406613, + "grad_norm": 0.041748046875, + "learning_rate": 0.0012987854754964356, + "loss": 0.7879, + "num_input_tokens_seen": 68602448, + "step": 118155 + }, + { + "epoch": 17.599046767947573, + "grad_norm": 0.060791015625, + "learning_rate": 0.001297992021885861, + "loss": 0.8093, + "num_input_tokens_seen": 68604976, + "step": 118160 + }, + { + "epoch": 17.59979148048853, + "grad_norm": 0.049072265625, + "learning_rate": 0.001297198799756782, + "loss": 0.812, + "num_input_tokens_seen": 68607824, + "step": 118165 + }, + { + "epoch": 17.60053619302949, + "grad_norm": 0.03515625, + "learning_rate": 0.0012964058091226042, + "loss": 0.8215, + "num_input_tokens_seen": 68610512, + "step": 118170 + }, + { + "epoch": 17.601280905570448, + "grad_norm": 0.06298828125, + "learning_rate": 0.0012956130499967194, + "loss": 0.791, + "num_input_tokens_seen": 68613328, + "step": 118175 + }, + { + "epoch": 17.60202561811141, + "grad_norm": 0.038818359375, + "learning_rate": 0.001294820522392528, + "loss": 0.8172, + "num_input_tokens_seen": 68616240, + "step": 118180 + }, + { + "epoch": 17.60277033065237, + "grad_norm": 0.044921875, + "learning_rate": 0.001294028226323411, + "loss": 0.8283, + "num_input_tokens_seen": 68619088, + "step": 118185 + }, + { + "epoch": 17.603515043193326, + "grad_norm": 0.0693359375, + "learning_rate": 0.001293236161802761, + "loss": 0.788, + "num_input_tokens_seen": 68621936, + "step": 118190 + }, + { + "epoch": 17.604259755734287, + "grad_norm": 0.02880859375, + "learning_rate": 0.001292444328843954, + "loss": 0.7925, + "num_input_tokens_seen": 68624656, + "step": 118195 + }, + { + "epoch": 17.605004468275247, + "grad_norm": 0.039306640625, + "learning_rate": 0.0012916527274603644, + "loss": 0.7961, + "num_input_tokens_seen": 68627536, + "step": 118200 + }, + { + "epoch": 17.605749180816204, + "grad_norm": 0.0380859375, + "learning_rate": 0.0012908613576653749, + "loss": 0.8115, + "num_input_tokens_seen": 68630320, + "step": 118205 + }, + { + "epoch": 17.606493893357165, + "grad_norm": 0.0712890625, + "learning_rate": 0.0012900702194723478, + "loss": 0.7824, + "num_input_tokens_seen": 68633168, + "step": 118210 + }, + { + "epoch": 17.60723860589812, + "grad_norm": 0.0625, + "learning_rate": 0.0012892793128946479, + "loss": 0.783, + "num_input_tokens_seen": 68636176, + "step": 118215 + }, + { + "epoch": 17.607983318439082, + "grad_norm": 0.04541015625, + "learning_rate": 0.0012884886379456405, + "loss": 0.787, + "num_input_tokens_seen": 68639056, + "step": 118220 + }, + { + "epoch": 17.608728030980043, + "grad_norm": 0.06494140625, + "learning_rate": 0.0012876981946386822, + "loss": 0.7894, + "num_input_tokens_seen": 68641904, + "step": 118225 + }, + { + "epoch": 17.609472743521, + "grad_norm": 0.06103515625, + "learning_rate": 0.0012869079829871271, + "loss": 0.7961, + "num_input_tokens_seen": 68644656, + "step": 118230 + }, + { + "epoch": 17.61021745606196, + "grad_norm": 0.06298828125, + "learning_rate": 0.0012861180030043211, + "loss": 0.7814, + "num_input_tokens_seen": 68647728, + "step": 118235 + }, + { + "epoch": 17.61096216860292, + "grad_norm": 0.06103515625, + "learning_rate": 0.001285328254703617, + "loss": 0.7959, + "num_input_tokens_seen": 68650352, + "step": 118240 + }, + { + "epoch": 17.611706881143878, + "grad_norm": 0.04638671875, + "learning_rate": 0.0012845387380983491, + "loss": 0.8021, + "num_input_tokens_seen": 68653424, + "step": 118245 + }, + { + "epoch": 17.61245159368484, + "grad_norm": 0.040283203125, + "learning_rate": 0.0012837494532018635, + "loss": 0.8124, + "num_input_tokens_seen": 68656176, + "step": 118250 + }, + { + "epoch": 17.613196306225795, + "grad_norm": 0.0400390625, + "learning_rate": 0.0012829604000274874, + "loss": 0.7818, + "num_input_tokens_seen": 68659120, + "step": 118255 + }, + { + "epoch": 17.613941018766756, + "grad_norm": 0.055908203125, + "learning_rate": 0.0012821715785885557, + "loss": 0.803, + "num_input_tokens_seen": 68662352, + "step": 118260 + }, + { + "epoch": 17.614685731307716, + "grad_norm": 0.07421875, + "learning_rate": 0.0012813829888983907, + "loss": 0.7776, + "num_input_tokens_seen": 68665872, + "step": 118265 + }, + { + "epoch": 17.615430443848673, + "grad_norm": 0.038330078125, + "learning_rate": 0.0012805946309703187, + "loss": 0.7949, + "num_input_tokens_seen": 68668688, + "step": 118270 + }, + { + "epoch": 17.616175156389634, + "grad_norm": 0.0673828125, + "learning_rate": 0.0012798065048176587, + "loss": 0.7973, + "num_input_tokens_seen": 68671760, + "step": 118275 + }, + { + "epoch": 17.616919868930594, + "grad_norm": 0.028564453125, + "learning_rate": 0.0012790186104537221, + "loss": 0.7762, + "num_input_tokens_seen": 68674416, + "step": 118280 + }, + { + "epoch": 17.61766458147155, + "grad_norm": 0.03857421875, + "learning_rate": 0.0012782309478918212, + "loss": 0.8451, + "num_input_tokens_seen": 68677424, + "step": 118285 + }, + { + "epoch": 17.618409294012512, + "grad_norm": 0.032470703125, + "learning_rate": 0.0012774435171452592, + "loss": 0.793, + "num_input_tokens_seen": 68680368, + "step": 118290 + }, + { + "epoch": 17.61915400655347, + "grad_norm": 0.04248046875, + "learning_rate": 0.0012766563182273433, + "loss": 0.7769, + "num_input_tokens_seen": 68683344, + "step": 118295 + }, + { + "epoch": 17.61989871909443, + "grad_norm": 0.06787109375, + "learning_rate": 0.0012758693511513712, + "loss": 0.7972, + "num_input_tokens_seen": 68686256, + "step": 118300 + }, + { + "epoch": 17.62064343163539, + "grad_norm": 0.033447265625, + "learning_rate": 0.0012750826159306377, + "loss": 0.7893, + "num_input_tokens_seen": 68689008, + "step": 118305 + }, + { + "epoch": 17.621388144176347, + "grad_norm": 0.1396484375, + "learning_rate": 0.0012742961125784369, + "loss": 0.7837, + "num_input_tokens_seen": 68691760, + "step": 118310 + }, + { + "epoch": 17.622132856717307, + "grad_norm": 0.055419921875, + "learning_rate": 0.0012735098411080482, + "loss": 0.799, + "num_input_tokens_seen": 68694736, + "step": 118315 + }, + { + "epoch": 17.622877569258268, + "grad_norm": 0.04638671875, + "learning_rate": 0.0012727238015327624, + "loss": 0.8042, + "num_input_tokens_seen": 68697968, + "step": 118320 + }, + { + "epoch": 17.623622281799225, + "grad_norm": 0.07421875, + "learning_rate": 0.0012719379938658558, + "loss": 0.7886, + "num_input_tokens_seen": 68702000, + "step": 118325 + }, + { + "epoch": 17.624366994340185, + "grad_norm": 0.0556640625, + "learning_rate": 0.0012711524181206045, + "loss": 0.7845, + "num_input_tokens_seen": 68704784, + "step": 118330 + }, + { + "epoch": 17.625111706881142, + "grad_norm": 0.05029296875, + "learning_rate": 0.0012703670743102779, + "loss": 0.7987, + "num_input_tokens_seen": 68707536, + "step": 118335 + }, + { + "epoch": 17.625856419422103, + "grad_norm": 0.03125, + "learning_rate": 0.0012695819624481486, + "loss": 0.8069, + "num_input_tokens_seen": 68710544, + "step": 118340 + }, + { + "epoch": 17.626601131963064, + "grad_norm": 0.037109375, + "learning_rate": 0.001268797082547476, + "loss": 0.7903, + "num_input_tokens_seen": 68713424, + "step": 118345 + }, + { + "epoch": 17.62734584450402, + "grad_norm": 0.02734375, + "learning_rate": 0.001268012434621521, + "loss": 0.7985, + "num_input_tokens_seen": 68716432, + "step": 118350 + }, + { + "epoch": 17.62809055704498, + "grad_norm": 0.0751953125, + "learning_rate": 0.00126722801868354, + "loss": 0.8031, + "num_input_tokens_seen": 68719408, + "step": 118355 + }, + { + "epoch": 17.62883526958594, + "grad_norm": 0.10546875, + "learning_rate": 0.001266443834746782, + "loss": 0.8113, + "num_input_tokens_seen": 68722224, + "step": 118360 + }, + { + "epoch": 17.6295799821269, + "grad_norm": 0.03759765625, + "learning_rate": 0.0012656598828245003, + "loss": 0.787, + "num_input_tokens_seen": 68724912, + "step": 118365 + }, + { + "epoch": 17.63032469466786, + "grad_norm": 0.042236328125, + "learning_rate": 0.001264876162929932, + "loss": 0.763, + "num_input_tokens_seen": 68728048, + "step": 118370 + }, + { + "epoch": 17.631069407208816, + "grad_norm": 0.06640625, + "learning_rate": 0.0012640926750763237, + "loss": 0.8164, + "num_input_tokens_seen": 68730768, + "step": 118375 + }, + { + "epoch": 17.631814119749777, + "grad_norm": 0.0947265625, + "learning_rate": 0.0012633094192769062, + "loss": 0.7935, + "num_input_tokens_seen": 68733328, + "step": 118380 + }, + { + "epoch": 17.632558832290737, + "grad_norm": 0.059326171875, + "learning_rate": 0.001262526395544919, + "loss": 0.7821, + "num_input_tokens_seen": 68736624, + "step": 118385 + }, + { + "epoch": 17.633303544831694, + "grad_norm": 0.068359375, + "learning_rate": 0.0012617436038935847, + "loss": 0.8005, + "num_input_tokens_seen": 68739600, + "step": 118390 + }, + { + "epoch": 17.634048257372655, + "grad_norm": 0.050048828125, + "learning_rate": 0.001260961044336128, + "loss": 0.8167, + "num_input_tokens_seen": 68742416, + "step": 118395 + }, + { + "epoch": 17.63479296991361, + "grad_norm": 0.06005859375, + "learning_rate": 0.0012601787168857715, + "loss": 0.807, + "num_input_tokens_seen": 68745328, + "step": 118400 + }, + { + "epoch": 17.635537682454572, + "grad_norm": 0.06103515625, + "learning_rate": 0.0012593966215557295, + "loss": 0.7736, + "num_input_tokens_seen": 68748240, + "step": 118405 + }, + { + "epoch": 17.636282394995533, + "grad_norm": 0.042724609375, + "learning_rate": 0.00125861475835922, + "loss": 0.7986, + "num_input_tokens_seen": 68751184, + "step": 118410 + }, + { + "epoch": 17.63702710753649, + "grad_norm": 0.0654296875, + "learning_rate": 0.0012578331273094456, + "loss": 0.8203, + "num_input_tokens_seen": 68754096, + "step": 118415 + }, + { + "epoch": 17.63777182007745, + "grad_norm": 0.05810546875, + "learning_rate": 0.0012570517284196158, + "loss": 0.7922, + "num_input_tokens_seen": 68757040, + "step": 118420 + }, + { + "epoch": 17.63851653261841, + "grad_norm": 0.08837890625, + "learning_rate": 0.0012562705617029251, + "loss": 0.8362, + "num_input_tokens_seen": 68759888, + "step": 118425 + }, + { + "epoch": 17.639261245159368, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0012554896271725778, + "loss": 0.7923, + "num_input_tokens_seen": 68762800, + "step": 118430 + }, + { + "epoch": 17.64000595770033, + "grad_norm": 0.036865234375, + "learning_rate": 0.0012547089248417652, + "loss": 0.8025, + "num_input_tokens_seen": 68765776, + "step": 118435 + }, + { + "epoch": 17.640750670241285, + "grad_norm": 0.03515625, + "learning_rate": 0.0012539284547236718, + "loss": 0.7937, + "num_input_tokens_seen": 68768592, + "step": 118440 + }, + { + "epoch": 17.641495382782246, + "grad_norm": 0.04638671875, + "learning_rate": 0.0012531482168314882, + "loss": 0.799, + "num_input_tokens_seen": 68771760, + "step": 118445 + }, + { + "epoch": 17.642240095323206, + "grad_norm": 0.0712890625, + "learning_rate": 0.0012523682111783912, + "loss": 0.8221, + "num_input_tokens_seen": 68774576, + "step": 118450 + }, + { + "epoch": 17.642984807864163, + "grad_norm": 0.06201171875, + "learning_rate": 0.001251588437777565, + "loss": 0.7916, + "num_input_tokens_seen": 68777552, + "step": 118455 + }, + { + "epoch": 17.643729520405124, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0012508088966421737, + "loss": 0.7889, + "num_input_tokens_seen": 68780592, + "step": 118460 + }, + { + "epoch": 17.644474232946084, + "grad_norm": 0.05224609375, + "learning_rate": 0.0012500295877853973, + "loss": 0.8047, + "num_input_tokens_seen": 68783536, + "step": 118465 + }, + { + "epoch": 17.64521894548704, + "grad_norm": 0.038330078125, + "learning_rate": 0.001249250511220395, + "loss": 0.8008, + "num_input_tokens_seen": 68786480, + "step": 118470 + }, + { + "epoch": 17.645963658028002, + "grad_norm": 0.03759765625, + "learning_rate": 0.0012484716669603263, + "loss": 0.7979, + "num_input_tokens_seen": 68789424, + "step": 118475 + }, + { + "epoch": 17.64670837056896, + "grad_norm": 0.041259765625, + "learning_rate": 0.0012476930550183556, + "loss": 0.7978, + "num_input_tokens_seen": 68792144, + "step": 118480 + }, + { + "epoch": 17.64745308310992, + "grad_norm": 0.06787109375, + "learning_rate": 0.0012469146754076343, + "loss": 0.7849, + "num_input_tokens_seen": 68795088, + "step": 118485 + }, + { + "epoch": 17.64819779565088, + "grad_norm": 0.0654296875, + "learning_rate": 0.0012461365281413116, + "loss": 0.7971, + "num_input_tokens_seen": 68798160, + "step": 118490 + }, + { + "epoch": 17.648942508191837, + "grad_norm": 0.0458984375, + "learning_rate": 0.001245358613232529, + "loss": 0.7957, + "num_input_tokens_seen": 68801232, + "step": 118495 + }, + { + "epoch": 17.649687220732797, + "grad_norm": 0.036865234375, + "learning_rate": 0.0012445809306944372, + "loss": 0.7858, + "num_input_tokens_seen": 68804176, + "step": 118500 + }, + { + "epoch": 17.650431933273758, + "grad_norm": 0.03759765625, + "learning_rate": 0.0012438034805401677, + "loss": 0.8106, + "num_input_tokens_seen": 68806928, + "step": 118505 + }, + { + "epoch": 17.651176645814715, + "grad_norm": 0.0439453125, + "learning_rate": 0.0012430262627828581, + "loss": 0.7846, + "num_input_tokens_seen": 68809616, + "step": 118510 + }, + { + "epoch": 17.651921358355676, + "grad_norm": 0.037353515625, + "learning_rate": 0.0012422492774356397, + "loss": 0.7907, + "num_input_tokens_seen": 68812496, + "step": 118515 + }, + { + "epoch": 17.652666070896633, + "grad_norm": 0.03515625, + "learning_rate": 0.0012414725245116338, + "loss": 0.8039, + "num_input_tokens_seen": 68815312, + "step": 118520 + }, + { + "epoch": 17.653410783437593, + "grad_norm": 0.06005859375, + "learning_rate": 0.001240696004023968, + "loss": 0.7796, + "num_input_tokens_seen": 68818384, + "step": 118525 + }, + { + "epoch": 17.654155495978554, + "grad_norm": 0.0498046875, + "learning_rate": 0.0012399197159857567, + "loss": 0.7914, + "num_input_tokens_seen": 68820976, + "step": 118530 + }, + { + "epoch": 17.65490020851951, + "grad_norm": 0.064453125, + "learning_rate": 0.0012391436604101197, + "loss": 0.7841, + "num_input_tokens_seen": 68823792, + "step": 118535 + }, + { + "epoch": 17.65564492106047, + "grad_norm": 0.044677734375, + "learning_rate": 0.0012383678373101648, + "loss": 0.7986, + "num_input_tokens_seen": 68826896, + "step": 118540 + }, + { + "epoch": 17.656389633601428, + "grad_norm": 0.059814453125, + "learning_rate": 0.0012375922466989963, + "loss": 0.8188, + "num_input_tokens_seen": 68829808, + "step": 118545 + }, + { + "epoch": 17.65713434614239, + "grad_norm": 0.056396484375, + "learning_rate": 0.0012368168885897206, + "loss": 0.7977, + "num_input_tokens_seen": 68832624, + "step": 118550 + }, + { + "epoch": 17.65787905868335, + "grad_norm": 0.06787109375, + "learning_rate": 0.0012360417629954357, + "loss": 0.7969, + "num_input_tokens_seen": 68835376, + "step": 118555 + }, + { + "epoch": 17.658623771224306, + "grad_norm": 0.052001953125, + "learning_rate": 0.0012352668699292374, + "loss": 0.7951, + "num_input_tokens_seen": 68838800, + "step": 118560 + }, + { + "epoch": 17.659368483765267, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0012344922094042103, + "loss": 0.7918, + "num_input_tokens_seen": 68841520, + "step": 118565 + }, + { + "epoch": 17.660113196306227, + "grad_norm": 0.055908203125, + "learning_rate": 0.001233717781433451, + "loss": 0.8065, + "num_input_tokens_seen": 68844432, + "step": 118570 + }, + { + "epoch": 17.660857908847184, + "grad_norm": 0.04296875, + "learning_rate": 0.0012329435860300353, + "loss": 0.8049, + "num_input_tokens_seen": 68847472, + "step": 118575 + }, + { + "epoch": 17.661602621388145, + "grad_norm": 0.036865234375, + "learning_rate": 0.0012321696232070495, + "loss": 0.7911, + "num_input_tokens_seen": 68850672, + "step": 118580 + }, + { + "epoch": 17.6623473339291, + "grad_norm": 0.056640625, + "learning_rate": 0.0012313958929775614, + "loss": 0.8167, + "num_input_tokens_seen": 68853584, + "step": 118585 + }, + { + "epoch": 17.663092046470062, + "grad_norm": 0.06640625, + "learning_rate": 0.0012306223953546475, + "loss": 0.7894, + "num_input_tokens_seen": 68856528, + "step": 118590 + }, + { + "epoch": 17.663836759011023, + "grad_norm": 0.025390625, + "learning_rate": 0.0012298491303513752, + "loss": 0.7826, + "num_input_tokens_seen": 68859504, + "step": 118595 + }, + { + "epoch": 17.66458147155198, + "grad_norm": 0.0478515625, + "learning_rate": 0.001229076097980803, + "loss": 0.7953, + "num_input_tokens_seen": 68862416, + "step": 118600 + }, + { + "epoch": 17.66532618409294, + "grad_norm": 0.035888671875, + "learning_rate": 0.0012283032982559965, + "loss": 0.7904, + "num_input_tokens_seen": 68865520, + "step": 118605 + }, + { + "epoch": 17.6660708966339, + "grad_norm": 0.025634765625, + "learning_rate": 0.001227530731190009, + "loss": 0.809, + "num_input_tokens_seen": 68868144, + "step": 118610 + }, + { + "epoch": 17.666815609174858, + "grad_norm": 0.40625, + "learning_rate": 0.0012267583967958916, + "loss": 0.8615, + "num_input_tokens_seen": 68870896, + "step": 118615 + }, + { + "epoch": 17.66756032171582, + "grad_norm": 0.05419921875, + "learning_rate": 0.0012259862950866906, + "loss": 0.8002, + "num_input_tokens_seen": 68873520, + "step": 118620 + }, + { + "epoch": 17.668305034256775, + "grad_norm": 0.052490234375, + "learning_rate": 0.0012252144260754554, + "loss": 0.7948, + "num_input_tokens_seen": 68876304, + "step": 118625 + }, + { + "epoch": 17.669049746797736, + "grad_norm": 0.044677734375, + "learning_rate": 0.0012244427897752208, + "loss": 0.7907, + "num_input_tokens_seen": 68879088, + "step": 118630 + }, + { + "epoch": 17.669794459338696, + "grad_norm": 0.037109375, + "learning_rate": 0.0012236713861990228, + "loss": 0.8088, + "num_input_tokens_seen": 68881872, + "step": 118635 + }, + { + "epoch": 17.670539171879653, + "grad_norm": 0.09765625, + "learning_rate": 0.0012229002153598977, + "loss": 0.803, + "num_input_tokens_seen": 68884784, + "step": 118640 + }, + { + "epoch": 17.671283884420614, + "grad_norm": 0.034912109375, + "learning_rate": 0.0012221292772708668, + "loss": 0.8337, + "num_input_tokens_seen": 68887728, + "step": 118645 + }, + { + "epoch": 17.672028596961574, + "grad_norm": 0.0595703125, + "learning_rate": 0.0012213585719449633, + "loss": 0.7999, + "num_input_tokens_seen": 68890768, + "step": 118650 + }, + { + "epoch": 17.67277330950253, + "grad_norm": 0.035888671875, + "learning_rate": 0.0012205880993951994, + "loss": 0.7836, + "num_input_tokens_seen": 68893648, + "step": 118655 + }, + { + "epoch": 17.673518022043492, + "grad_norm": 0.037109375, + "learning_rate": 0.001219817859634597, + "loss": 0.7998, + "num_input_tokens_seen": 68896528, + "step": 118660 + }, + { + "epoch": 17.67426273458445, + "grad_norm": 0.023681640625, + "learning_rate": 0.0012190478526761638, + "loss": 0.7955, + "num_input_tokens_seen": 68899152, + "step": 118665 + }, + { + "epoch": 17.67500744712541, + "grad_norm": 0.061279296875, + "learning_rate": 0.0012182780785329128, + "loss": 0.803, + "num_input_tokens_seen": 68902416, + "step": 118670 + }, + { + "epoch": 17.67575215966637, + "grad_norm": 0.05322265625, + "learning_rate": 0.0012175085372178485, + "loss": 0.771, + "num_input_tokens_seen": 68905328, + "step": 118675 + }, + { + "epoch": 17.676496872207327, + "grad_norm": 0.06103515625, + "learning_rate": 0.001216739228743967, + "loss": 0.7728, + "num_input_tokens_seen": 68907920, + "step": 118680 + }, + { + "epoch": 17.677241584748288, + "grad_norm": 0.06787109375, + "learning_rate": 0.00121597015312427, + "loss": 0.8106, + "num_input_tokens_seen": 68910800, + "step": 118685 + }, + { + "epoch": 17.677986297289245, + "grad_norm": 0.0927734375, + "learning_rate": 0.0012152013103717435, + "loss": 0.7971, + "num_input_tokens_seen": 68913616, + "step": 118690 + }, + { + "epoch": 17.678731009830205, + "grad_norm": 0.06787109375, + "learning_rate": 0.0012144327004993836, + "loss": 0.8331, + "num_input_tokens_seen": 68916336, + "step": 118695 + }, + { + "epoch": 17.679475722371166, + "grad_norm": 0.04296875, + "learning_rate": 0.0012136643235201704, + "loss": 0.7991, + "num_input_tokens_seen": 68919120, + "step": 118700 + }, + { + "epoch": 17.680220434912123, + "grad_norm": 0.08642578125, + "learning_rate": 0.0012128961794470882, + "loss": 0.7817, + "num_input_tokens_seen": 68921968, + "step": 118705 + }, + { + "epoch": 17.680965147453083, + "grad_norm": 0.11572265625, + "learning_rate": 0.0012121282682931117, + "loss": 0.8065, + "num_input_tokens_seen": 68924848, + "step": 118710 + }, + { + "epoch": 17.681709859994044, + "grad_norm": 0.04443359375, + "learning_rate": 0.0012113605900712155, + "loss": 0.784, + "num_input_tokens_seen": 68927696, + "step": 118715 + }, + { + "epoch": 17.682454572535, + "grad_norm": 0.041015625, + "learning_rate": 0.0012105931447943673, + "loss": 0.7943, + "num_input_tokens_seen": 68930352, + "step": 118720 + }, + { + "epoch": 17.68319928507596, + "grad_norm": 0.0419921875, + "learning_rate": 0.0012098259324755322, + "loss": 0.7933, + "num_input_tokens_seen": 68933424, + "step": 118725 + }, + { + "epoch": 17.683943997616918, + "grad_norm": 0.0771484375, + "learning_rate": 0.0012090589531276745, + "loss": 0.795, + "num_input_tokens_seen": 68936336, + "step": 118730 + }, + { + "epoch": 17.68468871015788, + "grad_norm": 0.059814453125, + "learning_rate": 0.0012082922067637458, + "loss": 0.7985, + "num_input_tokens_seen": 68939216, + "step": 118735 + }, + { + "epoch": 17.68543342269884, + "grad_norm": 0.051025390625, + "learning_rate": 0.0012075256933967071, + "loss": 0.7787, + "num_input_tokens_seen": 68942512, + "step": 118740 + }, + { + "epoch": 17.686178135239796, + "grad_norm": 0.046142578125, + "learning_rate": 0.0012067594130395032, + "loss": 0.799, + "num_input_tokens_seen": 68945200, + "step": 118745 + }, + { + "epoch": 17.686922847780757, + "grad_norm": 0.052001953125, + "learning_rate": 0.0012059933657050787, + "loss": 0.8229, + "num_input_tokens_seen": 68948272, + "step": 118750 + }, + { + "epoch": 17.687667560321717, + "grad_norm": 0.031494140625, + "learning_rate": 0.0012052275514063781, + "loss": 0.8104, + "num_input_tokens_seen": 68951472, + "step": 118755 + }, + { + "epoch": 17.688412272862674, + "grad_norm": 0.047607421875, + "learning_rate": 0.0012044619701563346, + "loss": 0.802, + "num_input_tokens_seen": 68954384, + "step": 118760 + }, + { + "epoch": 17.689156985403635, + "grad_norm": 0.039794921875, + "learning_rate": 0.001203696621967888, + "loss": 0.802, + "num_input_tokens_seen": 68957040, + "step": 118765 + }, + { + "epoch": 17.68990169794459, + "grad_norm": 0.048095703125, + "learning_rate": 0.0012029315068539624, + "loss": 0.7957, + "num_input_tokens_seen": 68959664, + "step": 118770 + }, + { + "epoch": 17.690646410485552, + "grad_norm": 0.054931640625, + "learning_rate": 0.001202166624827488, + "loss": 0.8419, + "num_input_tokens_seen": 68962576, + "step": 118775 + }, + { + "epoch": 17.691391123026513, + "grad_norm": 0.033935546875, + "learning_rate": 0.0012014019759013844, + "loss": 0.7848, + "num_input_tokens_seen": 68965712, + "step": 118780 + }, + { + "epoch": 17.69213583556747, + "grad_norm": 0.03369140625, + "learning_rate": 0.001200637560088571, + "loss": 0.7785, + "num_input_tokens_seen": 68968304, + "step": 118785 + }, + { + "epoch": 17.69288054810843, + "grad_norm": 0.05615234375, + "learning_rate": 0.0011998733774019592, + "loss": 0.811, + "num_input_tokens_seen": 68971344, + "step": 118790 + }, + { + "epoch": 17.69362526064939, + "grad_norm": 0.06298828125, + "learning_rate": 0.0011991094278544622, + "loss": 0.8017, + "num_input_tokens_seen": 68974032, + "step": 118795 + }, + { + "epoch": 17.694369973190348, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0011983457114589863, + "loss": 0.802, + "num_input_tokens_seen": 68977040, + "step": 118800 + }, + { + "epoch": 17.69511468573131, + "grad_norm": 0.036865234375, + "learning_rate": 0.0011975822282284292, + "loss": 0.7903, + "num_input_tokens_seen": 68979792, + "step": 118805 + }, + { + "epoch": 17.695859398272265, + "grad_norm": 0.04150390625, + "learning_rate": 0.0011968189781756944, + "loss": 0.8059, + "num_input_tokens_seen": 68982544, + "step": 118810 + }, + { + "epoch": 17.696604110813226, + "grad_norm": 0.0301513671875, + "learning_rate": 0.0011960559613136744, + "loss": 0.8058, + "num_input_tokens_seen": 68985360, + "step": 118815 + }, + { + "epoch": 17.697348823354186, + "grad_norm": 0.09130859375, + "learning_rate": 0.0011952931776552595, + "loss": 0.8016, + "num_input_tokens_seen": 68988208, + "step": 118820 + }, + { + "epoch": 17.698093535895143, + "grad_norm": 0.06005859375, + "learning_rate": 0.0011945306272133321, + "loss": 0.7924, + "num_input_tokens_seen": 68991280, + "step": 118825 + }, + { + "epoch": 17.698838248436104, + "grad_norm": 0.04150390625, + "learning_rate": 0.0011937683100007822, + "loss": 0.781, + "num_input_tokens_seen": 68994288, + "step": 118830 + }, + { + "epoch": 17.699582960977065, + "grad_norm": 0.04931640625, + "learning_rate": 0.0011930062260304846, + "loss": 0.8037, + "num_input_tokens_seen": 68997328, + "step": 118835 + }, + { + "epoch": 17.70032767351802, + "grad_norm": 0.05712890625, + "learning_rate": 0.0011922443753153106, + "loss": 0.7862, + "num_input_tokens_seen": 69000144, + "step": 118840 + }, + { + "epoch": 17.701072386058982, + "grad_norm": 0.05126953125, + "learning_rate": 0.001191482757868138, + "loss": 0.7856, + "num_input_tokens_seen": 69003152, + "step": 118845 + }, + { + "epoch": 17.70181709859994, + "grad_norm": 0.040771484375, + "learning_rate": 0.0011907213737018267, + "loss": 0.7948, + "num_input_tokens_seen": 69005904, + "step": 118850 + }, + { + "epoch": 17.7025618111409, + "grad_norm": 0.047607421875, + "learning_rate": 0.001189960222829245, + "loss": 0.7711, + "num_input_tokens_seen": 69008784, + "step": 118855 + }, + { + "epoch": 17.70330652368186, + "grad_norm": 0.06298828125, + "learning_rate": 0.0011891993052632471, + "loss": 0.7934, + "num_input_tokens_seen": 69011376, + "step": 118860 + }, + { + "epoch": 17.704051236222817, + "grad_norm": 0.05859375, + "learning_rate": 0.001188438621016693, + "loss": 0.7996, + "num_input_tokens_seen": 69014512, + "step": 118865 + }, + { + "epoch": 17.704795948763778, + "grad_norm": 0.035888671875, + "learning_rate": 0.0011876781701024308, + "loss": 0.798, + "num_input_tokens_seen": 69017648, + "step": 118870 + }, + { + "epoch": 17.705540661304738, + "grad_norm": 0.040771484375, + "learning_rate": 0.001186917952533305, + "loss": 0.7794, + "num_input_tokens_seen": 69020400, + "step": 118875 + }, + { + "epoch": 17.706285373845695, + "grad_norm": 0.056396484375, + "learning_rate": 0.001186157968322164, + "loss": 0.7923, + "num_input_tokens_seen": 69023216, + "step": 118880 + }, + { + "epoch": 17.707030086386656, + "grad_norm": 0.0419921875, + "learning_rate": 0.0011853982174818438, + "loss": 0.7883, + "num_input_tokens_seen": 69026192, + "step": 118885 + }, + { + "epoch": 17.707774798927613, + "grad_norm": 0.06640625, + "learning_rate": 0.0011846387000251813, + "loss": 0.7778, + "num_input_tokens_seen": 69028944, + "step": 118890 + }, + { + "epoch": 17.708519511468573, + "grad_norm": 0.095703125, + "learning_rate": 0.001183879415965004, + "loss": 0.8112, + "num_input_tokens_seen": 69031632, + "step": 118895 + }, + { + "epoch": 17.709264224009534, + "grad_norm": 0.040283203125, + "learning_rate": 0.0011831203653141437, + "loss": 0.7893, + "num_input_tokens_seen": 69034288, + "step": 118900 + }, + { + "epoch": 17.71000893655049, + "grad_norm": 0.022216796875, + "learning_rate": 0.00118236154808542, + "loss": 0.8125, + "num_input_tokens_seen": 69037136, + "step": 118905 + }, + { + "epoch": 17.71075364909145, + "grad_norm": 0.041748046875, + "learning_rate": 0.0011816029642916575, + "loss": 0.7913, + "num_input_tokens_seen": 69040048, + "step": 118910 + }, + { + "epoch": 17.711498361632408, + "grad_norm": 0.041015625, + "learning_rate": 0.0011808446139456663, + "loss": 0.7943, + "num_input_tokens_seen": 69042896, + "step": 118915 + }, + { + "epoch": 17.71224307417337, + "grad_norm": 0.05419921875, + "learning_rate": 0.0011800864970602592, + "loss": 0.8066, + "num_input_tokens_seen": 69046032, + "step": 118920 + }, + { + "epoch": 17.71298778671433, + "grad_norm": 0.06298828125, + "learning_rate": 0.0011793286136482478, + "loss": 0.7758, + "num_input_tokens_seen": 69048944, + "step": 118925 + }, + { + "epoch": 17.713732499255286, + "grad_norm": 0.06201171875, + "learning_rate": 0.00117857096372243, + "loss": 0.7965, + "num_input_tokens_seen": 69052080, + "step": 118930 + }, + { + "epoch": 17.714477211796247, + "grad_norm": 0.0380859375, + "learning_rate": 0.0011778135472956107, + "loss": 0.7931, + "num_input_tokens_seen": 69054960, + "step": 118935 + }, + { + "epoch": 17.715221924337207, + "grad_norm": 0.03271484375, + "learning_rate": 0.001177056364380583, + "loss": 0.7992, + "num_input_tokens_seen": 69058256, + "step": 118940 + }, + { + "epoch": 17.715966636878164, + "grad_norm": 0.042236328125, + "learning_rate": 0.0011762994149901384, + "loss": 0.7784, + "num_input_tokens_seen": 69061072, + "step": 118945 + }, + { + "epoch": 17.716711349419125, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0011755426991370633, + "loss": 0.7929, + "num_input_tokens_seen": 69064304, + "step": 118950 + }, + { + "epoch": 17.717456061960082, + "grad_norm": 0.052001953125, + "learning_rate": 0.0011747862168341471, + "loss": 0.7826, + "num_input_tokens_seen": 69067088, + "step": 118955 + }, + { + "epoch": 17.718200774501042, + "grad_norm": 0.05322265625, + "learning_rate": 0.0011740299680941652, + "loss": 0.7781, + "num_input_tokens_seen": 69069872, + "step": 118960 + }, + { + "epoch": 17.718945487042003, + "grad_norm": 0.037353515625, + "learning_rate": 0.0011732739529298935, + "loss": 0.7999, + "num_input_tokens_seen": 69072432, + "step": 118965 + }, + { + "epoch": 17.71969019958296, + "grad_norm": 0.03173828125, + "learning_rate": 0.0011725181713541071, + "loss": 0.8037, + "num_input_tokens_seen": 69075280, + "step": 118970 + }, + { + "epoch": 17.72043491212392, + "grad_norm": 0.0546875, + "learning_rate": 0.0011717626233795709, + "loss": 0.8458, + "num_input_tokens_seen": 69078192, + "step": 118975 + }, + { + "epoch": 17.72117962466488, + "grad_norm": 0.08837890625, + "learning_rate": 0.0011710073090190525, + "loss": 0.8064, + "num_input_tokens_seen": 69081392, + "step": 118980 + }, + { + "epoch": 17.721924337205838, + "grad_norm": 0.0517578125, + "learning_rate": 0.001170252228285309, + "loss": 0.8017, + "num_input_tokens_seen": 69084016, + "step": 118985 + }, + { + "epoch": 17.7226690497468, + "grad_norm": 0.050048828125, + "learning_rate": 0.0011694973811910995, + "loss": 0.801, + "num_input_tokens_seen": 69087088, + "step": 118990 + }, + { + "epoch": 17.723413762287755, + "grad_norm": 0.038818359375, + "learning_rate": 0.0011687427677491763, + "loss": 0.7831, + "num_input_tokens_seen": 69090064, + "step": 118995 + }, + { + "epoch": 17.724158474828716, + "grad_norm": 0.037353515625, + "learning_rate": 0.0011679883879722834, + "loss": 0.8016, + "num_input_tokens_seen": 69092976, + "step": 119000 + }, + { + "epoch": 17.724903187369677, + "grad_norm": 0.0380859375, + "learning_rate": 0.0011672342418731713, + "loss": 0.7956, + "num_input_tokens_seen": 69096240, + "step": 119005 + }, + { + "epoch": 17.725647899910633, + "grad_norm": 0.043701171875, + "learning_rate": 0.0011664803294645775, + "loss": 0.7934, + "num_input_tokens_seen": 69099248, + "step": 119010 + }, + { + "epoch": 17.726392612451594, + "grad_norm": 0.0546875, + "learning_rate": 0.0011657266507592373, + "loss": 0.8057, + "num_input_tokens_seen": 69102288, + "step": 119015 + }, + { + "epoch": 17.727137324992555, + "grad_norm": 0.0556640625, + "learning_rate": 0.0011649732057698835, + "loss": 0.8005, + "num_input_tokens_seen": 69105392, + "step": 119020 + }, + { + "epoch": 17.72788203753351, + "grad_norm": 0.046142578125, + "learning_rate": 0.0011642199945092497, + "loss": 0.8055, + "num_input_tokens_seen": 69108016, + "step": 119025 + }, + { + "epoch": 17.728626750074472, + "grad_norm": 0.0537109375, + "learning_rate": 0.0011634670169900535, + "loss": 0.7989, + "num_input_tokens_seen": 69110704, + "step": 119030 + }, + { + "epoch": 17.72937146261543, + "grad_norm": 0.04931640625, + "learning_rate": 0.0011627142732250216, + "loss": 0.7895, + "num_input_tokens_seen": 69113744, + "step": 119035 + }, + { + "epoch": 17.73011617515639, + "grad_norm": 0.083984375, + "learning_rate": 0.001161961763226869, + "loss": 0.7774, + "num_input_tokens_seen": 69116784, + "step": 119040 + }, + { + "epoch": 17.73086088769735, + "grad_norm": 0.03759765625, + "learning_rate": 0.001161209487008304, + "loss": 0.7865, + "num_input_tokens_seen": 69119856, + "step": 119045 + }, + { + "epoch": 17.731605600238307, + "grad_norm": 0.08349609375, + "learning_rate": 0.0011604574445820443, + "loss": 0.8092, + "num_input_tokens_seen": 69123120, + "step": 119050 + }, + { + "epoch": 17.732350312779268, + "grad_norm": 0.09814453125, + "learning_rate": 0.0011597056359607864, + "loss": 0.7825, + "num_input_tokens_seen": 69126096, + "step": 119055 + }, + { + "epoch": 17.733095025320225, + "grad_norm": 0.05224609375, + "learning_rate": 0.0011589540611572373, + "loss": 0.783, + "num_input_tokens_seen": 69128912, + "step": 119060 + }, + { + "epoch": 17.733839737861185, + "grad_norm": 0.06103515625, + "learning_rate": 0.00115820272018409, + "loss": 0.7952, + "num_input_tokens_seen": 69131920, + "step": 119065 + }, + { + "epoch": 17.734584450402146, + "grad_norm": 0.0654296875, + "learning_rate": 0.0011574516130540423, + "loss": 0.81, + "num_input_tokens_seen": 69134608, + "step": 119070 + }, + { + "epoch": 17.735329162943103, + "grad_norm": 0.05419921875, + "learning_rate": 0.0011567007397797811, + "loss": 0.7901, + "num_input_tokens_seen": 69137616, + "step": 119075 + }, + { + "epoch": 17.736073875484063, + "grad_norm": 0.040283203125, + "learning_rate": 0.0011559501003739897, + "loss": 0.7907, + "num_input_tokens_seen": 69140400, + "step": 119080 + }, + { + "epoch": 17.736818588025024, + "grad_norm": 0.056640625, + "learning_rate": 0.0011551996948493525, + "loss": 0.7915, + "num_input_tokens_seen": 69143248, + "step": 119085 + }, + { + "epoch": 17.73756330056598, + "grad_norm": 0.05078125, + "learning_rate": 0.0011544495232185413, + "loss": 0.8009, + "num_input_tokens_seen": 69146288, + "step": 119090 + }, + { + "epoch": 17.73830801310694, + "grad_norm": 0.035888671875, + "learning_rate": 0.0011536995854942377, + "loss": 0.8068, + "num_input_tokens_seen": 69149136, + "step": 119095 + }, + { + "epoch": 17.7390527256479, + "grad_norm": 0.027587890625, + "learning_rate": 0.0011529498816891031, + "loss": 0.7903, + "num_input_tokens_seen": 69151952, + "step": 119100 + }, + { + "epoch": 17.73979743818886, + "grad_norm": 0.07958984375, + "learning_rate": 0.001152200411815809, + "loss": 0.7873, + "num_input_tokens_seen": 69154576, + "step": 119105 + }, + { + "epoch": 17.74054215072982, + "grad_norm": 0.042724609375, + "learning_rate": 0.0011514511758870122, + "loss": 0.8059, + "num_input_tokens_seen": 69157680, + "step": 119110 + }, + { + "epoch": 17.741286863270776, + "grad_norm": 0.0458984375, + "learning_rate": 0.0011507021739153756, + "loss": 0.786, + "num_input_tokens_seen": 69160528, + "step": 119115 + }, + { + "epoch": 17.742031575811737, + "grad_norm": 0.0537109375, + "learning_rate": 0.0011499534059135508, + "loss": 0.8028, + "num_input_tokens_seen": 69163184, + "step": 119120 + }, + { + "epoch": 17.742776288352697, + "grad_norm": 0.053955078125, + "learning_rate": 0.0011492048718941827, + "loss": 0.8182, + "num_input_tokens_seen": 69166096, + "step": 119125 + }, + { + "epoch": 17.743521000893654, + "grad_norm": 0.0478515625, + "learning_rate": 0.0011484565718699246, + "loss": 0.7961, + "num_input_tokens_seen": 69169072, + "step": 119130 + }, + { + "epoch": 17.744265713434615, + "grad_norm": 0.03369140625, + "learning_rate": 0.0011477085058534113, + "loss": 0.7942, + "num_input_tokens_seen": 69171728, + "step": 119135 + }, + { + "epoch": 17.745010425975572, + "grad_norm": 0.041015625, + "learning_rate": 0.0011469606738572862, + "loss": 0.7929, + "num_input_tokens_seen": 69174352, + "step": 119140 + }, + { + "epoch": 17.745755138516532, + "grad_norm": 0.047607421875, + "learning_rate": 0.0011462130758941823, + "loss": 0.7899, + "num_input_tokens_seen": 69176816, + "step": 119145 + }, + { + "epoch": 17.746499851057493, + "grad_norm": 0.05078125, + "learning_rate": 0.0011454657119767263, + "loss": 0.7866, + "num_input_tokens_seen": 69179920, + "step": 119150 + }, + { + "epoch": 17.74724456359845, + "grad_norm": 0.056640625, + "learning_rate": 0.001144718582117548, + "loss": 0.7724, + "num_input_tokens_seen": 69182960, + "step": 119155 + }, + { + "epoch": 17.74798927613941, + "grad_norm": 0.059814453125, + "learning_rate": 0.0011439716863292627, + "loss": 0.7832, + "num_input_tokens_seen": 69186192, + "step": 119160 + }, + { + "epoch": 17.74873398868037, + "grad_norm": 0.056884765625, + "learning_rate": 0.0011432250246244979, + "loss": 0.7975, + "num_input_tokens_seen": 69188752, + "step": 119165 + }, + { + "epoch": 17.749478701221328, + "grad_norm": 0.031982421875, + "learning_rate": 0.001142478597015859, + "loss": 0.801, + "num_input_tokens_seen": 69192048, + "step": 119170 + }, + { + "epoch": 17.75022341376229, + "grad_norm": 0.05224609375, + "learning_rate": 0.0011417324035159625, + "loss": 0.8072, + "num_input_tokens_seen": 69194928, + "step": 119175 + }, + { + "epoch": 17.750968126303245, + "grad_norm": 0.07080078125, + "learning_rate": 0.00114098644413741, + "loss": 0.7895, + "num_input_tokens_seen": 69198064, + "step": 119180 + }, + { + "epoch": 17.751712838844206, + "grad_norm": 0.060546875, + "learning_rate": 0.0011402407188928082, + "loss": 0.8008, + "num_input_tokens_seen": 69200912, + "step": 119185 + }, + { + "epoch": 17.752457551385167, + "grad_norm": 0.028076171875, + "learning_rate": 0.0011394952277947517, + "loss": 0.8334, + "num_input_tokens_seen": 69203536, + "step": 119190 + }, + { + "epoch": 17.753202263926124, + "grad_norm": 0.0419921875, + "learning_rate": 0.0011387499708558374, + "loss": 0.7945, + "num_input_tokens_seen": 69206384, + "step": 119195 + }, + { + "epoch": 17.753946976467084, + "grad_norm": 0.05419921875, + "learning_rate": 0.0011380049480886533, + "loss": 0.7836, + "num_input_tokens_seen": 69209040, + "step": 119200 + }, + { + "epoch": 17.75469168900804, + "grad_norm": 0.055419921875, + "learning_rate": 0.0011372601595057861, + "loss": 0.7924, + "num_input_tokens_seen": 69212176, + "step": 119205 + }, + { + "epoch": 17.755436401549, + "grad_norm": 0.0294189453125, + "learning_rate": 0.001136515605119821, + "loss": 0.812, + "num_input_tokens_seen": 69214800, + "step": 119210 + }, + { + "epoch": 17.756181114089962, + "grad_norm": 0.0751953125, + "learning_rate": 0.001135771284943336, + "loss": 0.7823, + "num_input_tokens_seen": 69217584, + "step": 119215 + }, + { + "epoch": 17.75692582663092, + "grad_norm": 0.03955078125, + "learning_rate": 0.0011350271989889027, + "loss": 0.8027, + "num_input_tokens_seen": 69220592, + "step": 119220 + }, + { + "epoch": 17.75767053917188, + "grad_norm": 0.076171875, + "learning_rate": 0.0011342833472690894, + "loss": 0.7991, + "num_input_tokens_seen": 69223536, + "step": 119225 + }, + { + "epoch": 17.75841525171284, + "grad_norm": 0.044677734375, + "learning_rate": 0.0011335397297964711, + "loss": 0.7967, + "num_input_tokens_seen": 69226320, + "step": 119230 + }, + { + "epoch": 17.759159964253797, + "grad_norm": 0.041015625, + "learning_rate": 0.0011327963465836027, + "loss": 0.8014, + "num_input_tokens_seen": 69229360, + "step": 119235 + }, + { + "epoch": 17.759904676794758, + "grad_norm": 0.05029296875, + "learning_rate": 0.0011320531976430476, + "loss": 0.8001, + "num_input_tokens_seen": 69232368, + "step": 119240 + }, + { + "epoch": 17.76064938933572, + "grad_norm": 0.05810546875, + "learning_rate": 0.0011313102829873605, + "loss": 0.7989, + "num_input_tokens_seen": 69235792, + "step": 119245 + }, + { + "epoch": 17.761394101876675, + "grad_norm": 0.047119140625, + "learning_rate": 0.0011305676026290867, + "loss": 0.791, + "num_input_tokens_seen": 69238704, + "step": 119250 + }, + { + "epoch": 17.762138814417636, + "grad_norm": 0.04833984375, + "learning_rate": 0.001129825156580781, + "loss": 0.7985, + "num_input_tokens_seen": 69241744, + "step": 119255 + }, + { + "epoch": 17.762883526958593, + "grad_norm": 0.0517578125, + "learning_rate": 0.0011290829448549783, + "loss": 0.7905, + "num_input_tokens_seen": 69244528, + "step": 119260 + }, + { + "epoch": 17.763628239499553, + "grad_norm": 0.052978515625, + "learning_rate": 0.0011283409674642253, + "loss": 0.8012, + "num_input_tokens_seen": 69247440, + "step": 119265 + }, + { + "epoch": 17.764372952040514, + "grad_norm": 0.38671875, + "learning_rate": 0.001127599224421052, + "loss": 0.8286, + "num_input_tokens_seen": 69250256, + "step": 119270 + }, + { + "epoch": 17.76511766458147, + "grad_norm": 0.040771484375, + "learning_rate": 0.0011268577157379916, + "loss": 0.7986, + "num_input_tokens_seen": 69253104, + "step": 119275 + }, + { + "epoch": 17.76586237712243, + "grad_norm": 0.04150390625, + "learning_rate": 0.0011261164414275676, + "loss": 0.8013, + "num_input_tokens_seen": 69255792, + "step": 119280 + }, + { + "epoch": 17.76660708966339, + "grad_norm": 0.04736328125, + "learning_rate": 0.0011253754015023065, + "loss": 0.8108, + "num_input_tokens_seen": 69258768, + "step": 119285 + }, + { + "epoch": 17.76735180220435, + "grad_norm": 0.0286865234375, + "learning_rate": 0.0011246345959747283, + "loss": 0.7936, + "num_input_tokens_seen": 69261616, + "step": 119290 + }, + { + "epoch": 17.76809651474531, + "grad_norm": 0.054443359375, + "learning_rate": 0.0011238940248573415, + "loss": 0.8043, + "num_input_tokens_seen": 69264656, + "step": 119295 + }, + { + "epoch": 17.768841227286266, + "grad_norm": 0.0478515625, + "learning_rate": 0.0011231536881626657, + "loss": 0.7941, + "num_input_tokens_seen": 69267920, + "step": 119300 + }, + { + "epoch": 17.769585939827227, + "grad_norm": 0.0458984375, + "learning_rate": 0.001122413585903203, + "loss": 0.8003, + "num_input_tokens_seen": 69270896, + "step": 119305 + }, + { + "epoch": 17.770330652368187, + "grad_norm": 0.05419921875, + "learning_rate": 0.0011216737180914599, + "loss": 0.7909, + "num_input_tokens_seen": 69274096, + "step": 119310 + }, + { + "epoch": 17.771075364909144, + "grad_norm": 0.052001953125, + "learning_rate": 0.0011209340847399313, + "loss": 0.7867, + "num_input_tokens_seen": 69276880, + "step": 119315 + }, + { + "epoch": 17.771820077450105, + "grad_norm": 0.06689453125, + "learning_rate": 0.001120194685861119, + "loss": 0.7827, + "num_input_tokens_seen": 69279856, + "step": 119320 + }, + { + "epoch": 17.772564789991062, + "grad_norm": 0.047119140625, + "learning_rate": 0.0011194555214675095, + "loss": 0.7805, + "num_input_tokens_seen": 69282832, + "step": 119325 + }, + { + "epoch": 17.773309502532022, + "grad_norm": 0.062255859375, + "learning_rate": 0.0011187165915715895, + "loss": 0.7576, + "num_input_tokens_seen": 69285552, + "step": 119330 + }, + { + "epoch": 17.774054215072983, + "grad_norm": 0.052978515625, + "learning_rate": 0.001117977896185846, + "loss": 0.8153, + "num_input_tokens_seen": 69288464, + "step": 119335 + }, + { + "epoch": 17.77479892761394, + "grad_norm": 0.05322265625, + "learning_rate": 0.0011172394353227589, + "loss": 0.8054, + "num_input_tokens_seen": 69291280, + "step": 119340 + }, + { + "epoch": 17.7755436401549, + "grad_norm": 0.04736328125, + "learning_rate": 0.0011165012089947995, + "loss": 0.8017, + "num_input_tokens_seen": 69294320, + "step": 119345 + }, + { + "epoch": 17.77628835269586, + "grad_norm": 0.05810546875, + "learning_rate": 0.0011157632172144399, + "loss": 0.792, + "num_input_tokens_seen": 69297296, + "step": 119350 + }, + { + "epoch": 17.777033065236818, + "grad_norm": 0.06884765625, + "learning_rate": 0.0011150254599941517, + "loss": 0.8009, + "num_input_tokens_seen": 69300528, + "step": 119355 + }, + { + "epoch": 17.77777777777778, + "grad_norm": 0.03955078125, + "learning_rate": 0.0011142879373463965, + "loss": 0.7885, + "num_input_tokens_seen": 69303344, + "step": 119360 + }, + { + "epoch": 17.778522490318736, + "grad_norm": 0.0419921875, + "learning_rate": 0.0011135506492836328, + "loss": 0.7984, + "num_input_tokens_seen": 69306000, + "step": 119365 + }, + { + "epoch": 17.779267202859696, + "grad_norm": 0.059326171875, + "learning_rate": 0.001112813595818317, + "loss": 0.8073, + "num_input_tokens_seen": 69309008, + "step": 119370 + }, + { + "epoch": 17.780011915400657, + "grad_norm": 0.080078125, + "learning_rate": 0.0011120767769629014, + "loss": 0.7882, + "num_input_tokens_seen": 69311888, + "step": 119375 + }, + { + "epoch": 17.780756627941614, + "grad_norm": 0.049072265625, + "learning_rate": 0.0011113401927298338, + "loss": 0.787, + "num_input_tokens_seen": 69314768, + "step": 119380 + }, + { + "epoch": 17.781501340482574, + "grad_norm": 0.07470703125, + "learning_rate": 0.0011106038431315563, + "loss": 0.7764, + "num_input_tokens_seen": 69317808, + "step": 119385 + }, + { + "epoch": 17.782246053023535, + "grad_norm": 0.04052734375, + "learning_rate": 0.0011098677281805136, + "loss": 0.8023, + "num_input_tokens_seen": 69320784, + "step": 119390 + }, + { + "epoch": 17.78299076556449, + "grad_norm": 0.041015625, + "learning_rate": 0.0011091318478891343, + "loss": 0.7762, + "num_input_tokens_seen": 69323472, + "step": 119395 + }, + { + "epoch": 17.783735478105452, + "grad_norm": 0.06982421875, + "learning_rate": 0.0011083962022698584, + "loss": 0.8082, + "num_input_tokens_seen": 69326352, + "step": 119400 + }, + { + "epoch": 17.78448019064641, + "grad_norm": 0.05126953125, + "learning_rate": 0.0011076607913351093, + "loss": 0.8238, + "num_input_tokens_seen": 69329136, + "step": 119405 + }, + { + "epoch": 17.78522490318737, + "grad_norm": 0.0634765625, + "learning_rate": 0.0011069256150973105, + "loss": 0.8007, + "num_input_tokens_seen": 69331952, + "step": 119410 + }, + { + "epoch": 17.78596961572833, + "grad_norm": 0.03515625, + "learning_rate": 0.0011061906735688853, + "loss": 0.8286, + "num_input_tokens_seen": 69334768, + "step": 119415 + }, + { + "epoch": 17.786714328269287, + "grad_norm": 0.07275390625, + "learning_rate": 0.0011054559667622438, + "loss": 0.8106, + "num_input_tokens_seen": 69337552, + "step": 119420 + }, + { + "epoch": 17.787459040810248, + "grad_norm": 0.04443359375, + "learning_rate": 0.0011047214946898042, + "loss": 0.789, + "num_input_tokens_seen": 69340656, + "step": 119425 + }, + { + "epoch": 17.788203753351205, + "grad_norm": 0.03076171875, + "learning_rate": 0.0011039872573639703, + "loss": 0.8079, + "num_input_tokens_seen": 69343536, + "step": 119430 + }, + { + "epoch": 17.788948465892165, + "grad_norm": 0.033935546875, + "learning_rate": 0.0011032532547971502, + "loss": 0.8323, + "num_input_tokens_seen": 69346480, + "step": 119435 + }, + { + "epoch": 17.789693178433126, + "grad_norm": 0.04541015625, + "learning_rate": 0.001102519487001744, + "loss": 0.7938, + "num_input_tokens_seen": 69349584, + "step": 119440 + }, + { + "epoch": 17.790437890974083, + "grad_norm": 0.049072265625, + "learning_rate": 0.001101785953990142, + "loss": 0.8231, + "num_input_tokens_seen": 69352688, + "step": 119445 + }, + { + "epoch": 17.791182603515043, + "grad_norm": 0.040771484375, + "learning_rate": 0.0011010526557747441, + "loss": 0.7999, + "num_input_tokens_seen": 69355504, + "step": 119450 + }, + { + "epoch": 17.791927316056004, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0011003195923679338, + "loss": 0.7852, + "num_input_tokens_seen": 69358192, + "step": 119455 + }, + { + "epoch": 17.79267202859696, + "grad_norm": 0.0556640625, + "learning_rate": 0.0010995867637820977, + "loss": 0.8306, + "num_input_tokens_seen": 69360880, + "step": 119460 + }, + { + "epoch": 17.79341674113792, + "grad_norm": 0.053955078125, + "learning_rate": 0.0010988541700296145, + "loss": 0.7797, + "num_input_tokens_seen": 69363696, + "step": 119465 + }, + { + "epoch": 17.79416145367888, + "grad_norm": 0.0576171875, + "learning_rate": 0.0010981218111228623, + "loss": 0.8107, + "num_input_tokens_seen": 69366512, + "step": 119470 + }, + { + "epoch": 17.79490616621984, + "grad_norm": 0.05029296875, + "learning_rate": 0.001097389687074215, + "loss": 0.815, + "num_input_tokens_seen": 69369296, + "step": 119475 + }, + { + "epoch": 17.7956508787608, + "grad_norm": 0.037841796875, + "learning_rate": 0.0010966577978960372, + "loss": 0.7957, + "num_input_tokens_seen": 69372080, + "step": 119480 + }, + { + "epoch": 17.796395591301756, + "grad_norm": 0.037109375, + "learning_rate": 0.0010959261436006961, + "loss": 0.7746, + "num_input_tokens_seen": 69374672, + "step": 119485 + }, + { + "epoch": 17.797140303842717, + "grad_norm": 0.049560546875, + "learning_rate": 0.0010951947242005483, + "loss": 0.7828, + "num_input_tokens_seen": 69377584, + "step": 119490 + }, + { + "epoch": 17.797885016383677, + "grad_norm": 0.03271484375, + "learning_rate": 0.001094463539707957, + "loss": 0.8097, + "num_input_tokens_seen": 69380240, + "step": 119495 + }, + { + "epoch": 17.798629728924634, + "grad_norm": 0.056396484375, + "learning_rate": 0.001093732590135268, + "loss": 0.7784, + "num_input_tokens_seen": 69382992, + "step": 119500 + }, + { + "epoch": 17.799374441465595, + "grad_norm": 0.0625, + "learning_rate": 0.0010930018754948356, + "loss": 0.8041, + "num_input_tokens_seen": 69385584, + "step": 119505 + }, + { + "epoch": 17.800119154006552, + "grad_norm": 0.0673828125, + "learning_rate": 0.0010922713957989987, + "loss": 0.7717, + "num_input_tokens_seen": 69388720, + "step": 119510 + }, + { + "epoch": 17.800863866547513, + "grad_norm": 0.0400390625, + "learning_rate": 0.0010915411510601057, + "loss": 0.7865, + "num_input_tokens_seen": 69391920, + "step": 119515 + }, + { + "epoch": 17.801608579088473, + "grad_norm": 0.041015625, + "learning_rate": 0.0010908111412904885, + "loss": 0.8015, + "num_input_tokens_seen": 69394576, + "step": 119520 + }, + { + "epoch": 17.80235329162943, + "grad_norm": 0.034912109375, + "learning_rate": 0.0010900813665024771, + "loss": 0.8039, + "num_input_tokens_seen": 69397424, + "step": 119525 + }, + { + "epoch": 17.80309800417039, + "grad_norm": 0.06298828125, + "learning_rate": 0.0010893518267084067, + "loss": 0.7924, + "num_input_tokens_seen": 69400400, + "step": 119530 + }, + { + "epoch": 17.80384271671135, + "grad_norm": 0.0419921875, + "learning_rate": 0.0010886225219205957, + "loss": 0.7796, + "num_input_tokens_seen": 69403152, + "step": 119535 + }, + { + "epoch": 17.804587429252308, + "grad_norm": 0.064453125, + "learning_rate": 0.0010878934521513727, + "loss": 0.7798, + "num_input_tokens_seen": 69406064, + "step": 119540 + }, + { + "epoch": 17.80533214179327, + "grad_norm": 0.0400390625, + "learning_rate": 0.0010871646174130478, + "loss": 0.7921, + "num_input_tokens_seen": 69408720, + "step": 119545 + }, + { + "epoch": 17.806076854334226, + "grad_norm": 0.03466796875, + "learning_rate": 0.0010864360177179377, + "loss": 0.7942, + "num_input_tokens_seen": 69411600, + "step": 119550 + }, + { + "epoch": 17.806821566875186, + "grad_norm": 0.06396484375, + "learning_rate": 0.0010857076530783477, + "loss": 0.8057, + "num_input_tokens_seen": 69414416, + "step": 119555 + }, + { + "epoch": 17.807566279416147, + "grad_norm": 0.048828125, + "learning_rate": 0.001084979523506588, + "loss": 0.7799, + "num_input_tokens_seen": 69417424, + "step": 119560 + }, + { + "epoch": 17.808310991957104, + "grad_norm": 0.04833984375, + "learning_rate": 0.001084251629014955, + "loss": 0.7955, + "num_input_tokens_seen": 69420688, + "step": 119565 + }, + { + "epoch": 17.809055704498064, + "grad_norm": 0.04345703125, + "learning_rate": 0.001083523969615746, + "loss": 0.8095, + "num_input_tokens_seen": 69423376, + "step": 119570 + }, + { + "epoch": 17.80980041703902, + "grad_norm": 0.037353515625, + "learning_rate": 0.0010827965453212578, + "loss": 0.8052, + "num_input_tokens_seen": 69426128, + "step": 119575 + }, + { + "epoch": 17.81054512957998, + "grad_norm": 0.02490234375, + "learning_rate": 0.0010820693561437737, + "loss": 0.8013, + "num_input_tokens_seen": 69429040, + "step": 119580 + }, + { + "epoch": 17.811289842120942, + "grad_norm": 0.09765625, + "learning_rate": 0.0010813424020955859, + "loss": 0.8061, + "num_input_tokens_seen": 69431856, + "step": 119585 + }, + { + "epoch": 17.8120345546619, + "grad_norm": 0.038818359375, + "learning_rate": 0.001080615683188969, + "loss": 0.7943, + "num_input_tokens_seen": 69434768, + "step": 119590 + }, + { + "epoch": 17.81277926720286, + "grad_norm": 0.04052734375, + "learning_rate": 0.0010798891994362053, + "loss": 0.7843, + "num_input_tokens_seen": 69437456, + "step": 119595 + }, + { + "epoch": 17.81352397974382, + "grad_norm": 0.05908203125, + "learning_rate": 0.0010791629508495664, + "loss": 0.8244, + "num_input_tokens_seen": 69440432, + "step": 119600 + }, + { + "epoch": 17.814268692284777, + "grad_norm": 0.0380859375, + "learning_rate": 0.001078436937441321, + "loss": 0.7744, + "num_input_tokens_seen": 69443472, + "step": 119605 + }, + { + "epoch": 17.815013404825738, + "grad_norm": 0.0673828125, + "learning_rate": 0.0010777111592237308, + "loss": 0.7922, + "num_input_tokens_seen": 69446736, + "step": 119610 + }, + { + "epoch": 17.815758117366695, + "grad_norm": 0.044189453125, + "learning_rate": 0.0010769856162090646, + "loss": 0.8017, + "num_input_tokens_seen": 69449520, + "step": 119615 + }, + { + "epoch": 17.816502829907655, + "grad_norm": 0.033203125, + "learning_rate": 0.0010762603084095755, + "loss": 0.8128, + "num_input_tokens_seen": 69452432, + "step": 119620 + }, + { + "epoch": 17.817247542448616, + "grad_norm": 0.0517578125, + "learning_rate": 0.001075535235837514, + "loss": 0.7789, + "num_input_tokens_seen": 69455152, + "step": 119625 + }, + { + "epoch": 17.817992254989573, + "grad_norm": 0.049072265625, + "learning_rate": 0.0010748103985051339, + "loss": 0.7967, + "num_input_tokens_seen": 69457936, + "step": 119630 + }, + { + "epoch": 17.818736967530533, + "grad_norm": 0.051025390625, + "learning_rate": 0.001074085796424678, + "loss": 0.785, + "num_input_tokens_seen": 69460912, + "step": 119635 + }, + { + "epoch": 17.819481680071494, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0010733614296083922, + "loss": 0.7976, + "num_input_tokens_seen": 69463472, + "step": 119640 + }, + { + "epoch": 17.82022639261245, + "grad_norm": 0.034912109375, + "learning_rate": 0.0010726372980685095, + "loss": 0.7985, + "num_input_tokens_seen": 69466384, + "step": 119645 + }, + { + "epoch": 17.82097110515341, + "grad_norm": 0.039794921875, + "learning_rate": 0.001071913401817262, + "loss": 0.7947, + "num_input_tokens_seen": 69469392, + "step": 119650 + }, + { + "epoch": 17.82171581769437, + "grad_norm": 0.055419921875, + "learning_rate": 0.0010711897408668834, + "loss": 0.8, + "num_input_tokens_seen": 69472272, + "step": 119655 + }, + { + "epoch": 17.82246053023533, + "grad_norm": 0.04638671875, + "learning_rate": 0.0010704663152295956, + "loss": 0.7937, + "num_input_tokens_seen": 69475696, + "step": 119660 + }, + { + "epoch": 17.82320524277629, + "grad_norm": 0.0654296875, + "learning_rate": 0.0010697431249176252, + "loss": 0.793, + "num_input_tokens_seen": 69478704, + "step": 119665 + }, + { + "epoch": 17.823949955317246, + "grad_norm": 0.044921875, + "learning_rate": 0.0010690201699431862, + "loss": 0.7935, + "num_input_tokens_seen": 69481648, + "step": 119670 + }, + { + "epoch": 17.824694667858207, + "grad_norm": 0.0260009765625, + "learning_rate": 0.0010682974503184934, + "loss": 0.7906, + "num_input_tokens_seen": 69484400, + "step": 119675 + }, + { + "epoch": 17.825439380399168, + "grad_norm": 0.064453125, + "learning_rate": 0.0010675749660557553, + "loss": 0.8039, + "num_input_tokens_seen": 69487312, + "step": 119680 + }, + { + "epoch": 17.826184092940125, + "grad_norm": 0.042724609375, + "learning_rate": 0.0010668527171671742, + "loss": 0.8101, + "num_input_tokens_seen": 69490256, + "step": 119685 + }, + { + "epoch": 17.826928805481085, + "grad_norm": 0.07421875, + "learning_rate": 0.0010661307036649585, + "loss": 0.8123, + "num_input_tokens_seen": 69493072, + "step": 119690 + }, + { + "epoch": 17.827673518022042, + "grad_norm": 0.052978515625, + "learning_rate": 0.0010654089255613002, + "loss": 0.8012, + "num_input_tokens_seen": 69495696, + "step": 119695 + }, + { + "epoch": 17.828418230563003, + "grad_norm": 0.2197265625, + "learning_rate": 0.0010646873828683995, + "loss": 0.8326, + "num_input_tokens_seen": 69498576, + "step": 119700 + }, + { + "epoch": 17.829162943103963, + "grad_norm": 0.031494140625, + "learning_rate": 0.001063966075598438, + "loss": 0.7948, + "num_input_tokens_seen": 69501648, + "step": 119705 + }, + { + "epoch": 17.82990765564492, + "grad_norm": 0.033447265625, + "learning_rate": 0.0010632450037636099, + "loss": 0.7903, + "num_input_tokens_seen": 69504688, + "step": 119710 + }, + { + "epoch": 17.83065236818588, + "grad_norm": 0.0498046875, + "learning_rate": 0.0010625241673760883, + "loss": 0.7991, + "num_input_tokens_seen": 69507536, + "step": 119715 + }, + { + "epoch": 17.83139708072684, + "grad_norm": 0.030029296875, + "learning_rate": 0.0010618035664480584, + "loss": 0.7962, + "num_input_tokens_seen": 69510544, + "step": 119720 + }, + { + "epoch": 17.832141793267798, + "grad_norm": 0.0498046875, + "learning_rate": 0.0010610832009916927, + "loss": 0.7974, + "num_input_tokens_seen": 69513456, + "step": 119725 + }, + { + "epoch": 17.83288650580876, + "grad_norm": 0.0634765625, + "learning_rate": 0.0010603630710191558, + "loss": 0.8124, + "num_input_tokens_seen": 69516304, + "step": 119730 + }, + { + "epoch": 17.833631218349716, + "grad_norm": 0.0281982421875, + "learning_rate": 0.00105964317654262, + "loss": 0.7937, + "num_input_tokens_seen": 69519088, + "step": 119735 + }, + { + "epoch": 17.834375930890676, + "grad_norm": 0.06689453125, + "learning_rate": 0.001058923517574244, + "loss": 0.805, + "num_input_tokens_seen": 69521936, + "step": 119740 + }, + { + "epoch": 17.835120643431637, + "grad_norm": 0.03955078125, + "learning_rate": 0.001058204094126186, + "loss": 0.7948, + "num_input_tokens_seen": 69525104, + "step": 119745 + }, + { + "epoch": 17.835865355972594, + "grad_norm": 0.049072265625, + "learning_rate": 0.0010574849062105983, + "loss": 0.7955, + "num_input_tokens_seen": 69528112, + "step": 119750 + }, + { + "epoch": 17.836610068513554, + "grad_norm": 0.03955078125, + "learning_rate": 0.0010567659538396345, + "loss": 0.7884, + "num_input_tokens_seen": 69530608, + "step": 119755 + }, + { + "epoch": 17.837354781054515, + "grad_norm": 0.03759765625, + "learning_rate": 0.0010560472370254382, + "loss": 0.8023, + "num_input_tokens_seen": 69533680, + "step": 119760 + }, + { + "epoch": 17.83809949359547, + "grad_norm": 0.06640625, + "learning_rate": 0.0010553287557801493, + "loss": 0.8093, + "num_input_tokens_seen": 69536400, + "step": 119765 + }, + { + "epoch": 17.838844206136432, + "grad_norm": 0.053466796875, + "learning_rate": 0.001054610510115912, + "loss": 0.7839, + "num_input_tokens_seen": 69539216, + "step": 119770 + }, + { + "epoch": 17.83958891867739, + "grad_norm": 0.078125, + "learning_rate": 0.0010538925000448528, + "loss": 0.8251, + "num_input_tokens_seen": 69542096, + "step": 119775 + }, + { + "epoch": 17.84033363121835, + "grad_norm": 0.036376953125, + "learning_rate": 0.001053174725579109, + "loss": 0.7915, + "num_input_tokens_seen": 69544912, + "step": 119780 + }, + { + "epoch": 17.84107834375931, + "grad_norm": 0.0361328125, + "learning_rate": 0.0010524571867308007, + "loss": 0.8059, + "num_input_tokens_seen": 69547760, + "step": 119785 + }, + { + "epoch": 17.841823056300267, + "grad_norm": 0.0791015625, + "learning_rate": 0.001051739883512055, + "loss": 0.8094, + "num_input_tokens_seen": 69550640, + "step": 119790 + }, + { + "epoch": 17.842567768841228, + "grad_norm": 0.0400390625, + "learning_rate": 0.0010510228159349854, + "loss": 0.7988, + "num_input_tokens_seen": 69553520, + "step": 119795 + }, + { + "epoch": 17.843312481382185, + "grad_norm": 0.0263671875, + "learning_rate": 0.0010503059840117107, + "loss": 0.805, + "num_input_tokens_seen": 69556560, + "step": 119800 + }, + { + "epoch": 17.844057193923145, + "grad_norm": 0.044677734375, + "learning_rate": 0.0010495893877543376, + "loss": 0.8069, + "num_input_tokens_seen": 69559472, + "step": 119805 + }, + { + "epoch": 17.844801906464106, + "grad_norm": 0.054931640625, + "learning_rate": 0.0010488730271749752, + "loss": 0.8118, + "num_input_tokens_seen": 69562416, + "step": 119810 + }, + { + "epoch": 17.845546619005063, + "grad_norm": 0.07177734375, + "learning_rate": 0.0010481569022857218, + "loss": 0.7748, + "num_input_tokens_seen": 69565328, + "step": 119815 + }, + { + "epoch": 17.846291331546023, + "grad_norm": 0.060546875, + "learning_rate": 0.001047441013098676, + "loss": 0.8027, + "num_input_tokens_seen": 69568208, + "step": 119820 + }, + { + "epoch": 17.847036044086984, + "grad_norm": 0.039794921875, + "learning_rate": 0.001046725359625935, + "loss": 0.7984, + "num_input_tokens_seen": 69571344, + "step": 119825 + }, + { + "epoch": 17.84778075662794, + "grad_norm": 0.0751953125, + "learning_rate": 0.001046009941879586, + "loss": 0.7911, + "num_input_tokens_seen": 69574288, + "step": 119830 + }, + { + "epoch": 17.8485254691689, + "grad_norm": 0.056396484375, + "learning_rate": 0.0010452947598717187, + "loss": 0.8098, + "num_input_tokens_seen": 69577296, + "step": 119835 + }, + { + "epoch": 17.84927018170986, + "grad_norm": 0.078125, + "learning_rate": 0.0010445798136144123, + "loss": 0.797, + "num_input_tokens_seen": 69580144, + "step": 119840 + }, + { + "epoch": 17.85001489425082, + "grad_norm": 0.054443359375, + "learning_rate": 0.001043865103119747, + "loss": 0.8031, + "num_input_tokens_seen": 69582992, + "step": 119845 + }, + { + "epoch": 17.85075960679178, + "grad_norm": 0.054931640625, + "learning_rate": 0.0010431506283997982, + "loss": 0.8048, + "num_input_tokens_seen": 69586032, + "step": 119850 + }, + { + "epoch": 17.851504319332737, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0010424363894666311, + "loss": 0.7907, + "num_input_tokens_seen": 69589040, + "step": 119855 + }, + { + "epoch": 17.852249031873697, + "grad_norm": 0.0458984375, + "learning_rate": 0.0010417223863323176, + "loss": 0.7938, + "num_input_tokens_seen": 69591920, + "step": 119860 + }, + { + "epoch": 17.852993744414658, + "grad_norm": 0.037841796875, + "learning_rate": 0.001041008619008915, + "loss": 0.7928, + "num_input_tokens_seen": 69594640, + "step": 119865 + }, + { + "epoch": 17.853738456955615, + "grad_norm": 0.060302734375, + "learning_rate": 0.0010402950875084871, + "loss": 0.7919, + "num_input_tokens_seen": 69597616, + "step": 119870 + }, + { + "epoch": 17.854483169496575, + "grad_norm": 0.040771484375, + "learning_rate": 0.0010395817918430871, + "loss": 0.794, + "num_input_tokens_seen": 69600368, + "step": 119875 + }, + { + "epoch": 17.855227882037532, + "grad_norm": 0.04541015625, + "learning_rate": 0.0010388687320247625, + "loss": 0.8135, + "num_input_tokens_seen": 69603312, + "step": 119880 + }, + { + "epoch": 17.855972594578493, + "grad_norm": 0.05859375, + "learning_rate": 0.0010381559080655616, + "loss": 0.821, + "num_input_tokens_seen": 69606064, + "step": 119885 + }, + { + "epoch": 17.856717307119453, + "grad_norm": 0.033935546875, + "learning_rate": 0.001037443319977525, + "loss": 0.8056, + "num_input_tokens_seen": 69608976, + "step": 119890 + }, + { + "epoch": 17.85746201966041, + "grad_norm": 0.036865234375, + "learning_rate": 0.0010367309677726932, + "loss": 0.7945, + "num_input_tokens_seen": 69611984, + "step": 119895 + }, + { + "epoch": 17.85820673220137, + "grad_norm": 0.07568359375, + "learning_rate": 0.0010360188514630996, + "loss": 0.8036, + "num_input_tokens_seen": 69614704, + "step": 119900 + }, + { + "epoch": 17.85895144474233, + "grad_norm": 0.048095703125, + "learning_rate": 0.0010353069710607764, + "loss": 0.7988, + "num_input_tokens_seen": 69617488, + "step": 119905 + }, + { + "epoch": 17.859696157283288, + "grad_norm": 0.04833984375, + "learning_rate": 0.0010345953265777457, + "loss": 0.7851, + "num_input_tokens_seen": 69620368, + "step": 119910 + }, + { + "epoch": 17.86044086982425, + "grad_norm": 0.0673828125, + "learning_rate": 0.0010338839180260362, + "loss": 0.7868, + "num_input_tokens_seen": 69623472, + "step": 119915 + }, + { + "epoch": 17.861185582365206, + "grad_norm": 0.055908203125, + "learning_rate": 0.0010331727454176614, + "loss": 0.8027, + "num_input_tokens_seen": 69626192, + "step": 119920 + }, + { + "epoch": 17.861930294906166, + "grad_norm": 0.06298828125, + "learning_rate": 0.0010324618087646403, + "loss": 0.8055, + "num_input_tokens_seen": 69629200, + "step": 119925 + }, + { + "epoch": 17.862675007447127, + "grad_norm": 0.04052734375, + "learning_rate": 0.0010317511080789816, + "loss": 0.7951, + "num_input_tokens_seen": 69632112, + "step": 119930 + }, + { + "epoch": 17.863419719988084, + "grad_norm": 0.04150390625, + "learning_rate": 0.0010310406433726887, + "loss": 0.7828, + "num_input_tokens_seen": 69635088, + "step": 119935 + }, + { + "epoch": 17.864164432529044, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0010303304146577674, + "loss": 0.7965, + "num_input_tokens_seen": 69638160, + "step": 119940 + }, + { + "epoch": 17.86490914507, + "grad_norm": 0.03515625, + "learning_rate": 0.0010296204219462162, + "loss": 0.8046, + "num_input_tokens_seen": 69640752, + "step": 119945 + }, + { + "epoch": 17.865653857610962, + "grad_norm": 0.03173828125, + "learning_rate": 0.001028910665250029, + "loss": 0.7899, + "num_input_tokens_seen": 69643824, + "step": 119950 + }, + { + "epoch": 17.866398570151922, + "grad_norm": 0.03564453125, + "learning_rate": 0.0010282011445811945, + "loss": 0.7906, + "num_input_tokens_seen": 69646768, + "step": 119955 + }, + { + "epoch": 17.86714328269288, + "grad_norm": 0.06689453125, + "learning_rate": 0.0010274918599517046, + "loss": 0.7791, + "num_input_tokens_seen": 69649552, + "step": 119960 + }, + { + "epoch": 17.86788799523384, + "grad_norm": 0.062255859375, + "learning_rate": 0.0010267828113735367, + "loss": 0.794, + "num_input_tokens_seen": 69652400, + "step": 119965 + }, + { + "epoch": 17.8686327077748, + "grad_norm": 0.04150390625, + "learning_rate": 0.0010260739988586709, + "loss": 0.7986, + "num_input_tokens_seen": 69655120, + "step": 119970 + }, + { + "epoch": 17.869377420315757, + "grad_norm": 0.056884765625, + "learning_rate": 0.0010253654224190844, + "loss": 0.8078, + "num_input_tokens_seen": 69658480, + "step": 119975 + }, + { + "epoch": 17.870122132856718, + "grad_norm": 0.050048828125, + "learning_rate": 0.0010246570820667427, + "loss": 0.7919, + "num_input_tokens_seen": 69661552, + "step": 119980 + }, + { + "epoch": 17.870866845397675, + "grad_norm": 0.09423828125, + "learning_rate": 0.0010239489778136196, + "loss": 0.7789, + "num_input_tokens_seen": 69664368, + "step": 119985 + }, + { + "epoch": 17.871611557938635, + "grad_norm": 0.05712890625, + "learning_rate": 0.0010232411096716704, + "loss": 0.8048, + "num_input_tokens_seen": 69667248, + "step": 119990 + }, + { + "epoch": 17.872356270479596, + "grad_norm": 0.07080078125, + "learning_rate": 0.0010225334776528606, + "loss": 0.7894, + "num_input_tokens_seen": 69670160, + "step": 119995 + }, + { + "epoch": 17.873100983020553, + "grad_norm": 0.031982421875, + "learning_rate": 0.0010218260817691405, + "loss": 0.8172, + "num_input_tokens_seen": 69673008, + "step": 120000 + }, + { + "epoch": 17.873845695561513, + "grad_norm": 0.044677734375, + "learning_rate": 0.0010211189220324622, + "loss": 0.8309, + "num_input_tokens_seen": 69675888, + "step": 120005 + }, + { + "epoch": 17.874590408102474, + "grad_norm": 0.158203125, + "learning_rate": 0.0010204119984547715, + "loss": 0.8227, + "num_input_tokens_seen": 69678608, + "step": 120010 + }, + { + "epoch": 17.87533512064343, + "grad_norm": 0.06591796875, + "learning_rate": 0.0010197053110480119, + "loss": 0.7756, + "num_input_tokens_seen": 69681648, + "step": 120015 + }, + { + "epoch": 17.87607983318439, + "grad_norm": 0.0751953125, + "learning_rate": 0.0010189988598241222, + "loss": 0.7667, + "num_input_tokens_seen": 69684560, + "step": 120020 + }, + { + "epoch": 17.87682454572535, + "grad_norm": 0.03662109375, + "learning_rate": 0.0010182926447950362, + "loss": 0.8307, + "num_input_tokens_seen": 69687696, + "step": 120025 + }, + { + "epoch": 17.87756925826631, + "grad_norm": 0.04052734375, + "learning_rate": 0.001017586665972686, + "loss": 0.784, + "num_input_tokens_seen": 69690448, + "step": 120030 + }, + { + "epoch": 17.87831397080727, + "grad_norm": 0.04736328125, + "learning_rate": 0.0010168809233689973, + "loss": 0.7899, + "num_input_tokens_seen": 69693424, + "step": 120035 + }, + { + "epoch": 17.879058683348227, + "grad_norm": 0.044921875, + "learning_rate": 0.001016175416995897, + "loss": 0.7867, + "num_input_tokens_seen": 69696336, + "step": 120040 + }, + { + "epoch": 17.879803395889187, + "grad_norm": 0.04541015625, + "learning_rate": 0.0010154701468652987, + "loss": 0.7825, + "num_input_tokens_seen": 69699376, + "step": 120045 + }, + { + "epoch": 17.880548108430148, + "grad_norm": 0.0439453125, + "learning_rate": 0.0010147651129891167, + "loss": 0.7949, + "num_input_tokens_seen": 69702128, + "step": 120050 + }, + { + "epoch": 17.881292820971105, + "grad_norm": 0.06689453125, + "learning_rate": 0.0010140603153792676, + "loss": 0.8002, + "num_input_tokens_seen": 69704880, + "step": 120055 + }, + { + "epoch": 17.882037533512065, + "grad_norm": 0.053955078125, + "learning_rate": 0.0010133557540476506, + "loss": 0.7884, + "num_input_tokens_seen": 69707760, + "step": 120060 + }, + { + "epoch": 17.882782246053022, + "grad_norm": 0.041015625, + "learning_rate": 0.0010126514290061761, + "loss": 0.8116, + "num_input_tokens_seen": 69710672, + "step": 120065 + }, + { + "epoch": 17.883526958593983, + "grad_norm": 0.0267333984375, + "learning_rate": 0.001011947340266741, + "loss": 0.7984, + "num_input_tokens_seen": 69713712, + "step": 120070 + }, + { + "epoch": 17.884271671134943, + "grad_norm": 0.04052734375, + "learning_rate": 0.0010112434878412364, + "loss": 0.8026, + "num_input_tokens_seen": 69716560, + "step": 120075 + }, + { + "epoch": 17.8850163836759, + "grad_norm": 0.1162109375, + "learning_rate": 0.0010105398717415552, + "loss": 0.7847, + "num_input_tokens_seen": 69719408, + "step": 120080 + }, + { + "epoch": 17.88576109621686, + "grad_norm": 0.05029296875, + "learning_rate": 0.0010098364919795854, + "loss": 0.8129, + "num_input_tokens_seen": 69722096, + "step": 120085 + }, + { + "epoch": 17.886505808757818, + "grad_norm": 0.04345703125, + "learning_rate": 0.0010091333485672104, + "loss": 0.7927, + "num_input_tokens_seen": 69725200, + "step": 120090 + }, + { + "epoch": 17.88725052129878, + "grad_norm": 0.04443359375, + "learning_rate": 0.0010084304415163042, + "loss": 0.7903, + "num_input_tokens_seen": 69728368, + "step": 120095 + }, + { + "epoch": 17.88799523383974, + "grad_norm": 0.0556640625, + "learning_rate": 0.0010077277708387489, + "loss": 0.7995, + "num_input_tokens_seen": 69731280, + "step": 120100 + }, + { + "epoch": 17.888739946380696, + "grad_norm": 0.03369140625, + "learning_rate": 0.0010070253365464083, + "loss": 0.8056, + "num_input_tokens_seen": 69734352, + "step": 120105 + }, + { + "epoch": 17.889484658921656, + "grad_norm": 0.051025390625, + "learning_rate": 0.0010063231386511563, + "loss": 0.8359, + "num_input_tokens_seen": 69737200, + "step": 120110 + }, + { + "epoch": 17.890229371462617, + "grad_norm": 0.0537109375, + "learning_rate": 0.0010056211771648483, + "loss": 0.8118, + "num_input_tokens_seen": 69740112, + "step": 120115 + }, + { + "epoch": 17.890974084003574, + "grad_norm": 0.04638671875, + "learning_rate": 0.0010049194520993514, + "loss": 0.7998, + "num_input_tokens_seen": 69743024, + "step": 120120 + }, + { + "epoch": 17.891718796544534, + "grad_norm": 0.05859375, + "learning_rate": 0.0010042179634665149, + "loss": 0.8062, + "num_input_tokens_seen": 69745840, + "step": 120125 + }, + { + "epoch": 17.89246350908549, + "grad_norm": 0.07470703125, + "learning_rate": 0.0010035167112781905, + "loss": 0.8011, + "num_input_tokens_seen": 69748752, + "step": 120130 + }, + { + "epoch": 17.893208221626452, + "grad_norm": 0.06640625, + "learning_rate": 0.0010028156955462274, + "loss": 0.7775, + "num_input_tokens_seen": 69751760, + "step": 120135 + }, + { + "epoch": 17.893952934167412, + "grad_norm": 0.0498046875, + "learning_rate": 0.001002114916282466, + "loss": 0.8015, + "num_input_tokens_seen": 69754672, + "step": 120140 + }, + { + "epoch": 17.89469764670837, + "grad_norm": 0.059814453125, + "learning_rate": 0.0010014143734987484, + "loss": 0.7935, + "num_input_tokens_seen": 69757584, + "step": 120145 + }, + { + "epoch": 17.89544235924933, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0010007140672069037, + "loss": 0.7931, + "num_input_tokens_seen": 69760528, + "step": 120150 + }, + { + "epoch": 17.89618707179029, + "grad_norm": 0.041259765625, + "learning_rate": 0.0010000139974187688, + "loss": 0.7677, + "num_input_tokens_seen": 69763216, + "step": 120155 + }, + { + "epoch": 17.896931784331247, + "grad_norm": 0.0625, + "learning_rate": 0.0009993141641461661, + "loss": 0.7923, + "num_input_tokens_seen": 69766160, + "step": 120160 + }, + { + "epoch": 17.897676496872208, + "grad_norm": 0.04150390625, + "learning_rate": 0.0009986145674009245, + "loss": 0.7893, + "num_input_tokens_seen": 69768944, + "step": 120165 + }, + { + "epoch": 17.898421209413165, + "grad_norm": 0.07373046875, + "learning_rate": 0.0009979152071948577, + "loss": 0.7889, + "num_input_tokens_seen": 69772144, + "step": 120170 + }, + { + "epoch": 17.899165921954125, + "grad_norm": 0.036865234375, + "learning_rate": 0.0009972160835397814, + "loss": 0.7941, + "num_input_tokens_seen": 69774992, + "step": 120175 + }, + { + "epoch": 17.899910634495086, + "grad_norm": 0.044189453125, + "learning_rate": 0.0009965171964475095, + "loss": 0.8176, + "num_input_tokens_seen": 69777936, + "step": 120180 + }, + { + "epoch": 17.900655347036043, + "grad_norm": 0.07421875, + "learning_rate": 0.000995818545929844, + "loss": 0.7966, + "num_input_tokens_seen": 69780688, + "step": 120185 + }, + { + "epoch": 17.901400059577004, + "grad_norm": 0.036865234375, + "learning_rate": 0.000995120131998594, + "loss": 0.793, + "num_input_tokens_seen": 69783536, + "step": 120190 + }, + { + "epoch": 17.902144772117964, + "grad_norm": 0.03759765625, + "learning_rate": 0.0009944219546655535, + "loss": 0.7936, + "num_input_tokens_seen": 69786512, + "step": 120195 + }, + { + "epoch": 17.90288948465892, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0009937240139425213, + "loss": 0.8316, + "num_input_tokens_seen": 69789360, + "step": 120200 + }, + { + "epoch": 17.90363419719988, + "grad_norm": 0.0400390625, + "learning_rate": 0.000993026309841286, + "loss": 0.812, + "num_input_tokens_seen": 69792304, + "step": 120205 + }, + { + "epoch": 17.90437890974084, + "grad_norm": 0.03662109375, + "learning_rate": 0.000992328842373637, + "loss": 0.8193, + "num_input_tokens_seen": 69795184, + "step": 120210 + }, + { + "epoch": 17.9051236222818, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0009916316115513545, + "loss": 0.7887, + "num_input_tokens_seen": 69798000, + "step": 120215 + }, + { + "epoch": 17.90586833482276, + "grad_norm": 0.034912109375, + "learning_rate": 0.0009909346173862144, + "loss": 0.8177, + "num_input_tokens_seen": 69801072, + "step": 120220 + }, + { + "epoch": 17.906613047363717, + "grad_norm": 0.052490234375, + "learning_rate": 0.0009902378598900003, + "loss": 0.7925, + "num_input_tokens_seen": 69803568, + "step": 120225 + }, + { + "epoch": 17.907357759904677, + "grad_norm": 0.05029296875, + "learning_rate": 0.0009895413390744762, + "loss": 0.7966, + "num_input_tokens_seen": 69806544, + "step": 120230 + }, + { + "epoch": 17.908102472445638, + "grad_norm": 0.06640625, + "learning_rate": 0.0009888450549514127, + "loss": 0.7844, + "num_input_tokens_seen": 69809616, + "step": 120235 + }, + { + "epoch": 17.908847184986595, + "grad_norm": 0.02880859375, + "learning_rate": 0.0009881490075325705, + "loss": 0.7784, + "num_input_tokens_seen": 69812432, + "step": 120240 + }, + { + "epoch": 17.909591897527555, + "grad_norm": 0.234375, + "learning_rate": 0.00098745319682971, + "loss": 0.8177, + "num_input_tokens_seen": 69815344, + "step": 120245 + }, + { + "epoch": 17.910336610068512, + "grad_norm": 0.059326171875, + "learning_rate": 0.0009867576228545887, + "loss": 0.7876, + "num_input_tokens_seen": 69818224, + "step": 120250 + }, + { + "epoch": 17.911081322609473, + "grad_norm": 0.03076171875, + "learning_rate": 0.0009860622856189504, + "loss": 0.7811, + "num_input_tokens_seen": 69821104, + "step": 120255 + }, + { + "epoch": 17.911826035150433, + "grad_norm": 0.052978515625, + "learning_rate": 0.0009853671851345507, + "loss": 0.7938, + "num_input_tokens_seen": 69823792, + "step": 120260 + }, + { + "epoch": 17.91257074769139, + "grad_norm": 0.037353515625, + "learning_rate": 0.0009846723214131286, + "loss": 0.796, + "num_input_tokens_seen": 69826384, + "step": 120265 + }, + { + "epoch": 17.91331546023235, + "grad_norm": 0.041748046875, + "learning_rate": 0.0009839776944664197, + "loss": 0.8103, + "num_input_tokens_seen": 69829200, + "step": 120270 + }, + { + "epoch": 17.91406017277331, + "grad_norm": 0.03955078125, + "learning_rate": 0.0009832833043061644, + "loss": 0.8166, + "num_input_tokens_seen": 69832048, + "step": 120275 + }, + { + "epoch": 17.91480488531427, + "grad_norm": 0.0341796875, + "learning_rate": 0.0009825891509440937, + "loss": 0.7756, + "num_input_tokens_seen": 69834800, + "step": 120280 + }, + { + "epoch": 17.91554959785523, + "grad_norm": 0.060546875, + "learning_rate": 0.0009818952343919312, + "loss": 0.8152, + "num_input_tokens_seen": 69837488, + "step": 120285 + }, + { + "epoch": 17.916294310396186, + "grad_norm": 0.050048828125, + "learning_rate": 0.0009812015546613994, + "loss": 0.8083, + "num_input_tokens_seen": 69840368, + "step": 120290 + }, + { + "epoch": 17.917039022937146, + "grad_norm": 0.042236328125, + "learning_rate": 0.0009805081117642207, + "loss": 0.791, + "num_input_tokens_seen": 69843280, + "step": 120295 + }, + { + "epoch": 17.917783735478107, + "grad_norm": 0.056884765625, + "learning_rate": 0.000979814905712107, + "loss": 0.8082, + "num_input_tokens_seen": 69846224, + "step": 120300 + }, + { + "epoch": 17.918528448019064, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009791219365167742, + "loss": 0.8018, + "num_input_tokens_seen": 69849008, + "step": 120305 + }, + { + "epoch": 17.919273160560024, + "grad_norm": 0.07958984375, + "learning_rate": 0.0009784292041899228, + "loss": 0.8138, + "num_input_tokens_seen": 69851792, + "step": 120310 + }, + { + "epoch": 17.92001787310098, + "grad_norm": 0.061279296875, + "learning_rate": 0.0009777367087432604, + "loss": 0.7976, + "num_input_tokens_seen": 69854672, + "step": 120315 + }, + { + "epoch": 17.920762585641942, + "grad_norm": 0.052734375, + "learning_rate": 0.000977044450188484, + "loss": 0.7976, + "num_input_tokens_seen": 69857744, + "step": 120320 + }, + { + "epoch": 17.921507298182902, + "grad_norm": 0.189453125, + "learning_rate": 0.0009763524285372909, + "loss": 0.8028, + "num_input_tokens_seen": 69861008, + "step": 120325 + }, + { + "epoch": 17.92225201072386, + "grad_norm": 0.06591796875, + "learning_rate": 0.0009756606438013703, + "loss": 0.8067, + "num_input_tokens_seen": 69863728, + "step": 120330 + }, + { + "epoch": 17.92299672326482, + "grad_norm": 0.0400390625, + "learning_rate": 0.0009749690959924112, + "loss": 0.7877, + "num_input_tokens_seen": 69866608, + "step": 120335 + }, + { + "epoch": 17.92374143580578, + "grad_norm": 0.03759765625, + "learning_rate": 0.000974277785122094, + "loss": 0.8135, + "num_input_tokens_seen": 69869008, + "step": 120340 + }, + { + "epoch": 17.924486148346737, + "grad_norm": 0.04833984375, + "learning_rate": 0.0009735867112020945, + "loss": 0.798, + "num_input_tokens_seen": 69871760, + "step": 120345 + }, + { + "epoch": 17.925230860887698, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0009728958742440952, + "loss": 0.7852, + "num_input_tokens_seen": 69874608, + "step": 120350 + }, + { + "epoch": 17.925975573428655, + "grad_norm": 0.04296875, + "learning_rate": 0.0009722052742597615, + "loss": 0.8108, + "num_input_tokens_seen": 69877136, + "step": 120355 + }, + { + "epoch": 17.926720285969616, + "grad_norm": 0.0615234375, + "learning_rate": 0.0009715149112607657, + "loss": 0.8038, + "num_input_tokens_seen": 69879952, + "step": 120360 + }, + { + "epoch": 17.927464998510576, + "grad_norm": 0.051513671875, + "learning_rate": 0.0009708247852587654, + "loss": 0.8035, + "num_input_tokens_seen": 69882672, + "step": 120365 + }, + { + "epoch": 17.928209711051533, + "grad_norm": 0.0556640625, + "learning_rate": 0.000970134896265421, + "loss": 0.8235, + "num_input_tokens_seen": 69885520, + "step": 120370 + }, + { + "epoch": 17.928954423592494, + "grad_norm": 0.037109375, + "learning_rate": 0.00096944524429239, + "loss": 0.7872, + "num_input_tokens_seen": 69888560, + "step": 120375 + }, + { + "epoch": 17.929699136133454, + "grad_norm": 0.031494140625, + "learning_rate": 0.0009687558293513198, + "loss": 0.8391, + "num_input_tokens_seen": 69891472, + "step": 120380 + }, + { + "epoch": 17.93044384867441, + "grad_norm": 0.0419921875, + "learning_rate": 0.000968066651453861, + "loss": 0.8224, + "num_input_tokens_seen": 69894672, + "step": 120385 + }, + { + "epoch": 17.93118856121537, + "grad_norm": 0.06103515625, + "learning_rate": 0.0009673777106116526, + "loss": 0.7784, + "num_input_tokens_seen": 69897456, + "step": 120390 + }, + { + "epoch": 17.93193327375633, + "grad_norm": 0.03173828125, + "learning_rate": 0.000966689006836337, + "loss": 0.7882, + "num_input_tokens_seen": 69900720, + "step": 120395 + }, + { + "epoch": 17.93267798629729, + "grad_norm": 0.0380859375, + "learning_rate": 0.0009660005401395483, + "loss": 0.7941, + "num_input_tokens_seen": 69903536, + "step": 120400 + }, + { + "epoch": 17.93342269883825, + "grad_norm": 0.0712890625, + "learning_rate": 0.0009653123105329169, + "loss": 0.8056, + "num_input_tokens_seen": 69906288, + "step": 120405 + }, + { + "epoch": 17.934167411379207, + "grad_norm": 0.06591796875, + "learning_rate": 0.0009646243180280705, + "loss": 0.7834, + "num_input_tokens_seen": 69909136, + "step": 120410 + }, + { + "epoch": 17.934912123920167, + "grad_norm": 0.040283203125, + "learning_rate": 0.000963936562636628, + "loss": 0.802, + "num_input_tokens_seen": 69911888, + "step": 120415 + }, + { + "epoch": 17.935656836461128, + "grad_norm": 0.06689453125, + "learning_rate": 0.0009632490443702152, + "loss": 0.8049, + "num_input_tokens_seen": 69914864, + "step": 120420 + }, + { + "epoch": 17.936401549002085, + "grad_norm": 0.04296875, + "learning_rate": 0.0009625617632404409, + "loss": 0.7946, + "num_input_tokens_seen": 69918064, + "step": 120425 + }, + { + "epoch": 17.937146261543045, + "grad_norm": 0.0361328125, + "learning_rate": 0.000961874719258921, + "loss": 0.7829, + "num_input_tokens_seen": 69920752, + "step": 120430 + }, + { + "epoch": 17.937890974084002, + "grad_norm": 0.036376953125, + "learning_rate": 0.0009611879124372562, + "loss": 0.8128, + "num_input_tokens_seen": 69923760, + "step": 120435 + }, + { + "epoch": 17.938635686624963, + "grad_norm": 0.08056640625, + "learning_rate": 0.0009605013427870573, + "loss": 0.7985, + "num_input_tokens_seen": 69926640, + "step": 120440 + }, + { + "epoch": 17.939380399165923, + "grad_norm": 0.050537109375, + "learning_rate": 0.0009598150103199149, + "loss": 0.8016, + "num_input_tokens_seen": 69929488, + "step": 120445 + }, + { + "epoch": 17.94012511170688, + "grad_norm": 0.03955078125, + "learning_rate": 0.000959128915047433, + "loss": 0.7994, + "num_input_tokens_seen": 69932528, + "step": 120450 + }, + { + "epoch": 17.94086982424784, + "grad_norm": 0.030029296875, + "learning_rate": 0.0009584430569811958, + "loss": 0.8112, + "num_input_tokens_seen": 69935184, + "step": 120455 + }, + { + "epoch": 17.941614536788798, + "grad_norm": 0.0277099609375, + "learning_rate": 0.0009577574361327889, + "loss": 0.784, + "num_input_tokens_seen": 69938160, + "step": 120460 + }, + { + "epoch": 17.94235924932976, + "grad_norm": 0.058837890625, + "learning_rate": 0.0009570720525138032, + "loss": 0.8268, + "num_input_tokens_seen": 69940848, + "step": 120465 + }, + { + "epoch": 17.94310396187072, + "grad_norm": 0.06982421875, + "learning_rate": 0.000956386906135811, + "loss": 0.7949, + "num_input_tokens_seen": 69943696, + "step": 120470 + }, + { + "epoch": 17.943848674411676, + "grad_norm": 0.1044921875, + "learning_rate": 0.000955701997010388, + "loss": 0.7893, + "num_input_tokens_seen": 69946544, + "step": 120475 + }, + { + "epoch": 17.944593386952636, + "grad_norm": 0.047119140625, + "learning_rate": 0.0009550173251491033, + "loss": 0.779, + "num_input_tokens_seen": 69949712, + "step": 120480 + }, + { + "epoch": 17.945338099493597, + "grad_norm": 0.08544921875, + "learning_rate": 0.0009543328905635295, + "loss": 0.7892, + "num_input_tokens_seen": 69952560, + "step": 120485 + }, + { + "epoch": 17.946082812034554, + "grad_norm": 0.0576171875, + "learning_rate": 0.0009536486932652271, + "loss": 0.8193, + "num_input_tokens_seen": 69955472, + "step": 120490 + }, + { + "epoch": 17.946827524575514, + "grad_norm": 0.048828125, + "learning_rate": 0.0009529647332657504, + "loss": 0.7934, + "num_input_tokens_seen": 69958480, + "step": 120495 + }, + { + "epoch": 17.94757223711647, + "grad_norm": 0.0849609375, + "learning_rate": 0.0009522810105766599, + "loss": 0.8128, + "num_input_tokens_seen": 69961552, + "step": 120500 + }, + { + "epoch": 17.948316949657432, + "grad_norm": 0.051513671875, + "learning_rate": 0.0009515975252095033, + "loss": 0.8041, + "num_input_tokens_seen": 69964496, + "step": 120505 + }, + { + "epoch": 17.949061662198392, + "grad_norm": 0.052001953125, + "learning_rate": 0.0009509142771758293, + "loss": 0.7851, + "num_input_tokens_seen": 69967248, + "step": 120510 + }, + { + "epoch": 17.94980637473935, + "grad_norm": 0.08544921875, + "learning_rate": 0.0009502312664871775, + "loss": 0.7849, + "num_input_tokens_seen": 69970160, + "step": 120515 + }, + { + "epoch": 17.95055108728031, + "grad_norm": 0.02734375, + "learning_rate": 0.0009495484931550918, + "loss": 0.7873, + "num_input_tokens_seen": 69972976, + "step": 120520 + }, + { + "epoch": 17.95129579982127, + "grad_norm": 0.05126953125, + "learning_rate": 0.0009488659571911012, + "loss": 0.8062, + "num_input_tokens_seen": 69975920, + "step": 120525 + }, + { + "epoch": 17.952040512362228, + "grad_norm": 0.07666015625, + "learning_rate": 0.0009481836586067415, + "loss": 0.7836, + "num_input_tokens_seen": 69978832, + "step": 120530 + }, + { + "epoch": 17.952785224903188, + "grad_norm": 0.056396484375, + "learning_rate": 0.0009475015974135387, + "loss": 0.7951, + "num_input_tokens_seen": 69981872, + "step": 120535 + }, + { + "epoch": 17.953529937444145, + "grad_norm": 0.041259765625, + "learning_rate": 0.0009468197736230116, + "loss": 0.8132, + "num_input_tokens_seen": 69984656, + "step": 120540 + }, + { + "epoch": 17.954274649985106, + "grad_norm": 0.055908203125, + "learning_rate": 0.0009461381872466828, + "loss": 0.778, + "num_input_tokens_seen": 69987568, + "step": 120545 + }, + { + "epoch": 17.955019362526066, + "grad_norm": 0.040771484375, + "learning_rate": 0.0009454568382960631, + "loss": 0.7959, + "num_input_tokens_seen": 69990352, + "step": 120550 + }, + { + "epoch": 17.955764075067023, + "grad_norm": 0.061279296875, + "learning_rate": 0.0009447757267826684, + "loss": 0.789, + "num_input_tokens_seen": 69993136, + "step": 120555 + }, + { + "epoch": 17.956508787607984, + "grad_norm": 0.0498046875, + "learning_rate": 0.000944094852718001, + "loss": 0.7905, + "num_input_tokens_seen": 69995984, + "step": 120560 + }, + { + "epoch": 17.957253500148944, + "grad_norm": 0.033935546875, + "learning_rate": 0.0009434142161135667, + "loss": 0.8124, + "num_input_tokens_seen": 69998736, + "step": 120565 + }, + { + "epoch": 17.9579982126899, + "grad_norm": 0.043212890625, + "learning_rate": 0.000942733816980863, + "loss": 0.8103, + "num_input_tokens_seen": 70001680, + "step": 120570 + }, + { + "epoch": 17.95874292523086, + "grad_norm": 0.0284423828125, + "learning_rate": 0.0009420536553313824, + "loss": 0.8114, + "num_input_tokens_seen": 70004400, + "step": 120575 + }, + { + "epoch": 17.95948763777182, + "grad_norm": 0.0400390625, + "learning_rate": 0.000941373731176619, + "loss": 0.8046, + "num_input_tokens_seen": 70007216, + "step": 120580 + }, + { + "epoch": 17.96023235031278, + "grad_norm": 0.06103515625, + "learning_rate": 0.0009406940445280553, + "loss": 0.8171, + "num_input_tokens_seen": 70010384, + "step": 120585 + }, + { + "epoch": 17.96097706285374, + "grad_norm": 0.053955078125, + "learning_rate": 0.0009400145953971789, + "loss": 0.8085, + "num_input_tokens_seen": 70013232, + "step": 120590 + }, + { + "epoch": 17.961721775394697, + "grad_norm": 0.057861328125, + "learning_rate": 0.0009393353837954671, + "loss": 0.7862, + "num_input_tokens_seen": 70016080, + "step": 120595 + }, + { + "epoch": 17.962466487935657, + "grad_norm": 0.052490234375, + "learning_rate": 0.0009386564097343908, + "loss": 0.7819, + "num_input_tokens_seen": 70018928, + "step": 120600 + }, + { + "epoch": 17.963211200476614, + "grad_norm": 0.03466796875, + "learning_rate": 0.0009379776732254241, + "loss": 0.8021, + "num_input_tokens_seen": 70022288, + "step": 120605 + }, + { + "epoch": 17.963955913017575, + "grad_norm": 0.03857421875, + "learning_rate": 0.0009372991742800346, + "loss": 0.803, + "num_input_tokens_seen": 70025040, + "step": 120610 + }, + { + "epoch": 17.964700625558535, + "grad_norm": 0.08349609375, + "learning_rate": 0.0009366209129096814, + "loss": 0.7997, + "num_input_tokens_seen": 70028208, + "step": 120615 + }, + { + "epoch": 17.965445338099492, + "grad_norm": 0.0269775390625, + "learning_rate": 0.000935942889125822, + "loss": 0.7877, + "num_input_tokens_seen": 70031088, + "step": 120620 + }, + { + "epoch": 17.966190050640453, + "grad_norm": 0.039794921875, + "learning_rate": 0.0009352651029399172, + "loss": 0.7912, + "num_input_tokens_seen": 70034096, + "step": 120625 + }, + { + "epoch": 17.966934763181413, + "grad_norm": 0.04736328125, + "learning_rate": 0.0009345875543634096, + "loss": 0.8302, + "num_input_tokens_seen": 70036976, + "step": 120630 + }, + { + "epoch": 17.96767947572237, + "grad_norm": 0.06298828125, + "learning_rate": 0.0009339102434077534, + "loss": 0.7701, + "num_input_tokens_seen": 70039856, + "step": 120635 + }, + { + "epoch": 17.96842418826333, + "grad_norm": 0.0791015625, + "learning_rate": 0.0009332331700843843, + "loss": 0.7895, + "num_input_tokens_seen": 70042736, + "step": 120640 + }, + { + "epoch": 17.969168900804288, + "grad_norm": 0.07421875, + "learning_rate": 0.0009325563344047466, + "loss": 0.8017, + "num_input_tokens_seen": 70045936, + "step": 120645 + }, + { + "epoch": 17.96991361334525, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009318797363802728, + "loss": 0.7845, + "num_input_tokens_seen": 70049296, + "step": 120650 + }, + { + "epoch": 17.97065832588621, + "grad_norm": 0.035400390625, + "learning_rate": 0.0009312033760223903, + "loss": 0.7797, + "num_input_tokens_seen": 70052304, + "step": 120655 + }, + { + "epoch": 17.971403038427166, + "grad_norm": 0.044677734375, + "learning_rate": 0.0009305272533425285, + "loss": 0.7598, + "num_input_tokens_seen": 70055216, + "step": 120660 + }, + { + "epoch": 17.972147750968126, + "grad_norm": 0.0302734375, + "learning_rate": 0.0009298513683521114, + "loss": 0.788, + "num_input_tokens_seen": 70058384, + "step": 120665 + }, + { + "epoch": 17.972892463509087, + "grad_norm": 0.03369140625, + "learning_rate": 0.0009291757210625534, + "loss": 0.7973, + "num_input_tokens_seen": 70061200, + "step": 120670 + }, + { + "epoch": 17.973637176050044, + "grad_norm": 0.06298828125, + "learning_rate": 0.0009285003114852702, + "loss": 0.8108, + "num_input_tokens_seen": 70064080, + "step": 120675 + }, + { + "epoch": 17.974381888591004, + "grad_norm": 0.037353515625, + "learning_rate": 0.0009278251396316727, + "loss": 0.8007, + "num_input_tokens_seen": 70067088, + "step": 120680 + }, + { + "epoch": 17.97512660113196, + "grad_norm": 0.06298828125, + "learning_rate": 0.0009271502055131669, + "loss": 0.8013, + "num_input_tokens_seen": 70070384, + "step": 120685 + }, + { + "epoch": 17.975871313672922, + "grad_norm": 0.041748046875, + "learning_rate": 0.0009264755091411569, + "loss": 0.7973, + "num_input_tokens_seen": 70073072, + "step": 120690 + }, + { + "epoch": 17.976616026213883, + "grad_norm": 0.031982421875, + "learning_rate": 0.0009258010505270386, + "loss": 0.8127, + "num_input_tokens_seen": 70076368, + "step": 120695 + }, + { + "epoch": 17.97736073875484, + "grad_norm": 0.044921875, + "learning_rate": 0.0009251268296822063, + "loss": 0.7789, + "num_input_tokens_seen": 70079216, + "step": 120700 + }, + { + "epoch": 17.9781054512958, + "grad_norm": 0.048095703125, + "learning_rate": 0.0009244528466180523, + "loss": 0.7932, + "num_input_tokens_seen": 70081840, + "step": 120705 + }, + { + "epoch": 17.97885016383676, + "grad_norm": 0.039794921875, + "learning_rate": 0.0009237791013459595, + "loss": 0.7941, + "num_input_tokens_seen": 70084944, + "step": 120710 + }, + { + "epoch": 17.979594876377718, + "grad_norm": 0.05908203125, + "learning_rate": 0.0009231055938773153, + "loss": 0.8061, + "num_input_tokens_seen": 70087792, + "step": 120715 + }, + { + "epoch": 17.980339588918678, + "grad_norm": 0.05419921875, + "learning_rate": 0.0009224323242234922, + "loss": 0.7985, + "num_input_tokens_seen": 70090544, + "step": 120720 + }, + { + "epoch": 17.981084301459635, + "grad_norm": 0.0390625, + "learning_rate": 0.0009217592923958695, + "loss": 0.8161, + "num_input_tokens_seen": 70093488, + "step": 120725 + }, + { + "epoch": 17.981829014000596, + "grad_norm": 0.033935546875, + "learning_rate": 0.0009210864984058164, + "loss": 0.7916, + "num_input_tokens_seen": 70096272, + "step": 120730 + }, + { + "epoch": 17.982573726541556, + "grad_norm": 0.05224609375, + "learning_rate": 0.0009204139422646957, + "loss": 0.7997, + "num_input_tokens_seen": 70098992, + "step": 120735 + }, + { + "epoch": 17.983318439082513, + "grad_norm": 0.038330078125, + "learning_rate": 0.0009197416239838729, + "loss": 0.8065, + "num_input_tokens_seen": 70101936, + "step": 120740 + }, + { + "epoch": 17.984063151623474, + "grad_norm": 0.048583984375, + "learning_rate": 0.0009190695435747026, + "loss": 0.7839, + "num_input_tokens_seen": 70104976, + "step": 120745 + }, + { + "epoch": 17.984807864164434, + "grad_norm": 0.05126953125, + "learning_rate": 0.0009183977010485422, + "loss": 0.7947, + "num_input_tokens_seen": 70107824, + "step": 120750 + }, + { + "epoch": 17.98555257670539, + "grad_norm": 0.0546875, + "learning_rate": 0.0009177260964167394, + "loss": 0.7888, + "num_input_tokens_seen": 70110736, + "step": 120755 + }, + { + "epoch": 17.98629728924635, + "grad_norm": 0.0634765625, + "learning_rate": 0.0009170547296906433, + "loss": 0.8081, + "num_input_tokens_seen": 70114000, + "step": 120760 + }, + { + "epoch": 17.98704200178731, + "grad_norm": 0.040283203125, + "learning_rate": 0.0009163836008815934, + "loss": 0.7937, + "num_input_tokens_seen": 70117136, + "step": 120765 + }, + { + "epoch": 17.98778671432827, + "grad_norm": 0.05615234375, + "learning_rate": 0.0009157127100009288, + "loss": 0.7845, + "num_input_tokens_seen": 70120080, + "step": 120770 + }, + { + "epoch": 17.98853142686923, + "grad_norm": 0.05517578125, + "learning_rate": 0.0009150420570599853, + "loss": 0.8039, + "num_input_tokens_seen": 70123088, + "step": 120775 + }, + { + "epoch": 17.989276139410187, + "grad_norm": 0.0498046875, + "learning_rate": 0.0009143716420700875, + "loss": 0.7943, + "num_input_tokens_seen": 70125936, + "step": 120780 + }, + { + "epoch": 17.990020851951147, + "grad_norm": 0.053955078125, + "learning_rate": 0.0009137014650425679, + "loss": 0.8034, + "num_input_tokens_seen": 70128656, + "step": 120785 + }, + { + "epoch": 17.990765564492108, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0009130315259887423, + "loss": 0.7857, + "num_input_tokens_seen": 70131568, + "step": 120790 + }, + { + "epoch": 17.991510277033065, + "grad_norm": 0.044921875, + "learning_rate": 0.0009123618249199333, + "loss": 0.7936, + "num_input_tokens_seen": 70134608, + "step": 120795 + }, + { + "epoch": 17.992254989574025, + "grad_norm": 0.059814453125, + "learning_rate": 0.0009116923618474554, + "loss": 0.797, + "num_input_tokens_seen": 70137648, + "step": 120800 + }, + { + "epoch": 17.992999702114982, + "grad_norm": 0.031982421875, + "learning_rate": 0.0009110231367826144, + "loss": 0.7976, + "num_input_tokens_seen": 70140624, + "step": 120805 + }, + { + "epoch": 17.993744414655943, + "grad_norm": 0.036376953125, + "learning_rate": 0.0009103541497367195, + "loss": 0.7787, + "num_input_tokens_seen": 70143248, + "step": 120810 + }, + { + "epoch": 17.994489127196903, + "grad_norm": 0.056884765625, + "learning_rate": 0.0009096854007210669, + "loss": 0.8062, + "num_input_tokens_seen": 70146384, + "step": 120815 + }, + { + "epoch": 17.99523383973786, + "grad_norm": 0.0400390625, + "learning_rate": 0.0009090168897469624, + "loss": 0.812, + "num_input_tokens_seen": 70149488, + "step": 120820 + }, + { + "epoch": 17.99597855227882, + "grad_norm": 0.036865234375, + "learning_rate": 0.0009083486168256937, + "loss": 0.7907, + "num_input_tokens_seen": 70152208, + "step": 120825 + }, + { + "epoch": 17.996723264819778, + "grad_norm": 0.04296875, + "learning_rate": 0.0009076805819685568, + "loss": 0.7877, + "num_input_tokens_seen": 70155152, + "step": 120830 + }, + { + "epoch": 17.99746797736074, + "grad_norm": 0.052978515625, + "learning_rate": 0.0009070127851868292, + "loss": 0.8062, + "num_input_tokens_seen": 70158160, + "step": 120835 + }, + { + "epoch": 17.9982126899017, + "grad_norm": 0.1044921875, + "learning_rate": 0.0009063452264918004, + "loss": 0.7905, + "num_input_tokens_seen": 70161072, + "step": 120840 + }, + { + "epoch": 17.998957402442656, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0009056779058947412, + "loss": 0.8015, + "num_input_tokens_seen": 70163856, + "step": 120845 + }, + { + "epoch": 17.999702114983616, + "grad_norm": 0.06005859375, + "learning_rate": 0.0009050108234069326, + "loss": 0.81, + "num_input_tokens_seen": 70166640, + "step": 120850 + }, + { + "epoch": 18.0, + "eval_loss": 0.7993520498275757, + "eval_runtime": 70.495, + "eval_samples_per_second": 42.329, + "eval_steps_per_second": 10.582, + "num_input_tokens_seen": 70167432, + "step": 120852 + }, + { + "epoch": 18.000446827524577, + "grad_norm": 0.052734375, + "learning_rate": 0.0009043439790396406, + "loss": 0.793, + "num_input_tokens_seen": 70169320, + "step": 120855 + }, + { + "epoch": 18.001191540065534, + "grad_norm": 0.0869140625, + "learning_rate": 0.000903677372804128, + "loss": 0.8064, + "num_input_tokens_seen": 70172264, + "step": 120860 + }, + { + "epoch": 18.001936252606495, + "grad_norm": 0.036376953125, + "learning_rate": 0.0009030110047116624, + "loss": 0.8034, + "num_input_tokens_seen": 70174984, + "step": 120865 + }, + { + "epoch": 18.00268096514745, + "grad_norm": 0.05224609375, + "learning_rate": 0.000902344874773498, + "loss": 0.787, + "num_input_tokens_seen": 70177832, + "step": 120870 + }, + { + "epoch": 18.003425677688412, + "grad_norm": 0.052978515625, + "learning_rate": 0.0009016789830008892, + "loss": 0.7993, + "num_input_tokens_seen": 70180872, + "step": 120875 + }, + { + "epoch": 18.004170390229373, + "grad_norm": 0.03955078125, + "learning_rate": 0.000901013329405082, + "loss": 0.7974, + "num_input_tokens_seen": 70183784, + "step": 120880 + }, + { + "epoch": 18.00491510277033, + "grad_norm": 0.042236328125, + "learning_rate": 0.0009003479139973275, + "loss": 0.804, + "num_input_tokens_seen": 70186536, + "step": 120885 + }, + { + "epoch": 18.00565981531129, + "grad_norm": 0.060546875, + "learning_rate": 0.0008996827367888649, + "loss": 0.8347, + "num_input_tokens_seen": 70190024, + "step": 120890 + }, + { + "epoch": 18.00640452785225, + "grad_norm": 0.04638671875, + "learning_rate": 0.0008990177977909286, + "loss": 0.7969, + "num_input_tokens_seen": 70192680, + "step": 120895 + }, + { + "epoch": 18.007149240393208, + "grad_norm": 0.07763671875, + "learning_rate": 0.0008983530970147579, + "loss": 0.7943, + "num_input_tokens_seen": 70195784, + "step": 120900 + }, + { + "epoch": 18.007893952934168, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008976886344715773, + "loss": 0.7889, + "num_input_tokens_seen": 70198728, + "step": 120905 + }, + { + "epoch": 18.008638665475125, + "grad_norm": 0.059814453125, + "learning_rate": 0.000897024410172616, + "loss": 0.812, + "num_input_tokens_seen": 70201544, + "step": 120910 + }, + { + "epoch": 18.009383378016086, + "grad_norm": 0.0556640625, + "learning_rate": 0.0008963604241290918, + "loss": 0.8042, + "num_input_tokens_seen": 70204232, + "step": 120915 + }, + { + "epoch": 18.010128090557046, + "grad_norm": 0.055419921875, + "learning_rate": 0.0008956966763522256, + "loss": 0.779, + "num_input_tokens_seen": 70206856, + "step": 120920 + }, + { + "epoch": 18.010872803098003, + "grad_norm": 0.09375, + "learning_rate": 0.0008950331668532302, + "loss": 0.816, + "num_input_tokens_seen": 70209512, + "step": 120925 + }, + { + "epoch": 18.011617515638964, + "grad_norm": 0.052490234375, + "learning_rate": 0.0008943698956433098, + "loss": 0.797, + "num_input_tokens_seen": 70212488, + "step": 120930 + }, + { + "epoch": 18.012362228179924, + "grad_norm": 0.05517578125, + "learning_rate": 0.0008937068627336774, + "loss": 0.7913, + "num_input_tokens_seen": 70216008, + "step": 120935 + }, + { + "epoch": 18.01310694072088, + "grad_norm": 0.03564453125, + "learning_rate": 0.0008930440681355306, + "loss": 0.7797, + "num_input_tokens_seen": 70218920, + "step": 120940 + }, + { + "epoch": 18.013851653261842, + "grad_norm": 0.09814453125, + "learning_rate": 0.0008923815118600653, + "loss": 0.7846, + "num_input_tokens_seen": 70221704, + "step": 120945 + }, + { + "epoch": 18.0145963658028, + "grad_norm": 0.049072265625, + "learning_rate": 0.0008917191939184759, + "loss": 0.798, + "num_input_tokens_seen": 70224552, + "step": 120950 + }, + { + "epoch": 18.01534107834376, + "grad_norm": 0.0390625, + "learning_rate": 0.000891057114321952, + "loss": 0.7828, + "num_input_tokens_seen": 70227240, + "step": 120955 + }, + { + "epoch": 18.01608579088472, + "grad_norm": 0.0361328125, + "learning_rate": 0.0008903952730816761, + "loss": 0.8104, + "num_input_tokens_seen": 70230088, + "step": 120960 + }, + { + "epoch": 18.016830503425677, + "grad_norm": 0.041015625, + "learning_rate": 0.000889733670208836, + "loss": 0.8117, + "num_input_tokens_seen": 70232904, + "step": 120965 + }, + { + "epoch": 18.017575215966637, + "grad_norm": 0.052734375, + "learning_rate": 0.0008890723057146027, + "loss": 0.7906, + "num_input_tokens_seen": 70235432, + "step": 120970 + }, + { + "epoch": 18.018319928507594, + "grad_norm": 0.0380859375, + "learning_rate": 0.000888411179610149, + "loss": 0.7914, + "num_input_tokens_seen": 70238120, + "step": 120975 + }, + { + "epoch": 18.019064641048555, + "grad_norm": 0.109375, + "learning_rate": 0.0008877502919066493, + "loss": 0.7946, + "num_input_tokens_seen": 70241032, + "step": 120980 + }, + { + "epoch": 18.019809353589515, + "grad_norm": 0.05078125, + "learning_rate": 0.000887089642615263, + "loss": 0.7992, + "num_input_tokens_seen": 70243912, + "step": 120985 + }, + { + "epoch": 18.020554066130472, + "grad_norm": 0.06494140625, + "learning_rate": 0.0008864292317471544, + "loss": 0.8061, + "num_input_tokens_seen": 70246792, + "step": 120990 + }, + { + "epoch": 18.021298778671433, + "grad_norm": 0.375, + "learning_rate": 0.0008857690593134798, + "loss": 0.8256, + "num_input_tokens_seen": 70249576, + "step": 120995 + }, + { + "epoch": 18.022043491212393, + "grad_norm": 0.0274658203125, + "learning_rate": 0.0008851091253253934, + "loss": 0.8124, + "num_input_tokens_seen": 70252520, + "step": 121000 + }, + { + "epoch": 18.02278820375335, + "grad_norm": 0.053466796875, + "learning_rate": 0.0008844494297940397, + "loss": 0.7867, + "num_input_tokens_seen": 70255336, + "step": 121005 + }, + { + "epoch": 18.02353291629431, + "grad_norm": 0.0576171875, + "learning_rate": 0.0008837899727305681, + "loss": 0.8084, + "num_input_tokens_seen": 70258440, + "step": 121010 + }, + { + "epoch": 18.024277628835268, + "grad_norm": 0.037841796875, + "learning_rate": 0.000883130754146118, + "loss": 0.8052, + "num_input_tokens_seen": 70261128, + "step": 121015 + }, + { + "epoch": 18.02502234137623, + "grad_norm": 0.038330078125, + "learning_rate": 0.000882471774051824, + "loss": 0.8001, + "num_input_tokens_seen": 70263976, + "step": 121020 + }, + { + "epoch": 18.02576705391719, + "grad_norm": 0.0576171875, + "learning_rate": 0.0008818130324588236, + "loss": 0.8193, + "num_input_tokens_seen": 70266920, + "step": 121025 + }, + { + "epoch": 18.026511766458146, + "grad_norm": 0.045654296875, + "learning_rate": 0.0008811545293782397, + "loss": 0.7904, + "num_input_tokens_seen": 70269832, + "step": 121030 + }, + { + "epoch": 18.027256478999107, + "grad_norm": 0.05224609375, + "learning_rate": 0.0008804962648212033, + "loss": 0.8124, + "num_input_tokens_seen": 70272808, + "step": 121035 + }, + { + "epoch": 18.028001191540067, + "grad_norm": 0.0498046875, + "learning_rate": 0.000879838238798829, + "loss": 0.7845, + "num_input_tokens_seen": 70275560, + "step": 121040 + }, + { + "epoch": 18.028745904081024, + "grad_norm": 0.051025390625, + "learning_rate": 0.0008791804513222395, + "loss": 0.8067, + "num_input_tokens_seen": 70278312, + "step": 121045 + }, + { + "epoch": 18.029490616621985, + "grad_norm": 0.046875, + "learning_rate": 0.0008785229024025426, + "loss": 0.8091, + "num_input_tokens_seen": 70281480, + "step": 121050 + }, + { + "epoch": 18.03023532916294, + "grad_norm": 0.056396484375, + "learning_rate": 0.000877865592050851, + "loss": 0.81, + "num_input_tokens_seen": 70284392, + "step": 121055 + }, + { + "epoch": 18.030980041703902, + "grad_norm": 0.0458984375, + "learning_rate": 0.0008772085202782659, + "loss": 0.803, + "num_input_tokens_seen": 70287336, + "step": 121060 + }, + { + "epoch": 18.031724754244863, + "grad_norm": 0.054443359375, + "learning_rate": 0.00087655168709589, + "loss": 0.7699, + "num_input_tokens_seen": 70290408, + "step": 121065 + }, + { + "epoch": 18.03246946678582, + "grad_norm": 0.05029296875, + "learning_rate": 0.0008758950925148195, + "loss": 0.7913, + "num_input_tokens_seen": 70293192, + "step": 121070 + }, + { + "epoch": 18.03321417932678, + "grad_norm": 0.033935546875, + "learning_rate": 0.0008752387365461439, + "loss": 0.8008, + "num_input_tokens_seen": 70295880, + "step": 121075 + }, + { + "epoch": 18.03395889186774, + "grad_norm": 0.038818359375, + "learning_rate": 0.0008745826192009559, + "loss": 0.8062, + "num_input_tokens_seen": 70298632, + "step": 121080 + }, + { + "epoch": 18.034703604408698, + "grad_norm": 0.03955078125, + "learning_rate": 0.000873926740490335, + "loss": 0.7908, + "num_input_tokens_seen": 70301608, + "step": 121085 + }, + { + "epoch": 18.035448316949658, + "grad_norm": 0.041259765625, + "learning_rate": 0.0008732711004253674, + "loss": 0.8268, + "num_input_tokens_seen": 70304616, + "step": 121090 + }, + { + "epoch": 18.036193029490615, + "grad_norm": 0.06201171875, + "learning_rate": 0.0008726156990171274, + "loss": 0.7973, + "num_input_tokens_seen": 70307496, + "step": 121095 + }, + { + "epoch": 18.036937742031576, + "grad_norm": 0.040283203125, + "learning_rate": 0.0008719605362766847, + "loss": 0.8052, + "num_input_tokens_seen": 70310760, + "step": 121100 + }, + { + "epoch": 18.037682454572536, + "grad_norm": 0.038818359375, + "learning_rate": 0.0008713056122151119, + "loss": 0.8253, + "num_input_tokens_seen": 70313704, + "step": 121105 + }, + { + "epoch": 18.038427167113493, + "grad_norm": 0.056640625, + "learning_rate": 0.0008706509268434687, + "loss": 0.8025, + "num_input_tokens_seen": 70316840, + "step": 121110 + }, + { + "epoch": 18.039171879654454, + "grad_norm": 0.07568359375, + "learning_rate": 0.0008699964801728193, + "loss": 0.8089, + "num_input_tokens_seen": 70319624, + "step": 121115 + }, + { + "epoch": 18.039916592195414, + "grad_norm": 0.0458984375, + "learning_rate": 0.0008693422722142152, + "loss": 0.8005, + "num_input_tokens_seen": 70322408, + "step": 121120 + }, + { + "epoch": 18.04066130473637, + "grad_norm": 0.06884765625, + "learning_rate": 0.0008686883029787156, + "loss": 0.7878, + "num_input_tokens_seen": 70325224, + "step": 121125 + }, + { + "epoch": 18.041406017277332, + "grad_norm": 0.04296875, + "learning_rate": 0.0008680345724773635, + "loss": 0.8071, + "num_input_tokens_seen": 70327944, + "step": 121130 + }, + { + "epoch": 18.04215072981829, + "grad_norm": 0.057861328125, + "learning_rate": 0.0008673810807212051, + "loss": 0.8066, + "num_input_tokens_seen": 70330504, + "step": 121135 + }, + { + "epoch": 18.04289544235925, + "grad_norm": 0.053466796875, + "learning_rate": 0.0008667278277212797, + "loss": 0.7741, + "num_input_tokens_seen": 70334696, + "step": 121140 + }, + { + "epoch": 18.04364015490021, + "grad_norm": 0.043701171875, + "learning_rate": 0.0008660748134886203, + "loss": 0.7914, + "num_input_tokens_seen": 70337448, + "step": 121145 + }, + { + "epoch": 18.044384867441167, + "grad_norm": 0.03466796875, + "learning_rate": 0.0008654220380342648, + "loss": 0.8066, + "num_input_tokens_seen": 70340360, + "step": 121150 + }, + { + "epoch": 18.045129579982127, + "grad_norm": 0.0478515625, + "learning_rate": 0.0008647695013692358, + "loss": 0.8244, + "num_input_tokens_seen": 70343592, + "step": 121155 + }, + { + "epoch": 18.045874292523084, + "grad_norm": 0.041015625, + "learning_rate": 0.0008641172035045613, + "loss": 0.7996, + "num_input_tokens_seen": 70346504, + "step": 121160 + }, + { + "epoch": 18.046619005064045, + "grad_norm": 0.0556640625, + "learning_rate": 0.0008634651444512575, + "loss": 0.7821, + "num_input_tokens_seen": 70349544, + "step": 121165 + }, + { + "epoch": 18.047363717605005, + "grad_norm": 0.03857421875, + "learning_rate": 0.0008628133242203456, + "loss": 0.8074, + "num_input_tokens_seen": 70352360, + "step": 121170 + }, + { + "epoch": 18.048108430145962, + "grad_norm": 0.061767578125, + "learning_rate": 0.0008621617428228334, + "loss": 0.7857, + "num_input_tokens_seen": 70355528, + "step": 121175 + }, + { + "epoch": 18.048853142686923, + "grad_norm": 0.05224609375, + "learning_rate": 0.0008615104002697271, + "loss": 0.8026, + "num_input_tokens_seen": 70358280, + "step": 121180 + }, + { + "epoch": 18.049597855227884, + "grad_norm": 0.05517578125, + "learning_rate": 0.0008608592965720346, + "loss": 0.7902, + "num_input_tokens_seen": 70361288, + "step": 121185 + }, + { + "epoch": 18.05034256776884, + "grad_norm": 0.040771484375, + "learning_rate": 0.0008602084317407521, + "loss": 0.7845, + "num_input_tokens_seen": 70364296, + "step": 121190 + }, + { + "epoch": 18.0510872803098, + "grad_norm": 0.057861328125, + "learning_rate": 0.0008595578057868807, + "loss": 0.8034, + "num_input_tokens_seen": 70367240, + "step": 121195 + }, + { + "epoch": 18.051831992850758, + "grad_norm": 0.048095703125, + "learning_rate": 0.0008589074187214068, + "loss": 0.7922, + "num_input_tokens_seen": 70370056, + "step": 121200 + }, + { + "epoch": 18.05257670539172, + "grad_norm": 0.043212890625, + "learning_rate": 0.0008582572705553199, + "loss": 0.8026, + "num_input_tokens_seen": 70373064, + "step": 121205 + }, + { + "epoch": 18.05332141793268, + "grad_norm": 0.10400390625, + "learning_rate": 0.0008576073612996009, + "loss": 0.7928, + "num_input_tokens_seen": 70375816, + "step": 121210 + }, + { + "epoch": 18.054066130473636, + "grad_norm": 0.05224609375, + "learning_rate": 0.0008569576909652349, + "loss": 0.7954, + "num_input_tokens_seen": 70378920, + "step": 121215 + }, + { + "epoch": 18.054810843014597, + "grad_norm": 0.055908203125, + "learning_rate": 0.0008563082595631944, + "loss": 0.7943, + "num_input_tokens_seen": 70382344, + "step": 121220 + }, + { + "epoch": 18.055555555555557, + "grad_norm": 0.034912109375, + "learning_rate": 0.0008556590671044489, + "loss": 0.7989, + "num_input_tokens_seen": 70385288, + "step": 121225 + }, + { + "epoch": 18.056300268096514, + "grad_norm": 0.054931640625, + "learning_rate": 0.0008550101135999699, + "loss": 0.7854, + "num_input_tokens_seen": 70388008, + "step": 121230 + }, + { + "epoch": 18.057044980637475, + "grad_norm": 0.0546875, + "learning_rate": 0.0008543613990607151, + "loss": 0.8031, + "num_input_tokens_seen": 70390952, + "step": 121235 + }, + { + "epoch": 18.05778969317843, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008537129234976509, + "loss": 0.8091, + "num_input_tokens_seen": 70393704, + "step": 121240 + }, + { + "epoch": 18.058534405719392, + "grad_norm": 0.0380859375, + "learning_rate": 0.0008530646869217267, + "loss": 0.7936, + "num_input_tokens_seen": 70396808, + "step": 121245 + }, + { + "epoch": 18.059279118260353, + "grad_norm": 0.035400390625, + "learning_rate": 0.0008524166893438988, + "loss": 0.805, + "num_input_tokens_seen": 70399560, + "step": 121250 + }, + { + "epoch": 18.06002383080131, + "grad_norm": 0.041748046875, + "learning_rate": 0.000851768930775112, + "loss": 0.7917, + "num_input_tokens_seen": 70402568, + "step": 121255 + }, + { + "epoch": 18.06076854334227, + "grad_norm": 0.037841796875, + "learning_rate": 0.0008511214112263071, + "loss": 0.7921, + "num_input_tokens_seen": 70405512, + "step": 121260 + }, + { + "epoch": 18.06151325588323, + "grad_norm": 0.06298828125, + "learning_rate": 0.0008504741307084273, + "loss": 0.7908, + "num_input_tokens_seen": 70408264, + "step": 121265 + }, + { + "epoch": 18.062257968424188, + "grad_norm": 0.07275390625, + "learning_rate": 0.0008498270892324072, + "loss": 0.795, + "num_input_tokens_seen": 70411112, + "step": 121270 + }, + { + "epoch": 18.06300268096515, + "grad_norm": 0.036865234375, + "learning_rate": 0.0008491802868091763, + "loss": 0.8104, + "num_input_tokens_seen": 70413768, + "step": 121275 + }, + { + "epoch": 18.063747393506105, + "grad_norm": 0.056640625, + "learning_rate": 0.0008485337234496593, + "loss": 0.7864, + "num_input_tokens_seen": 70416744, + "step": 121280 + }, + { + "epoch": 18.064492106047066, + "grad_norm": 0.04638671875, + "learning_rate": 0.0008478873991647856, + "loss": 0.8099, + "num_input_tokens_seen": 70419528, + "step": 121285 + }, + { + "epoch": 18.065236818588026, + "grad_norm": 0.037109375, + "learning_rate": 0.0008472413139654666, + "loss": 0.7845, + "num_input_tokens_seen": 70422536, + "step": 121290 + }, + { + "epoch": 18.065981531128983, + "grad_norm": 0.061279296875, + "learning_rate": 0.0008465954678626253, + "loss": 0.7974, + "num_input_tokens_seen": 70425704, + "step": 121295 + }, + { + "epoch": 18.066726243669944, + "grad_norm": 0.036865234375, + "learning_rate": 0.0008459498608671678, + "loss": 0.8071, + "num_input_tokens_seen": 70428872, + "step": 121300 + }, + { + "epoch": 18.0674709562109, + "grad_norm": 0.038818359375, + "learning_rate": 0.0008453044929900006, + "loss": 0.7954, + "num_input_tokens_seen": 70431560, + "step": 121305 + }, + { + "epoch": 18.06821566875186, + "grad_norm": 0.050537109375, + "learning_rate": 0.0008446593642420296, + "loss": 0.7928, + "num_input_tokens_seen": 70434408, + "step": 121310 + }, + { + "epoch": 18.068960381292822, + "grad_norm": 0.0390625, + "learning_rate": 0.0008440144746341499, + "loss": 0.811, + "num_input_tokens_seen": 70437320, + "step": 121315 + }, + { + "epoch": 18.06970509383378, + "grad_norm": 0.031005859375, + "learning_rate": 0.0008433698241772608, + "loss": 0.7977, + "num_input_tokens_seen": 70440296, + "step": 121320 + }, + { + "epoch": 18.07044980637474, + "grad_norm": 0.14453125, + "learning_rate": 0.0008427254128822487, + "loss": 0.7763, + "num_input_tokens_seen": 70443240, + "step": 121325 + }, + { + "epoch": 18.0711945189157, + "grad_norm": 0.03759765625, + "learning_rate": 0.0008420812407600031, + "loss": 0.8031, + "num_input_tokens_seen": 70445960, + "step": 121330 + }, + { + "epoch": 18.071939231456657, + "grad_norm": 0.0751953125, + "learning_rate": 0.0008414373078214054, + "loss": 0.8185, + "num_input_tokens_seen": 70449320, + "step": 121335 + }, + { + "epoch": 18.072683943997617, + "grad_norm": 0.0400390625, + "learning_rate": 0.000840793614077332, + "loss": 0.8172, + "num_input_tokens_seen": 70452168, + "step": 121340 + }, + { + "epoch": 18.073428656538574, + "grad_norm": 0.042724609375, + "learning_rate": 0.0008401501595386606, + "loss": 0.7912, + "num_input_tokens_seen": 70455080, + "step": 121345 + }, + { + "epoch": 18.074173369079535, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008395069442162611, + "loss": 0.8081, + "num_input_tokens_seen": 70457896, + "step": 121350 + }, + { + "epoch": 18.074918081620496, + "grad_norm": 0.05224609375, + "learning_rate": 0.0008388639681209997, + "loss": 0.8023, + "num_input_tokens_seen": 70460904, + "step": 121355 + }, + { + "epoch": 18.075662794161452, + "grad_norm": 0.041748046875, + "learning_rate": 0.0008382212312637376, + "loss": 0.8093, + "num_input_tokens_seen": 70463816, + "step": 121360 + }, + { + "epoch": 18.076407506702413, + "grad_norm": 0.04443359375, + "learning_rate": 0.0008375787336553364, + "loss": 0.7771, + "num_input_tokens_seen": 70466856, + "step": 121365 + }, + { + "epoch": 18.077152219243374, + "grad_norm": 0.03466796875, + "learning_rate": 0.0008369364753066472, + "loss": 0.7941, + "num_input_tokens_seen": 70469800, + "step": 121370 + }, + { + "epoch": 18.07789693178433, + "grad_norm": 0.048828125, + "learning_rate": 0.000836294456228523, + "loss": 0.7811, + "num_input_tokens_seen": 70472840, + "step": 121375 + }, + { + "epoch": 18.07864164432529, + "grad_norm": 0.06396484375, + "learning_rate": 0.0008356526764318083, + "loss": 0.8004, + "num_input_tokens_seen": 70475368, + "step": 121380 + }, + { + "epoch": 18.079386356866248, + "grad_norm": 0.047119140625, + "learning_rate": 0.0008350111359273431, + "loss": 0.8058, + "num_input_tokens_seen": 70478440, + "step": 121385 + }, + { + "epoch": 18.08013106940721, + "grad_norm": 0.0673828125, + "learning_rate": 0.0008343698347259704, + "loss": 0.8014, + "num_input_tokens_seen": 70481288, + "step": 121390 + }, + { + "epoch": 18.08087578194817, + "grad_norm": 0.042236328125, + "learning_rate": 0.0008337287728385228, + "loss": 0.7735, + "num_input_tokens_seen": 70484168, + "step": 121395 + }, + { + "epoch": 18.081620494489126, + "grad_norm": 0.042724609375, + "learning_rate": 0.0008330879502758304, + "loss": 0.7897, + "num_input_tokens_seen": 70487112, + "step": 121400 + }, + { + "epoch": 18.082365207030087, + "grad_norm": 0.041259765625, + "learning_rate": 0.0008324473670487142, + "loss": 0.802, + "num_input_tokens_seen": 70489832, + "step": 121405 + }, + { + "epoch": 18.083109919571047, + "grad_norm": 0.0390625, + "learning_rate": 0.0008318070231680041, + "loss": 0.795, + "num_input_tokens_seen": 70492840, + "step": 121410 + }, + { + "epoch": 18.083854632112004, + "grad_norm": 0.058837890625, + "learning_rate": 0.0008311669186445147, + "loss": 0.7945, + "num_input_tokens_seen": 70495688, + "step": 121415 + }, + { + "epoch": 18.084599344652965, + "grad_norm": 0.0498046875, + "learning_rate": 0.0008305270534890557, + "loss": 0.7876, + "num_input_tokens_seen": 70498568, + "step": 121420 + }, + { + "epoch": 18.08534405719392, + "grad_norm": 0.04833984375, + "learning_rate": 0.0008298874277124452, + "loss": 0.7904, + "num_input_tokens_seen": 70501384, + "step": 121425 + }, + { + "epoch": 18.086088769734882, + "grad_norm": 0.048828125, + "learning_rate": 0.000829248041325481, + "loss": 0.8036, + "num_input_tokens_seen": 70504072, + "step": 121430 + }, + { + "epoch": 18.086833482275843, + "grad_norm": 0.072265625, + "learning_rate": 0.0008286088943389713, + "loss": 0.8052, + "num_input_tokens_seen": 70507048, + "step": 121435 + }, + { + "epoch": 18.0875781948168, + "grad_norm": 0.059326171875, + "learning_rate": 0.0008279699867637091, + "loss": 0.781, + "num_input_tokens_seen": 70509896, + "step": 121440 + }, + { + "epoch": 18.08832290735776, + "grad_norm": 0.048095703125, + "learning_rate": 0.0008273313186104924, + "loss": 0.8076, + "num_input_tokens_seen": 70513128, + "step": 121445 + }, + { + "epoch": 18.08906761989872, + "grad_norm": 0.060302734375, + "learning_rate": 0.000826692889890106, + "loss": 0.781, + "num_input_tokens_seen": 70516264, + "step": 121450 + }, + { + "epoch": 18.089812332439678, + "grad_norm": 0.060791015625, + "learning_rate": 0.0008260547006133411, + "loss": 0.7835, + "num_input_tokens_seen": 70519240, + "step": 121455 + }, + { + "epoch": 18.09055704498064, + "grad_norm": 0.04833984375, + "learning_rate": 0.000825416750790976, + "loss": 0.7798, + "num_input_tokens_seen": 70522344, + "step": 121460 + }, + { + "epoch": 18.091301757521595, + "grad_norm": 0.06494140625, + "learning_rate": 0.0008247790404337885, + "loss": 0.804, + "num_input_tokens_seen": 70525000, + "step": 121465 + }, + { + "epoch": 18.092046470062556, + "grad_norm": 0.03759765625, + "learning_rate": 0.0008241415695525516, + "loss": 0.7851, + "num_input_tokens_seen": 70527976, + "step": 121470 + }, + { + "epoch": 18.092791182603516, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008235043381580337, + "loss": 0.8025, + "num_input_tokens_seen": 70530792, + "step": 121475 + }, + { + "epoch": 18.093535895144473, + "grad_norm": 0.040283203125, + "learning_rate": 0.0008228673462610025, + "loss": 0.7814, + "num_input_tokens_seen": 70533736, + "step": 121480 + }, + { + "epoch": 18.094280607685434, + "grad_norm": 0.046142578125, + "learning_rate": 0.000822230593872218, + "loss": 0.7854, + "num_input_tokens_seen": 70536520, + "step": 121485 + }, + { + "epoch": 18.09502532022639, + "grad_norm": 0.028076171875, + "learning_rate": 0.0008215940810024397, + "loss": 0.79, + "num_input_tokens_seen": 70539368, + "step": 121490 + }, + { + "epoch": 18.09577003276735, + "grad_norm": 0.037109375, + "learning_rate": 0.0008209578076624174, + "loss": 0.8093, + "num_input_tokens_seen": 70542344, + "step": 121495 + }, + { + "epoch": 18.096514745308312, + "grad_norm": 0.373046875, + "learning_rate": 0.0008203217738629026, + "loss": 0.8164, + "num_input_tokens_seen": 70545384, + "step": 121500 + }, + { + "epoch": 18.09725945784927, + "grad_norm": 0.0263671875, + "learning_rate": 0.00081968597961464, + "loss": 0.8016, + "num_input_tokens_seen": 70548616, + "step": 121505 + }, + { + "epoch": 18.09800417039023, + "grad_norm": 0.057861328125, + "learning_rate": 0.0008190504249283692, + "loss": 0.8021, + "num_input_tokens_seen": 70552040, + "step": 121510 + }, + { + "epoch": 18.09874888293119, + "grad_norm": 0.04150390625, + "learning_rate": 0.00081841510981483, + "loss": 0.7985, + "num_input_tokens_seen": 70555240, + "step": 121515 + }, + { + "epoch": 18.099493595472147, + "grad_norm": 0.05615234375, + "learning_rate": 0.0008177800342847524, + "loss": 0.8036, + "num_input_tokens_seen": 70557768, + "step": 121520 + }, + { + "epoch": 18.100238308013108, + "grad_norm": 0.05224609375, + "learning_rate": 0.0008171451983488691, + "loss": 0.8058, + "num_input_tokens_seen": 70560744, + "step": 121525 + }, + { + "epoch": 18.100983020554064, + "grad_norm": 0.042236328125, + "learning_rate": 0.0008165106020179035, + "loss": 0.8213, + "num_input_tokens_seen": 70563720, + "step": 121530 + }, + { + "epoch": 18.101727733095025, + "grad_norm": 0.049560546875, + "learning_rate": 0.0008158762453025752, + "loss": 0.8032, + "num_input_tokens_seen": 70566664, + "step": 121535 + }, + { + "epoch": 18.102472445635986, + "grad_norm": 0.03564453125, + "learning_rate": 0.0008152421282136024, + "loss": 0.7817, + "num_input_tokens_seen": 70569416, + "step": 121540 + }, + { + "epoch": 18.103217158176943, + "grad_norm": 0.053955078125, + "learning_rate": 0.0008146082507616931, + "loss": 0.804, + "num_input_tokens_seen": 70572296, + "step": 121545 + }, + { + "epoch": 18.103961870717903, + "grad_norm": 0.060791015625, + "learning_rate": 0.0008139746129575637, + "loss": 0.7706, + "num_input_tokens_seen": 70575080, + "step": 121550 + }, + { + "epoch": 18.104706583258864, + "grad_norm": 0.037841796875, + "learning_rate": 0.0008133412148119123, + "loss": 0.8163, + "num_input_tokens_seen": 70577832, + "step": 121555 + }, + { + "epoch": 18.10545129579982, + "grad_norm": 0.039306640625, + "learning_rate": 0.0008127080563354455, + "loss": 0.7882, + "num_input_tokens_seen": 70580680, + "step": 121560 + }, + { + "epoch": 18.10619600834078, + "grad_norm": 0.052490234375, + "learning_rate": 0.0008120751375388546, + "loss": 0.7895, + "num_input_tokens_seen": 70583432, + "step": 121565 + }, + { + "epoch": 18.106940720881738, + "grad_norm": 0.044921875, + "learning_rate": 0.0008114424584328344, + "loss": 0.82, + "num_input_tokens_seen": 70586408, + "step": 121570 + }, + { + "epoch": 18.1076854334227, + "grad_norm": 0.057861328125, + "learning_rate": 0.0008108100190280748, + "loss": 0.8062, + "num_input_tokens_seen": 70589032, + "step": 121575 + }, + { + "epoch": 18.10843014596366, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0008101778193352571, + "loss": 0.7962, + "num_input_tokens_seen": 70591528, + "step": 121580 + }, + { + "epoch": 18.109174858504616, + "grad_norm": 0.02880859375, + "learning_rate": 0.0008095458593650645, + "loss": 0.7973, + "num_input_tokens_seen": 70594280, + "step": 121585 + }, + { + "epoch": 18.109919571045577, + "grad_norm": 0.0419921875, + "learning_rate": 0.0008089141391281701, + "loss": 0.8011, + "num_input_tokens_seen": 70597672, + "step": 121590 + }, + { + "epoch": 18.110664283586537, + "grad_norm": 0.056884765625, + "learning_rate": 0.0008082826586352504, + "loss": 0.8044, + "num_input_tokens_seen": 70600616, + "step": 121595 + }, + { + "epoch": 18.111408996127494, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008076514178969717, + "loss": 0.8023, + "num_input_tokens_seen": 70603720, + "step": 121600 + }, + { + "epoch": 18.112153708668455, + "grad_norm": 0.03759765625, + "learning_rate": 0.0008070204169239975, + "loss": 0.8031, + "num_input_tokens_seen": 70606472, + "step": 121605 + }, + { + "epoch": 18.11289842120941, + "grad_norm": 0.07568359375, + "learning_rate": 0.0008063896557269856, + "loss": 0.7921, + "num_input_tokens_seen": 70609448, + "step": 121610 + }, + { + "epoch": 18.113643133750372, + "grad_norm": 0.06787109375, + "learning_rate": 0.0008057591343165976, + "loss": 0.8021, + "num_input_tokens_seen": 70612232, + "step": 121615 + }, + { + "epoch": 18.114387846291333, + "grad_norm": 0.053955078125, + "learning_rate": 0.0008051288527034816, + "loss": 0.8178, + "num_input_tokens_seen": 70615336, + "step": 121620 + }, + { + "epoch": 18.11513255883229, + "grad_norm": 0.05712890625, + "learning_rate": 0.0008044988108982859, + "loss": 0.7994, + "num_input_tokens_seen": 70618184, + "step": 121625 + }, + { + "epoch": 18.11587727137325, + "grad_norm": 0.06298828125, + "learning_rate": 0.0008038690089116568, + "loss": 0.798, + "num_input_tokens_seen": 70621064, + "step": 121630 + }, + { + "epoch": 18.11662198391421, + "grad_norm": 0.0517578125, + "learning_rate": 0.0008032394467542292, + "loss": 0.7935, + "num_input_tokens_seen": 70623944, + "step": 121635 + }, + { + "epoch": 18.117366696455168, + "grad_norm": 0.04150390625, + "learning_rate": 0.0008026101244366463, + "loss": 0.7979, + "num_input_tokens_seen": 70626696, + "step": 121640 + }, + { + "epoch": 18.11811140899613, + "grad_norm": 0.0908203125, + "learning_rate": 0.0008019810419695328, + "loss": 0.8065, + "num_input_tokens_seen": 70629672, + "step": 121645 + }, + { + "epoch": 18.118856121537085, + "grad_norm": 0.040283203125, + "learning_rate": 0.0008013521993635202, + "loss": 0.8321, + "num_input_tokens_seen": 70632456, + "step": 121650 + }, + { + "epoch": 18.119600834078046, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0008007235966292336, + "loss": 0.7904, + "num_input_tokens_seen": 70635240, + "step": 121655 + }, + { + "epoch": 18.120345546619006, + "grad_norm": 0.0400390625, + "learning_rate": 0.0008000952337772893, + "loss": 0.8115, + "num_input_tokens_seen": 70638248, + "step": 121660 + }, + { + "epoch": 18.121090259159963, + "grad_norm": 0.05322265625, + "learning_rate": 0.0007994671108183037, + "loss": 0.7829, + "num_input_tokens_seen": 70641320, + "step": 121665 + }, + { + "epoch": 18.121834971700924, + "grad_norm": 0.0537109375, + "learning_rate": 0.0007988392277628852, + "loss": 0.7757, + "num_input_tokens_seen": 70644168, + "step": 121670 + }, + { + "epoch": 18.12257968424188, + "grad_norm": 0.036376953125, + "learning_rate": 0.0007982115846216486, + "loss": 0.8022, + "num_input_tokens_seen": 70646984, + "step": 121675 + }, + { + "epoch": 18.12332439678284, + "grad_norm": 0.0419921875, + "learning_rate": 0.0007975841814051904, + "loss": 0.8082, + "num_input_tokens_seen": 70649832, + "step": 121680 + }, + { + "epoch": 18.124069109323802, + "grad_norm": 0.043212890625, + "learning_rate": 0.0007969570181241153, + "loss": 0.7657, + "num_input_tokens_seen": 70652584, + "step": 121685 + }, + { + "epoch": 18.12481382186476, + "grad_norm": 0.07958984375, + "learning_rate": 0.0007963300947890134, + "loss": 0.7917, + "num_input_tokens_seen": 70655176, + "step": 121690 + }, + { + "epoch": 18.12555853440572, + "grad_norm": 0.04345703125, + "learning_rate": 0.000795703411410481, + "loss": 0.7999, + "num_input_tokens_seen": 70657800, + "step": 121695 + }, + { + "epoch": 18.12630324694668, + "grad_norm": 0.0537109375, + "learning_rate": 0.0007950769679991015, + "loss": 0.7984, + "num_input_tokens_seen": 70660712, + "step": 121700 + }, + { + "epoch": 18.127047959487637, + "grad_norm": 0.056640625, + "learning_rate": 0.0007944507645654597, + "loss": 0.7926, + "num_input_tokens_seen": 70663528, + "step": 121705 + }, + { + "epoch": 18.127792672028598, + "grad_norm": 0.0498046875, + "learning_rate": 0.0007938248011201354, + "loss": 0.7912, + "num_input_tokens_seen": 70666376, + "step": 121710 + }, + { + "epoch": 18.128537384569555, + "grad_norm": 0.0244140625, + "learning_rate": 0.0007931990776737002, + "loss": 0.7897, + "num_input_tokens_seen": 70669288, + "step": 121715 + }, + { + "epoch": 18.129282097110515, + "grad_norm": 0.0625, + "learning_rate": 0.000792573594236729, + "loss": 0.8104, + "num_input_tokens_seen": 70672232, + "step": 121720 + }, + { + "epoch": 18.130026809651476, + "grad_norm": 0.043701171875, + "learning_rate": 0.0007919483508197883, + "loss": 0.7839, + "num_input_tokens_seen": 70675304, + "step": 121725 + }, + { + "epoch": 18.130771522192433, + "grad_norm": 0.05224609375, + "learning_rate": 0.0007913233474334397, + "loss": 0.7879, + "num_input_tokens_seen": 70677992, + "step": 121730 + }, + { + "epoch": 18.131516234733393, + "grad_norm": 0.06640625, + "learning_rate": 0.000790698584088238, + "loss": 0.7816, + "num_input_tokens_seen": 70681096, + "step": 121735 + }, + { + "epoch": 18.132260947274354, + "grad_norm": 0.03369140625, + "learning_rate": 0.0007900740607947465, + "loss": 0.7971, + "num_input_tokens_seen": 70683816, + "step": 121740 + }, + { + "epoch": 18.13300565981531, + "grad_norm": 0.049072265625, + "learning_rate": 0.00078944977756351, + "loss": 0.7873, + "num_input_tokens_seen": 70686664, + "step": 121745 + }, + { + "epoch": 18.13375037235627, + "grad_norm": 0.047607421875, + "learning_rate": 0.0007888257344050753, + "loss": 0.7814, + "num_input_tokens_seen": 70689480, + "step": 121750 + }, + { + "epoch": 18.134495084897228, + "grad_norm": 0.03955078125, + "learning_rate": 0.0007882019313299887, + "loss": 0.8025, + "num_input_tokens_seen": 70692360, + "step": 121755 + }, + { + "epoch": 18.13523979743819, + "grad_norm": 0.0439453125, + "learning_rate": 0.0007875783683487836, + "loss": 0.7756, + "num_input_tokens_seen": 70695080, + "step": 121760 + }, + { + "epoch": 18.13598450997915, + "grad_norm": 0.0400390625, + "learning_rate": 0.0007869550454719981, + "loss": 0.8419, + "num_input_tokens_seen": 70697960, + "step": 121765 + }, + { + "epoch": 18.136729222520106, + "grad_norm": 0.09521484375, + "learning_rate": 0.0007863319627101605, + "loss": 0.7745, + "num_input_tokens_seen": 70700808, + "step": 121770 + }, + { + "epoch": 18.137473935061067, + "grad_norm": 0.046875, + "learning_rate": 0.0007857091200738008, + "loss": 0.7905, + "num_input_tokens_seen": 70703560, + "step": 121775 + }, + { + "epoch": 18.138218647602027, + "grad_norm": 0.043701171875, + "learning_rate": 0.0007850865175734389, + "loss": 0.8249, + "num_input_tokens_seen": 70706376, + "step": 121780 + }, + { + "epoch": 18.138963360142984, + "grad_norm": 0.035888671875, + "learning_rate": 0.0007844641552195897, + "loss": 0.7884, + "num_input_tokens_seen": 70709160, + "step": 121785 + }, + { + "epoch": 18.139708072683945, + "grad_norm": 0.056396484375, + "learning_rate": 0.000783842033022773, + "loss": 0.773, + "num_input_tokens_seen": 70712168, + "step": 121790 + }, + { + "epoch": 18.140452785224902, + "grad_norm": 0.055419921875, + "learning_rate": 0.0007832201509934972, + "loss": 0.7819, + "num_input_tokens_seen": 70715048, + "step": 121795 + }, + { + "epoch": 18.141197497765862, + "grad_norm": 0.041015625, + "learning_rate": 0.0007825985091422671, + "loss": 0.7876, + "num_input_tokens_seen": 70717960, + "step": 121800 + }, + { + "epoch": 18.141942210306823, + "grad_norm": 0.04248046875, + "learning_rate": 0.0007819771074795827, + "loss": 0.7985, + "num_input_tokens_seen": 70720776, + "step": 121805 + }, + { + "epoch": 18.14268692284778, + "grad_norm": 0.03759765625, + "learning_rate": 0.0007813559460159475, + "loss": 0.7744, + "num_input_tokens_seen": 70723816, + "step": 121810 + }, + { + "epoch": 18.14343163538874, + "grad_norm": 0.048095703125, + "learning_rate": 0.0007807350247618511, + "loss": 0.7833, + "num_input_tokens_seen": 70726760, + "step": 121815 + }, + { + "epoch": 18.1441763479297, + "grad_norm": 0.04443359375, + "learning_rate": 0.0007801143437277852, + "loss": 0.7961, + "num_input_tokens_seen": 70729992, + "step": 121820 + }, + { + "epoch": 18.144921060470658, + "grad_norm": 0.050537109375, + "learning_rate": 0.0007794939029242348, + "loss": 0.8, + "num_input_tokens_seen": 70732744, + "step": 121825 + }, + { + "epoch": 18.14566577301162, + "grad_norm": 0.058837890625, + "learning_rate": 0.0007788737023616815, + "loss": 0.804, + "num_input_tokens_seen": 70735784, + "step": 121830 + }, + { + "epoch": 18.146410485552575, + "grad_norm": 0.0390625, + "learning_rate": 0.0007782537420506036, + "loss": 0.7976, + "num_input_tokens_seen": 70738792, + "step": 121835 + }, + { + "epoch": 18.147155198093536, + "grad_norm": 0.07421875, + "learning_rate": 0.0007776340220014726, + "loss": 0.7933, + "num_input_tokens_seen": 70741800, + "step": 121840 + }, + { + "epoch": 18.147899910634496, + "grad_norm": 0.050537109375, + "learning_rate": 0.0007770145422247621, + "loss": 0.7968, + "num_input_tokens_seen": 70744776, + "step": 121845 + }, + { + "epoch": 18.148644623175453, + "grad_norm": 0.0546875, + "learning_rate": 0.0007763953027309333, + "loss": 0.7905, + "num_input_tokens_seen": 70748072, + "step": 121850 + }, + { + "epoch": 18.149389335716414, + "grad_norm": 0.059814453125, + "learning_rate": 0.0007757763035304532, + "loss": 0.8007, + "num_input_tokens_seen": 70750984, + "step": 121855 + }, + { + "epoch": 18.15013404825737, + "grad_norm": 0.1318359375, + "learning_rate": 0.0007751575446337733, + "loss": 0.8126, + "num_input_tokens_seen": 70753896, + "step": 121860 + }, + { + "epoch": 18.15087876079833, + "grad_norm": 0.05126953125, + "learning_rate": 0.0007745390260513502, + "loss": 0.7873, + "num_input_tokens_seen": 70756968, + "step": 121865 + }, + { + "epoch": 18.151623473339292, + "grad_norm": 0.054931640625, + "learning_rate": 0.0007739207477936321, + "loss": 0.8033, + "num_input_tokens_seen": 70759912, + "step": 121870 + }, + { + "epoch": 18.15236818588025, + "grad_norm": 0.041015625, + "learning_rate": 0.0007733027098710627, + "loss": 0.7854, + "num_input_tokens_seen": 70763016, + "step": 121875 + }, + { + "epoch": 18.15311289842121, + "grad_norm": 0.049072265625, + "learning_rate": 0.0007726849122940848, + "loss": 0.7975, + "num_input_tokens_seen": 70766184, + "step": 121880 + }, + { + "epoch": 18.15385761096217, + "grad_norm": 0.034912109375, + "learning_rate": 0.0007720673550731355, + "loss": 0.7855, + "num_input_tokens_seen": 70768968, + "step": 121885 + }, + { + "epoch": 18.154602323503127, + "grad_norm": 0.06640625, + "learning_rate": 0.0007714500382186479, + "loss": 0.8075, + "num_input_tokens_seen": 70771784, + "step": 121890 + }, + { + "epoch": 18.155347036044088, + "grad_norm": 0.06103515625, + "learning_rate": 0.0007708329617410502, + "loss": 0.7961, + "num_input_tokens_seen": 70774440, + "step": 121895 + }, + { + "epoch": 18.156091748585045, + "grad_norm": 0.04248046875, + "learning_rate": 0.0007702161256507694, + "loss": 0.8188, + "num_input_tokens_seen": 70777352, + "step": 121900 + }, + { + "epoch": 18.156836461126005, + "grad_norm": 0.03759765625, + "learning_rate": 0.0007695995299582236, + "loss": 0.8126, + "num_input_tokens_seen": 70780168, + "step": 121905 + }, + { + "epoch": 18.157581173666966, + "grad_norm": 0.0439453125, + "learning_rate": 0.0007689831746738295, + "loss": 0.7963, + "num_input_tokens_seen": 70782792, + "step": 121910 + }, + { + "epoch": 18.158325886207923, + "grad_norm": 0.055908203125, + "learning_rate": 0.0007683670598080022, + "loss": 0.8005, + "num_input_tokens_seen": 70785704, + "step": 121915 + }, + { + "epoch": 18.159070598748883, + "grad_norm": 0.1416015625, + "learning_rate": 0.0007677511853711482, + "loss": 0.7988, + "num_input_tokens_seen": 70788552, + "step": 121920 + }, + { + "epoch": 18.159815311289844, + "grad_norm": 0.047119140625, + "learning_rate": 0.0007671355513736743, + "loss": 0.7906, + "num_input_tokens_seen": 70791048, + "step": 121925 + }, + { + "epoch": 18.1605600238308, + "grad_norm": 0.05859375, + "learning_rate": 0.0007665201578259789, + "loss": 0.8025, + "num_input_tokens_seen": 70793896, + "step": 121930 + }, + { + "epoch": 18.16130473637176, + "grad_norm": 0.044189453125, + "learning_rate": 0.0007659050047384602, + "loss": 0.7825, + "num_input_tokens_seen": 70797000, + "step": 121935 + }, + { + "epoch": 18.162049448912718, + "grad_norm": 0.05908203125, + "learning_rate": 0.0007652900921215099, + "loss": 0.7987, + "num_input_tokens_seen": 70799880, + "step": 121940 + }, + { + "epoch": 18.16279416145368, + "grad_norm": 0.03955078125, + "learning_rate": 0.0007646754199855132, + "loss": 0.7806, + "num_input_tokens_seen": 70802856, + "step": 121945 + }, + { + "epoch": 18.16353887399464, + "grad_norm": 0.10986328125, + "learning_rate": 0.0007640609883408583, + "loss": 0.7979, + "num_input_tokens_seen": 70805928, + "step": 121950 + }, + { + "epoch": 18.164283586535596, + "grad_norm": 0.02734375, + "learning_rate": 0.0007634467971979236, + "loss": 0.8036, + "num_input_tokens_seen": 70808712, + "step": 121955 + }, + { + "epoch": 18.165028299076557, + "grad_norm": 0.04296875, + "learning_rate": 0.0007628328465670874, + "loss": 0.8145, + "num_input_tokens_seen": 70811784, + "step": 121960 + }, + { + "epoch": 18.165773011617517, + "grad_norm": 0.03955078125, + "learning_rate": 0.0007622191364587183, + "loss": 0.7996, + "num_input_tokens_seen": 70815016, + "step": 121965 + }, + { + "epoch": 18.166517724158474, + "grad_norm": 0.04150390625, + "learning_rate": 0.0007616056668831877, + "loss": 0.8186, + "num_input_tokens_seen": 70818216, + "step": 121970 + }, + { + "epoch": 18.167262436699435, + "grad_norm": 0.0419921875, + "learning_rate": 0.0007609924378508575, + "loss": 0.8183, + "num_input_tokens_seen": 70821160, + "step": 121975 + }, + { + "epoch": 18.168007149240392, + "grad_norm": 0.0703125, + "learning_rate": 0.0007603794493720894, + "loss": 0.8003, + "num_input_tokens_seen": 70824232, + "step": 121980 + }, + { + "epoch": 18.168751861781352, + "grad_norm": 0.043212890625, + "learning_rate": 0.0007597667014572367, + "loss": 0.7978, + "num_input_tokens_seen": 70827240, + "step": 121985 + }, + { + "epoch": 18.169496574322313, + "grad_norm": 0.365234375, + "learning_rate": 0.0007591541941166546, + "loss": 0.8277, + "num_input_tokens_seen": 70830344, + "step": 121990 + }, + { + "epoch": 18.17024128686327, + "grad_norm": 0.05322265625, + "learning_rate": 0.000758541927360688, + "loss": 0.7959, + "num_input_tokens_seen": 70833224, + "step": 121995 + }, + { + "epoch": 18.17098599940423, + "grad_norm": 0.0299072265625, + "learning_rate": 0.0007579299011996788, + "loss": 0.7976, + "num_input_tokens_seen": 70835880, + "step": 122000 + }, + { + "epoch": 18.171730711945187, + "grad_norm": 0.05078125, + "learning_rate": 0.0007573181156439701, + "loss": 0.7926, + "num_input_tokens_seen": 70839016, + "step": 122005 + }, + { + "epoch": 18.172475424486148, + "grad_norm": 0.04638671875, + "learning_rate": 0.0007567065707038956, + "loss": 0.7939, + "num_input_tokens_seen": 70842120, + "step": 122010 + }, + { + "epoch": 18.17322013702711, + "grad_norm": 0.0390625, + "learning_rate": 0.0007560952663897885, + "loss": 0.8031, + "num_input_tokens_seen": 70845672, + "step": 122015 + }, + { + "epoch": 18.173964849568065, + "grad_norm": 0.07080078125, + "learning_rate": 0.0007554842027119756, + "loss": 0.7931, + "num_input_tokens_seen": 70848808, + "step": 122020 + }, + { + "epoch": 18.174709562109026, + "grad_norm": 0.06689453125, + "learning_rate": 0.0007548733796807771, + "loss": 0.7838, + "num_input_tokens_seen": 70851560, + "step": 122025 + }, + { + "epoch": 18.175454274649987, + "grad_norm": 0.039306640625, + "learning_rate": 0.0007542627973065163, + "loss": 0.7966, + "num_input_tokens_seen": 70854472, + "step": 122030 + }, + { + "epoch": 18.176198987190944, + "grad_norm": 0.0810546875, + "learning_rate": 0.0007536524555995066, + "loss": 0.8137, + "num_input_tokens_seen": 70857224, + "step": 122035 + }, + { + "epoch": 18.176943699731904, + "grad_norm": 0.0439453125, + "learning_rate": 0.0007530423545700598, + "loss": 0.7921, + "num_input_tokens_seen": 70859784, + "step": 122040 + }, + { + "epoch": 18.17768841227286, + "grad_norm": 0.095703125, + "learning_rate": 0.000752432494228481, + "loss": 0.7883, + "num_input_tokens_seen": 70862792, + "step": 122045 + }, + { + "epoch": 18.17843312481382, + "grad_norm": 0.041748046875, + "learning_rate": 0.0007518228745850769, + "loss": 0.793, + "num_input_tokens_seen": 70865544, + "step": 122050 + }, + { + "epoch": 18.179177837354782, + "grad_norm": 0.027099609375, + "learning_rate": 0.0007512134956501442, + "loss": 0.8375, + "num_input_tokens_seen": 70868328, + "step": 122055 + }, + { + "epoch": 18.17992254989574, + "grad_norm": 0.03173828125, + "learning_rate": 0.0007506043574339765, + "loss": 0.7905, + "num_input_tokens_seen": 70871464, + "step": 122060 + }, + { + "epoch": 18.1806672624367, + "grad_norm": 0.04638671875, + "learning_rate": 0.0007499954599468672, + "loss": 0.8025, + "num_input_tokens_seen": 70874440, + "step": 122065 + }, + { + "epoch": 18.18141197497766, + "grad_norm": 0.07080078125, + "learning_rate": 0.0007493868031990979, + "loss": 0.7877, + "num_input_tokens_seen": 70877416, + "step": 122070 + }, + { + "epoch": 18.182156687518617, + "grad_norm": 0.048095703125, + "learning_rate": 0.000748778387200959, + "loss": 0.7963, + "num_input_tokens_seen": 70880680, + "step": 122075 + }, + { + "epoch": 18.182901400059578, + "grad_norm": 0.043701171875, + "learning_rate": 0.0007481702119627203, + "loss": 0.7845, + "num_input_tokens_seen": 70883464, + "step": 122080 + }, + { + "epoch": 18.183646112600535, + "grad_norm": 0.056396484375, + "learning_rate": 0.0007475622774946655, + "loss": 0.7918, + "num_input_tokens_seen": 70886536, + "step": 122085 + }, + { + "epoch": 18.184390825141495, + "grad_norm": 0.046142578125, + "learning_rate": 0.0007469545838070579, + "loss": 0.8063, + "num_input_tokens_seen": 70889704, + "step": 122090 + }, + { + "epoch": 18.185135537682456, + "grad_norm": 0.043212890625, + "learning_rate": 0.0007463471309101676, + "loss": 0.8023, + "num_input_tokens_seen": 70892776, + "step": 122095 + }, + { + "epoch": 18.185880250223413, + "grad_norm": 0.024169921875, + "learning_rate": 0.0007457399188142566, + "loss": 0.813, + "num_input_tokens_seen": 70895752, + "step": 122100 + }, + { + "epoch": 18.186624962764373, + "grad_norm": 0.05419921875, + "learning_rate": 0.0007451329475295798, + "loss": 0.7963, + "num_input_tokens_seen": 70898632, + "step": 122105 + }, + { + "epoch": 18.187369675305334, + "grad_norm": 0.0654296875, + "learning_rate": 0.0007445262170663957, + "loss": 0.8018, + "num_input_tokens_seen": 70901384, + "step": 122110 + }, + { + "epoch": 18.18811438784629, + "grad_norm": 0.037109375, + "learning_rate": 0.0007439197274349496, + "loss": 0.7867, + "num_input_tokens_seen": 70904232, + "step": 122115 + }, + { + "epoch": 18.18885910038725, + "grad_norm": 0.03759765625, + "learning_rate": 0.0007433134786454948, + "loss": 0.7848, + "num_input_tokens_seen": 70907016, + "step": 122120 + }, + { + "epoch": 18.18960381292821, + "grad_norm": 0.02880859375, + "learning_rate": 0.0007427074707082664, + "loss": 0.7884, + "num_input_tokens_seen": 70910024, + "step": 122125 + }, + { + "epoch": 18.19034852546917, + "grad_norm": 0.0947265625, + "learning_rate": 0.0007421017036335065, + "loss": 0.7769, + "num_input_tokens_seen": 70913000, + "step": 122130 + }, + { + "epoch": 18.19109323801013, + "grad_norm": 0.0830078125, + "learning_rate": 0.0007414961774314449, + "loss": 0.796, + "num_input_tokens_seen": 70915688, + "step": 122135 + }, + { + "epoch": 18.191837950551086, + "grad_norm": 0.08740234375, + "learning_rate": 0.000740890892112317, + "loss": 0.8308, + "num_input_tokens_seen": 70918376, + "step": 122140 + }, + { + "epoch": 18.192582663092047, + "grad_norm": 0.060302734375, + "learning_rate": 0.0007402858476863427, + "loss": 0.7883, + "num_input_tokens_seen": 70921448, + "step": 122145 + }, + { + "epoch": 18.193327375633007, + "grad_norm": 0.040771484375, + "learning_rate": 0.0007396810441637457, + "loss": 0.7946, + "num_input_tokens_seen": 70924360, + "step": 122150 + }, + { + "epoch": 18.194072088173964, + "grad_norm": 0.036865234375, + "learning_rate": 0.0007390764815547446, + "loss": 0.7872, + "num_input_tokens_seen": 70927016, + "step": 122155 + }, + { + "epoch": 18.194816800714925, + "grad_norm": 0.05615234375, + "learning_rate": 0.000738472159869551, + "loss": 0.78, + "num_input_tokens_seen": 70929864, + "step": 122160 + }, + { + "epoch": 18.195561513255882, + "grad_norm": 0.04541015625, + "learning_rate": 0.0007378680791183783, + "loss": 0.8096, + "num_input_tokens_seen": 70932488, + "step": 122165 + }, + { + "epoch": 18.196306225796842, + "grad_norm": 0.06884765625, + "learning_rate": 0.000737264239311427, + "loss": 0.7852, + "num_input_tokens_seen": 70935560, + "step": 122170 + }, + { + "epoch": 18.197050938337803, + "grad_norm": 0.037109375, + "learning_rate": 0.0007366606404589004, + "loss": 0.8253, + "num_input_tokens_seen": 70938408, + "step": 122175 + }, + { + "epoch": 18.19779565087876, + "grad_norm": 0.04833984375, + "learning_rate": 0.0007360572825709971, + "loss": 0.7921, + "num_input_tokens_seen": 70941192, + "step": 122180 + }, + { + "epoch": 18.19854036341972, + "grad_norm": 0.05908203125, + "learning_rate": 0.0007354541656579072, + "loss": 0.7856, + "num_input_tokens_seen": 70943944, + "step": 122185 + }, + { + "epoch": 18.199285075960677, + "grad_norm": 0.045166015625, + "learning_rate": 0.0007348512897298226, + "loss": 0.7964, + "num_input_tokens_seen": 70946568, + "step": 122190 + }, + { + "epoch": 18.200029788501638, + "grad_norm": 0.02978515625, + "learning_rate": 0.0007342486547969268, + "loss": 0.8049, + "num_input_tokens_seen": 70949160, + "step": 122195 + }, + { + "epoch": 18.2007745010426, + "grad_norm": 0.051513671875, + "learning_rate": 0.0007336462608694016, + "loss": 0.7994, + "num_input_tokens_seen": 70951816, + "step": 122200 + }, + { + "epoch": 18.201519213583556, + "grad_norm": 0.03759765625, + "learning_rate": 0.0007330441079574206, + "loss": 0.802, + "num_input_tokens_seen": 70954856, + "step": 122205 + }, + { + "epoch": 18.202263926124516, + "grad_norm": 0.051513671875, + "learning_rate": 0.0007324421960711608, + "loss": 0.7934, + "num_input_tokens_seen": 70957640, + "step": 122210 + }, + { + "epoch": 18.203008638665477, + "grad_norm": 0.03759765625, + "learning_rate": 0.0007318405252207887, + "loss": 0.8037, + "num_input_tokens_seen": 70960424, + "step": 122215 + }, + { + "epoch": 18.203753351206434, + "grad_norm": 0.0751953125, + "learning_rate": 0.0007312390954164699, + "loss": 0.7951, + "num_input_tokens_seen": 70963592, + "step": 122220 + }, + { + "epoch": 18.204498063747394, + "grad_norm": 0.05517578125, + "learning_rate": 0.0007306379066683643, + "loss": 0.802, + "num_input_tokens_seen": 70966344, + "step": 122225 + }, + { + "epoch": 18.20524277628835, + "grad_norm": 0.043701171875, + "learning_rate": 0.0007300369589866273, + "loss": 0.7898, + "num_input_tokens_seen": 70969224, + "step": 122230 + }, + { + "epoch": 18.20598748882931, + "grad_norm": 0.0546875, + "learning_rate": 0.0007294362523814158, + "loss": 0.8031, + "num_input_tokens_seen": 70972072, + "step": 122235 + }, + { + "epoch": 18.206732201370272, + "grad_norm": 0.04248046875, + "learning_rate": 0.0007288357868628714, + "loss": 0.7798, + "num_input_tokens_seen": 70974824, + "step": 122240 + }, + { + "epoch": 18.20747691391123, + "grad_norm": 0.04443359375, + "learning_rate": 0.000728235562441143, + "loss": 0.794, + "num_input_tokens_seen": 70977672, + "step": 122245 + }, + { + "epoch": 18.20822162645219, + "grad_norm": 0.048583984375, + "learning_rate": 0.0007276355791263688, + "loss": 0.7942, + "num_input_tokens_seen": 70980424, + "step": 122250 + }, + { + "epoch": 18.20896633899315, + "grad_norm": 0.055908203125, + "learning_rate": 0.0007270358369286877, + "loss": 0.7976, + "num_input_tokens_seen": 70983240, + "step": 122255 + }, + { + "epoch": 18.209711051534107, + "grad_norm": 0.038818359375, + "learning_rate": 0.0007264363358582298, + "loss": 0.7978, + "num_input_tokens_seen": 70986408, + "step": 122260 + }, + { + "epoch": 18.210455764075068, + "grad_norm": 0.046142578125, + "learning_rate": 0.000725837075925122, + "loss": 0.7752, + "num_input_tokens_seen": 70989224, + "step": 122265 + }, + { + "epoch": 18.211200476616025, + "grad_norm": 0.040283203125, + "learning_rate": 0.0007252380571394895, + "loss": 0.8167, + "num_input_tokens_seen": 70992264, + "step": 122270 + }, + { + "epoch": 18.211945189156985, + "grad_norm": 0.0498046875, + "learning_rate": 0.0007246392795114509, + "loss": 0.8009, + "num_input_tokens_seen": 70995176, + "step": 122275 + }, + { + "epoch": 18.212689901697946, + "grad_norm": 0.0322265625, + "learning_rate": 0.000724040743051123, + "loss": 0.8026, + "num_input_tokens_seen": 70997768, + "step": 122280 + }, + { + "epoch": 18.213434614238903, + "grad_norm": 0.044677734375, + "learning_rate": 0.000723442447768618, + "loss": 0.8117, + "num_input_tokens_seen": 71000712, + "step": 122285 + }, + { + "epoch": 18.214179326779863, + "grad_norm": 0.072265625, + "learning_rate": 0.0007228443936740425, + "loss": 0.7859, + "num_input_tokens_seen": 71003368, + "step": 122290 + }, + { + "epoch": 18.214924039320824, + "grad_norm": 0.05810546875, + "learning_rate": 0.0007222465807774985, + "loss": 0.7784, + "num_input_tokens_seen": 71006664, + "step": 122295 + }, + { + "epoch": 18.21566875186178, + "grad_norm": 0.0380859375, + "learning_rate": 0.0007216490090890898, + "loss": 0.804, + "num_input_tokens_seen": 71009544, + "step": 122300 + }, + { + "epoch": 18.21641346440274, + "grad_norm": 0.0439453125, + "learning_rate": 0.0007210516786189097, + "loss": 0.7805, + "num_input_tokens_seen": 71012232, + "step": 122305 + }, + { + "epoch": 18.2171581769437, + "grad_norm": 0.05322265625, + "learning_rate": 0.000720454589377047, + "loss": 0.7919, + "num_input_tokens_seen": 71015304, + "step": 122310 + }, + { + "epoch": 18.21790288948466, + "grad_norm": 0.039306640625, + "learning_rate": 0.0007198577413735918, + "loss": 0.8196, + "num_input_tokens_seen": 71018248, + "step": 122315 + }, + { + "epoch": 18.21864760202562, + "grad_norm": 0.048828125, + "learning_rate": 0.0007192611346186278, + "loss": 0.7877, + "num_input_tokens_seen": 71021192, + "step": 122320 + }, + { + "epoch": 18.219392314566576, + "grad_norm": 0.046630859375, + "learning_rate": 0.000718664769122232, + "loss": 0.8124, + "num_input_tokens_seen": 71023816, + "step": 122325 + }, + { + "epoch": 18.220137027107537, + "grad_norm": 0.031494140625, + "learning_rate": 0.0007180686448944779, + "loss": 0.7964, + "num_input_tokens_seen": 71026600, + "step": 122330 + }, + { + "epoch": 18.220881739648497, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0007174727619454407, + "loss": 0.8123, + "num_input_tokens_seen": 71029480, + "step": 122335 + }, + { + "epoch": 18.221626452189454, + "grad_norm": 0.039306640625, + "learning_rate": 0.0007168771202851825, + "loss": 0.7886, + "num_input_tokens_seen": 71032328, + "step": 122340 + }, + { + "epoch": 18.222371164730415, + "grad_norm": 0.036865234375, + "learning_rate": 0.0007162817199237703, + "loss": 0.8023, + "num_input_tokens_seen": 71035176, + "step": 122345 + }, + { + "epoch": 18.223115877271372, + "grad_norm": 0.08837890625, + "learning_rate": 0.0007156865608712625, + "loss": 0.8, + "num_input_tokens_seen": 71037992, + "step": 122350 + }, + { + "epoch": 18.223860589812332, + "grad_norm": 0.052978515625, + "learning_rate": 0.0007150916431377097, + "loss": 0.7912, + "num_input_tokens_seen": 71040872, + "step": 122355 + }, + { + "epoch": 18.224605302353293, + "grad_norm": 0.050048828125, + "learning_rate": 0.0007144969667331668, + "loss": 0.8177, + "num_input_tokens_seen": 71043912, + "step": 122360 + }, + { + "epoch": 18.22535001489425, + "grad_norm": 0.06884765625, + "learning_rate": 0.0007139025316676761, + "loss": 0.8157, + "num_input_tokens_seen": 71046888, + "step": 122365 + }, + { + "epoch": 18.22609472743521, + "grad_norm": 0.05419921875, + "learning_rate": 0.000713308337951286, + "loss": 0.795, + "num_input_tokens_seen": 71049544, + "step": 122370 + }, + { + "epoch": 18.226839439976168, + "grad_norm": 0.0400390625, + "learning_rate": 0.0007127143855940287, + "loss": 0.7944, + "num_input_tokens_seen": 71052328, + "step": 122375 + }, + { + "epoch": 18.227584152517128, + "grad_norm": 0.057373046875, + "learning_rate": 0.0007121206746059411, + "loss": 0.7896, + "num_input_tokens_seen": 71055176, + "step": 122380 + }, + { + "epoch": 18.22832886505809, + "grad_norm": 0.03955078125, + "learning_rate": 0.000711527204997055, + "loss": 0.7924, + "num_input_tokens_seen": 71058216, + "step": 122385 + }, + { + "epoch": 18.229073577599046, + "grad_norm": 0.038818359375, + "learning_rate": 0.0007109339767773942, + "loss": 0.794, + "num_input_tokens_seen": 71060936, + "step": 122390 + }, + { + "epoch": 18.229818290140006, + "grad_norm": 0.037109375, + "learning_rate": 0.0007103409899569807, + "loss": 0.7733, + "num_input_tokens_seen": 71063592, + "step": 122395 + }, + { + "epoch": 18.230563002680967, + "grad_norm": 0.037353515625, + "learning_rate": 0.0007097482445458313, + "loss": 0.7917, + "num_input_tokens_seen": 71066504, + "step": 122400 + }, + { + "epoch": 18.231307715221924, + "grad_norm": 0.09375, + "learning_rate": 0.0007091557405539634, + "loss": 0.7944, + "num_input_tokens_seen": 71069384, + "step": 122405 + }, + { + "epoch": 18.232052427762884, + "grad_norm": 0.029541015625, + "learning_rate": 0.0007085634779913819, + "loss": 0.7899, + "num_input_tokens_seen": 71072392, + "step": 122410 + }, + { + "epoch": 18.23279714030384, + "grad_norm": 0.052734375, + "learning_rate": 0.0007079714568680989, + "loss": 0.7974, + "num_input_tokens_seen": 71075496, + "step": 122415 + }, + { + "epoch": 18.2335418528448, + "grad_norm": 0.03125, + "learning_rate": 0.0007073796771941081, + "loss": 0.8099, + "num_input_tokens_seen": 71078216, + "step": 122420 + }, + { + "epoch": 18.234286565385762, + "grad_norm": 0.07958984375, + "learning_rate": 0.0007067881389794151, + "loss": 0.7981, + "num_input_tokens_seen": 71081576, + "step": 122425 + }, + { + "epoch": 18.23503127792672, + "grad_norm": 0.0595703125, + "learning_rate": 0.0007061968422340098, + "loss": 0.7824, + "num_input_tokens_seen": 71084264, + "step": 122430 + }, + { + "epoch": 18.23577599046768, + "grad_norm": 0.041015625, + "learning_rate": 0.0007056057869678778, + "loss": 0.7921, + "num_input_tokens_seen": 71086984, + "step": 122435 + }, + { + "epoch": 18.23652070300864, + "grad_norm": 0.053466796875, + "learning_rate": 0.000705014973191011, + "loss": 0.7935, + "num_input_tokens_seen": 71089736, + "step": 122440 + }, + { + "epoch": 18.237265415549597, + "grad_norm": 0.05322265625, + "learning_rate": 0.0007044244009133849, + "loss": 0.7944, + "num_input_tokens_seen": 71092776, + "step": 122445 + }, + { + "epoch": 18.238010128090558, + "grad_norm": 0.053955078125, + "learning_rate": 0.0007038340701449813, + "loss": 0.7833, + "num_input_tokens_seen": 71095592, + "step": 122450 + }, + { + "epoch": 18.238754840631515, + "grad_norm": 0.056640625, + "learning_rate": 0.0007032439808957707, + "loss": 0.7902, + "num_input_tokens_seen": 71098856, + "step": 122455 + }, + { + "epoch": 18.239499553172475, + "grad_norm": 0.03466796875, + "learning_rate": 0.0007026541331757219, + "loss": 0.7987, + "num_input_tokens_seen": 71101800, + "step": 122460 + }, + { + "epoch": 18.240244265713436, + "grad_norm": 0.035888671875, + "learning_rate": 0.0007020645269948017, + "loss": 0.8045, + "num_input_tokens_seen": 71104840, + "step": 122465 + }, + { + "epoch": 18.240988978254393, + "grad_norm": 0.0927734375, + "learning_rate": 0.0007014751623629673, + "loss": 0.7863, + "num_input_tokens_seen": 71107816, + "step": 122470 + }, + { + "epoch": 18.241733690795353, + "grad_norm": 0.1044921875, + "learning_rate": 0.0007008860392901789, + "loss": 0.7721, + "num_input_tokens_seen": 71110984, + "step": 122475 + }, + { + "epoch": 18.242478403336314, + "grad_norm": 0.039794921875, + "learning_rate": 0.0007002971577863869, + "loss": 0.807, + "num_input_tokens_seen": 71113800, + "step": 122480 + }, + { + "epoch": 18.24322311587727, + "grad_norm": 0.053955078125, + "learning_rate": 0.0006997085178615419, + "loss": 0.7847, + "num_input_tokens_seen": 71116776, + "step": 122485 + }, + { + "epoch": 18.24396782841823, + "grad_norm": 0.1708984375, + "learning_rate": 0.0006991201195255858, + "loss": 0.8072, + "num_input_tokens_seen": 71119848, + "step": 122490 + }, + { + "epoch": 18.24471254095919, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0006985319627884623, + "loss": 0.8087, + "num_input_tokens_seen": 71122504, + "step": 122495 + }, + { + "epoch": 18.24545725350015, + "grad_norm": 0.04296875, + "learning_rate": 0.0006979440476601051, + "loss": 0.8023, + "num_input_tokens_seen": 71125224, + "step": 122500 + }, + { + "epoch": 18.24620196604111, + "grad_norm": 0.040283203125, + "learning_rate": 0.0006973563741504479, + "loss": 0.8098, + "num_input_tokens_seen": 71127656, + "step": 122505 + }, + { + "epoch": 18.246946678582066, + "grad_norm": 0.050048828125, + "learning_rate": 0.0006967689422694178, + "loss": 0.7896, + "num_input_tokens_seen": 71130568, + "step": 122510 + }, + { + "epoch": 18.247691391123027, + "grad_norm": 0.05615234375, + "learning_rate": 0.0006961817520269386, + "loss": 0.7784, + "num_input_tokens_seen": 71133736, + "step": 122515 + }, + { + "epoch": 18.248436103663984, + "grad_norm": 0.052734375, + "learning_rate": 0.0006955948034329323, + "loss": 0.7862, + "num_input_tokens_seen": 71136968, + "step": 122520 + }, + { + "epoch": 18.249180816204944, + "grad_norm": 0.064453125, + "learning_rate": 0.0006950080964973143, + "loss": 0.7857, + "num_input_tokens_seen": 71139752, + "step": 122525 + }, + { + "epoch": 18.249925528745905, + "grad_norm": 0.03759765625, + "learning_rate": 0.0006944216312299933, + "loss": 0.7861, + "num_input_tokens_seen": 71142440, + "step": 122530 + }, + { + "epoch": 18.250670241286862, + "grad_norm": 0.05078125, + "learning_rate": 0.0006938354076408798, + "loss": 0.7807, + "num_input_tokens_seen": 71145224, + "step": 122535 + }, + { + "epoch": 18.251414953827823, + "grad_norm": 0.0380859375, + "learning_rate": 0.0006932494257398774, + "loss": 0.8178, + "num_input_tokens_seen": 71147944, + "step": 122540 + }, + { + "epoch": 18.252159666368783, + "grad_norm": 0.06494140625, + "learning_rate": 0.0006926636855368867, + "loss": 0.7825, + "num_input_tokens_seen": 71151016, + "step": 122545 + }, + { + "epoch": 18.25290437890974, + "grad_norm": 0.048095703125, + "learning_rate": 0.000692078187041798, + "loss": 0.8, + "num_input_tokens_seen": 71154184, + "step": 122550 + }, + { + "epoch": 18.2536490914507, + "grad_norm": 0.039306640625, + "learning_rate": 0.0006914929302645101, + "loss": 0.8047, + "num_input_tokens_seen": 71157064, + "step": 122555 + }, + { + "epoch": 18.254393803991658, + "grad_norm": 0.0546875, + "learning_rate": 0.0006909079152149034, + "loss": 0.7947, + "num_input_tokens_seen": 71160008, + "step": 122560 + }, + { + "epoch": 18.255138516532618, + "grad_norm": 0.02685546875, + "learning_rate": 0.0006903231419028666, + "loss": 0.8024, + "num_input_tokens_seen": 71162664, + "step": 122565 + }, + { + "epoch": 18.25588322907358, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006897386103382752, + "loss": 0.8057, + "num_input_tokens_seen": 71165640, + "step": 122570 + }, + { + "epoch": 18.256627941614536, + "grad_norm": 0.03857421875, + "learning_rate": 0.0006891543205310063, + "loss": 0.7894, + "num_input_tokens_seen": 71168328, + "step": 122575 + }, + { + "epoch": 18.257372654155496, + "grad_norm": 0.06396484375, + "learning_rate": 0.0006885702724909287, + "loss": 0.7841, + "num_input_tokens_seen": 71171304, + "step": 122580 + }, + { + "epoch": 18.258117366696457, + "grad_norm": 0.03857421875, + "learning_rate": 0.0006879864662279145, + "loss": 0.8044, + "num_input_tokens_seen": 71174408, + "step": 122585 + }, + { + "epoch": 18.258862079237414, + "grad_norm": 0.050048828125, + "learning_rate": 0.0006874029017518207, + "loss": 0.791, + "num_input_tokens_seen": 71177224, + "step": 122590 + }, + { + "epoch": 18.259606791778374, + "grad_norm": 0.046630859375, + "learning_rate": 0.0006868195790725095, + "loss": 0.7908, + "num_input_tokens_seen": 71180424, + "step": 122595 + }, + { + "epoch": 18.26035150431933, + "grad_norm": 0.0556640625, + "learning_rate": 0.0006862364981998331, + "loss": 0.8017, + "num_input_tokens_seen": 71183144, + "step": 122600 + }, + { + "epoch": 18.26109621686029, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006856536591436435, + "loss": 0.803, + "num_input_tokens_seen": 71186568, + "step": 122605 + }, + { + "epoch": 18.261840929401252, + "grad_norm": 0.0654296875, + "learning_rate": 0.0006850710619137862, + "loss": 0.7937, + "num_input_tokens_seen": 71189576, + "step": 122610 + }, + { + "epoch": 18.26258564194221, + "grad_norm": 0.04345703125, + "learning_rate": 0.0006844887065201033, + "loss": 0.7955, + "num_input_tokens_seen": 71192392, + "step": 122615 + }, + { + "epoch": 18.26333035448317, + "grad_norm": 0.036865234375, + "learning_rate": 0.000683906592972437, + "loss": 0.7863, + "num_input_tokens_seen": 71195400, + "step": 122620 + }, + { + "epoch": 18.26407506702413, + "grad_norm": 0.049072265625, + "learning_rate": 0.0006833247212806176, + "loss": 0.7926, + "num_input_tokens_seen": 71198344, + "step": 122625 + }, + { + "epoch": 18.264819779565087, + "grad_norm": 0.0556640625, + "learning_rate": 0.0006827430914544741, + "loss": 0.8075, + "num_input_tokens_seen": 71201192, + "step": 122630 + }, + { + "epoch": 18.265564492106048, + "grad_norm": 0.06884765625, + "learning_rate": 0.0006821617035038368, + "loss": 0.7876, + "num_input_tokens_seen": 71204040, + "step": 122635 + }, + { + "epoch": 18.266309204647005, + "grad_norm": 0.041748046875, + "learning_rate": 0.0006815805574385247, + "loss": 0.7988, + "num_input_tokens_seen": 71206792, + "step": 122640 + }, + { + "epoch": 18.267053917187965, + "grad_norm": 0.09814453125, + "learning_rate": 0.0006809996532683566, + "loss": 0.777, + "num_input_tokens_seen": 71210088, + "step": 122645 + }, + { + "epoch": 18.267798629728926, + "grad_norm": 0.040283203125, + "learning_rate": 0.0006804189910031477, + "loss": 0.7896, + "num_input_tokens_seen": 71213000, + "step": 122650 + }, + { + "epoch": 18.268543342269883, + "grad_norm": 0.042236328125, + "learning_rate": 0.0006798385706527054, + "loss": 0.8099, + "num_input_tokens_seen": 71215848, + "step": 122655 + }, + { + "epoch": 18.269288054810843, + "grad_norm": 0.046142578125, + "learning_rate": 0.0006792583922268336, + "loss": 0.7853, + "num_input_tokens_seen": 71218792, + "step": 122660 + }, + { + "epoch": 18.270032767351804, + "grad_norm": 0.0419921875, + "learning_rate": 0.0006786784557353393, + "loss": 0.8015, + "num_input_tokens_seen": 71221608, + "step": 122665 + }, + { + "epoch": 18.27077747989276, + "grad_norm": 0.058837890625, + "learning_rate": 0.0006780987611880163, + "loss": 0.81, + "num_input_tokens_seen": 71224520, + "step": 122670 + }, + { + "epoch": 18.27152219243372, + "grad_norm": 0.06005859375, + "learning_rate": 0.0006775193085946568, + "loss": 0.7957, + "num_input_tokens_seen": 71227624, + "step": 122675 + }, + { + "epoch": 18.27226690497468, + "grad_norm": 0.039306640625, + "learning_rate": 0.0006769400979650547, + "loss": 0.8114, + "num_input_tokens_seen": 71230408, + "step": 122680 + }, + { + "epoch": 18.27301161751564, + "grad_norm": 0.0322265625, + "learning_rate": 0.0006763611293089889, + "loss": 0.7951, + "num_input_tokens_seen": 71233128, + "step": 122685 + }, + { + "epoch": 18.2737563300566, + "grad_norm": 0.049072265625, + "learning_rate": 0.0006757824026362463, + "loss": 0.7897, + "num_input_tokens_seen": 71235848, + "step": 122690 + }, + { + "epoch": 18.274501042597556, + "grad_norm": 0.037841796875, + "learning_rate": 0.000675203917956601, + "loss": 0.8078, + "num_input_tokens_seen": 71238984, + "step": 122695 + }, + { + "epoch": 18.275245755138517, + "grad_norm": 0.0615234375, + "learning_rate": 0.0006746256752798268, + "loss": 0.8226, + "num_input_tokens_seen": 71241928, + "step": 122700 + }, + { + "epoch": 18.275990467679474, + "grad_norm": 0.046142578125, + "learning_rate": 0.0006740476746156926, + "loss": 0.7987, + "num_input_tokens_seen": 71245192, + "step": 122705 + }, + { + "epoch": 18.276735180220435, + "grad_norm": 0.0380859375, + "learning_rate": 0.0006734699159739604, + "loss": 0.7928, + "num_input_tokens_seen": 71247848, + "step": 122710 + }, + { + "epoch": 18.277479892761395, + "grad_norm": 0.03515625, + "learning_rate": 0.000672892399364396, + "loss": 0.7888, + "num_input_tokens_seen": 71250504, + "step": 122715 + }, + { + "epoch": 18.278224605302352, + "grad_norm": 0.0791015625, + "learning_rate": 0.0006723151247967529, + "loss": 0.7976, + "num_input_tokens_seen": 71253416, + "step": 122720 + }, + { + "epoch": 18.278969317843313, + "grad_norm": 0.050537109375, + "learning_rate": 0.0006717380922807836, + "loss": 0.7795, + "num_input_tokens_seen": 71256328, + "step": 122725 + }, + { + "epoch": 18.279714030384273, + "grad_norm": 0.06396484375, + "learning_rate": 0.0006711613018262336, + "loss": 0.7946, + "num_input_tokens_seen": 71259176, + "step": 122730 + }, + { + "epoch": 18.28045874292523, + "grad_norm": 0.03662109375, + "learning_rate": 0.0006705847534428532, + "loss": 0.7777, + "num_input_tokens_seen": 71262088, + "step": 122735 + }, + { + "epoch": 18.28120345546619, + "grad_norm": 0.040283203125, + "learning_rate": 0.0006700084471403766, + "loss": 0.7883, + "num_input_tokens_seen": 71265192, + "step": 122740 + }, + { + "epoch": 18.281948168007148, + "grad_norm": 0.035888671875, + "learning_rate": 0.0006694323829285458, + "loss": 0.8065, + "num_input_tokens_seen": 71267944, + "step": 122745 + }, + { + "epoch": 18.282692880548108, + "grad_norm": 0.0390625, + "learning_rate": 0.0006688565608170916, + "loss": 0.7926, + "num_input_tokens_seen": 71270984, + "step": 122750 + }, + { + "epoch": 18.28343759308907, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0006682809808157359, + "loss": 0.7951, + "num_input_tokens_seen": 71273736, + "step": 122755 + }, + { + "epoch": 18.284182305630026, + "grad_norm": 0.04150390625, + "learning_rate": 0.0006677056429342093, + "loss": 0.7909, + "num_input_tokens_seen": 71276808, + "step": 122760 + }, + { + "epoch": 18.284927018170986, + "grad_norm": 0.041015625, + "learning_rate": 0.0006671305471822275, + "loss": 0.7899, + "num_input_tokens_seen": 71279496, + "step": 122765 + }, + { + "epoch": 18.285671730711947, + "grad_norm": 0.0625, + "learning_rate": 0.0006665556935695093, + "loss": 0.7776, + "num_input_tokens_seen": 71282408, + "step": 122770 + }, + { + "epoch": 18.286416443252904, + "grad_norm": 0.035400390625, + "learning_rate": 0.0006659810821057638, + "loss": 0.804, + "num_input_tokens_seen": 71285480, + "step": 122775 + }, + { + "epoch": 18.287161155793864, + "grad_norm": 0.03076171875, + "learning_rate": 0.0006654067128007013, + "loss": 0.7976, + "num_input_tokens_seen": 71288264, + "step": 122780 + }, + { + "epoch": 18.28790586833482, + "grad_norm": 0.0703125, + "learning_rate": 0.0006648325856640225, + "loss": 0.7977, + "num_input_tokens_seen": 71291112, + "step": 122785 + }, + { + "epoch": 18.28865058087578, + "grad_norm": 0.037109375, + "learning_rate": 0.0006642587007054296, + "loss": 0.7948, + "num_input_tokens_seen": 71294152, + "step": 122790 + }, + { + "epoch": 18.289395293416742, + "grad_norm": 0.0478515625, + "learning_rate": 0.0006636850579346132, + "loss": 0.798, + "num_input_tokens_seen": 71297160, + "step": 122795 + }, + { + "epoch": 18.2901400059577, + "grad_norm": 0.052978515625, + "learning_rate": 0.0006631116573612672, + "loss": 0.7949, + "num_input_tokens_seen": 71300264, + "step": 122800 + }, + { + "epoch": 18.29088471849866, + "grad_norm": 0.031982421875, + "learning_rate": 0.0006625384989950788, + "loss": 0.7941, + "num_input_tokens_seen": 71303464, + "step": 122805 + }, + { + "epoch": 18.29162943103962, + "grad_norm": 0.03564453125, + "learning_rate": 0.0006619655828457287, + "loss": 0.7851, + "num_input_tokens_seen": 71306344, + "step": 122810 + }, + { + "epoch": 18.292374143580577, + "grad_norm": 0.0556640625, + "learning_rate": 0.0006613929089228992, + "loss": 0.7687, + "num_input_tokens_seen": 71309128, + "step": 122815 + }, + { + "epoch": 18.293118856121538, + "grad_norm": 0.050537109375, + "learning_rate": 0.0006608204772362624, + "loss": 0.791, + "num_input_tokens_seen": 71311944, + "step": 122820 + }, + { + "epoch": 18.293863568662495, + "grad_norm": 0.04052734375, + "learning_rate": 0.0006602482877954924, + "loss": 0.7871, + "num_input_tokens_seen": 71314920, + "step": 122825 + }, + { + "epoch": 18.294608281203455, + "grad_norm": 0.068359375, + "learning_rate": 0.0006596763406102513, + "loss": 0.8102, + "num_input_tokens_seen": 71317704, + "step": 122830 + }, + { + "epoch": 18.295352993744416, + "grad_norm": 0.048828125, + "learning_rate": 0.000659104635690203, + "loss": 0.8162, + "num_input_tokens_seen": 71320744, + "step": 122835 + }, + { + "epoch": 18.296097706285373, + "grad_norm": 0.048095703125, + "learning_rate": 0.0006585331730450083, + "loss": 0.7857, + "num_input_tokens_seen": 71323976, + "step": 122840 + }, + { + "epoch": 18.296842418826333, + "grad_norm": 0.04052734375, + "learning_rate": 0.0006579619526843178, + "loss": 0.8008, + "num_input_tokens_seen": 71326664, + "step": 122845 + }, + { + "epoch": 18.297587131367294, + "grad_norm": 0.038818359375, + "learning_rate": 0.0006573909746177852, + "loss": 0.7858, + "num_input_tokens_seen": 71329864, + "step": 122850 + }, + { + "epoch": 18.29833184390825, + "grad_norm": 0.052490234375, + "learning_rate": 0.0006568202388550548, + "loss": 0.8038, + "num_input_tokens_seen": 71332968, + "step": 122855 + }, + { + "epoch": 18.29907655644921, + "grad_norm": 0.09521484375, + "learning_rate": 0.0006562497454057686, + "loss": 0.7673, + "num_input_tokens_seen": 71336136, + "step": 122860 + }, + { + "epoch": 18.29982126899017, + "grad_norm": 0.038818359375, + "learning_rate": 0.0006556794942795624, + "loss": 0.7947, + "num_input_tokens_seen": 71339080, + "step": 122865 + }, + { + "epoch": 18.30056598153113, + "grad_norm": 0.05322265625, + "learning_rate": 0.000655109485486075, + "loss": 0.8017, + "num_input_tokens_seen": 71341896, + "step": 122870 + }, + { + "epoch": 18.30131069407209, + "grad_norm": 0.03759765625, + "learning_rate": 0.0006545397190349339, + "loss": 0.7991, + "num_input_tokens_seen": 71344648, + "step": 122875 + }, + { + "epoch": 18.302055406613047, + "grad_norm": 0.08837890625, + "learning_rate": 0.0006539701949357612, + "loss": 0.7923, + "num_input_tokens_seen": 71347496, + "step": 122880 + }, + { + "epoch": 18.302800119154007, + "grad_norm": 0.0380859375, + "learning_rate": 0.0006534009131981827, + "loss": 0.8026, + "num_input_tokens_seen": 71350408, + "step": 122885 + }, + { + "epoch": 18.303544831694964, + "grad_norm": 0.060546875, + "learning_rate": 0.000652831873831814, + "loss": 0.7995, + "num_input_tokens_seen": 71353064, + "step": 122890 + }, + { + "epoch": 18.304289544235925, + "grad_norm": 0.040771484375, + "learning_rate": 0.0006522630768462706, + "loss": 0.7942, + "num_input_tokens_seen": 71355848, + "step": 122895 + }, + { + "epoch": 18.305034256776885, + "grad_norm": 0.03759765625, + "learning_rate": 0.0006516945222511566, + "loss": 0.7784, + "num_input_tokens_seen": 71358536, + "step": 122900 + }, + { + "epoch": 18.305778969317842, + "grad_norm": 0.051513671875, + "learning_rate": 0.0006511262100560844, + "loss": 0.8036, + "num_input_tokens_seen": 71361384, + "step": 122905 + }, + { + "epoch": 18.306523681858803, + "grad_norm": 0.03564453125, + "learning_rate": 0.0006505581402706512, + "loss": 0.7988, + "num_input_tokens_seen": 71364520, + "step": 122910 + }, + { + "epoch": 18.307268394399763, + "grad_norm": 0.05224609375, + "learning_rate": 0.000649990312904451, + "loss": 0.7994, + "num_input_tokens_seen": 71367304, + "step": 122915 + }, + { + "epoch": 18.30801310694072, + "grad_norm": 0.07177734375, + "learning_rate": 0.0006494227279670827, + "loss": 0.8084, + "num_input_tokens_seen": 71370120, + "step": 122920 + }, + { + "epoch": 18.30875781948168, + "grad_norm": 0.0537109375, + "learning_rate": 0.0006488553854681323, + "loss": 0.7842, + "num_input_tokens_seen": 71372648, + "step": 122925 + }, + { + "epoch": 18.309502532022638, + "grad_norm": 0.051513671875, + "learning_rate": 0.0006482882854171834, + "loss": 0.8028, + "num_input_tokens_seen": 71375624, + "step": 122930 + }, + { + "epoch": 18.310247244563598, + "grad_norm": 0.051513671875, + "learning_rate": 0.0006477214278238169, + "loss": 0.8114, + "num_input_tokens_seen": 71378472, + "step": 122935 + }, + { + "epoch": 18.31099195710456, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006471548126976101, + "loss": 0.8215, + "num_input_tokens_seen": 71381352, + "step": 122940 + }, + { + "epoch": 18.311736669645516, + "grad_norm": 0.042724609375, + "learning_rate": 0.0006465884400481353, + "loss": 0.7878, + "num_input_tokens_seen": 71384392, + "step": 122945 + }, + { + "epoch": 18.312481382186476, + "grad_norm": 0.0380859375, + "learning_rate": 0.0006460223098849616, + "loss": 0.8002, + "num_input_tokens_seen": 71387400, + "step": 122950 + }, + { + "epoch": 18.313226094727437, + "grad_norm": 0.05126953125, + "learning_rate": 0.0006454564222176512, + "loss": 0.7788, + "num_input_tokens_seen": 71390600, + "step": 122955 + }, + { + "epoch": 18.313970807268394, + "grad_norm": 0.06884765625, + "learning_rate": 0.0006448907770557649, + "loss": 0.7845, + "num_input_tokens_seen": 71393832, + "step": 122960 + }, + { + "epoch": 18.314715519809354, + "grad_norm": 0.039306640625, + "learning_rate": 0.00064432537440886, + "loss": 0.7958, + "num_input_tokens_seen": 71397032, + "step": 122965 + }, + { + "epoch": 18.31546023235031, + "grad_norm": 0.06103515625, + "learning_rate": 0.0006437602142864856, + "loss": 0.7973, + "num_input_tokens_seen": 71399848, + "step": 122970 + }, + { + "epoch": 18.316204944891272, + "grad_norm": 0.05615234375, + "learning_rate": 0.0006431952966981924, + "loss": 0.7966, + "num_input_tokens_seen": 71402920, + "step": 122975 + }, + { + "epoch": 18.316949657432232, + "grad_norm": 0.040283203125, + "learning_rate": 0.0006426306216535244, + "loss": 0.7974, + "num_input_tokens_seen": 71405960, + "step": 122980 + }, + { + "epoch": 18.31769436997319, + "grad_norm": 0.038818359375, + "learning_rate": 0.0006420661891620205, + "loss": 0.7993, + "num_input_tokens_seen": 71408584, + "step": 122985 + }, + { + "epoch": 18.31843908251415, + "grad_norm": 0.040283203125, + "learning_rate": 0.0006415019992332116, + "loss": 0.7893, + "num_input_tokens_seen": 71411208, + "step": 122990 + }, + { + "epoch": 18.31918379505511, + "grad_norm": 0.036376953125, + "learning_rate": 0.0006409380518766366, + "loss": 0.7878, + "num_input_tokens_seen": 71413800, + "step": 122995 + }, + { + "epoch": 18.319928507596067, + "grad_norm": 0.080078125, + "learning_rate": 0.0006403743471018197, + "loss": 0.7872, + "num_input_tokens_seen": 71416712, + "step": 123000 + }, + { + "epoch": 18.320673220137028, + "grad_norm": 0.037109375, + "learning_rate": 0.0006398108849182798, + "loss": 0.7955, + "num_input_tokens_seen": 71419912, + "step": 123005 + }, + { + "epoch": 18.321417932677985, + "grad_norm": 0.03515625, + "learning_rate": 0.0006392476653355428, + "loss": 0.7629, + "num_input_tokens_seen": 71422728, + "step": 123010 + }, + { + "epoch": 18.322162645218945, + "grad_norm": 0.038330078125, + "learning_rate": 0.0006386846883631191, + "loss": 0.7866, + "num_input_tokens_seen": 71425608, + "step": 123015 + }, + { + "epoch": 18.322907357759906, + "grad_norm": 0.03759765625, + "learning_rate": 0.0006381219540105248, + "loss": 0.7978, + "num_input_tokens_seen": 71428680, + "step": 123020 + }, + { + "epoch": 18.323652070300863, + "grad_norm": 0.037109375, + "learning_rate": 0.0006375594622872588, + "loss": 0.7907, + "num_input_tokens_seen": 71431496, + "step": 123025 + }, + { + "epoch": 18.324396782841823, + "grad_norm": 0.059814453125, + "learning_rate": 0.0006369972132028317, + "loss": 0.8169, + "num_input_tokens_seen": 71434280, + "step": 123030 + }, + { + "epoch": 18.32514149538278, + "grad_norm": 0.07177734375, + "learning_rate": 0.0006364352067667394, + "loss": 0.8025, + "num_input_tokens_seen": 71437224, + "step": 123035 + }, + { + "epoch": 18.32588620792374, + "grad_norm": 0.04541015625, + "learning_rate": 0.0006358734429884726, + "loss": 0.7896, + "num_input_tokens_seen": 71440040, + "step": 123040 + }, + { + "epoch": 18.3266309204647, + "grad_norm": 0.055419921875, + "learning_rate": 0.0006353119218775272, + "loss": 0.8115, + "num_input_tokens_seen": 71442824, + "step": 123045 + }, + { + "epoch": 18.32737563300566, + "grad_norm": 0.035400390625, + "learning_rate": 0.0006347506434433869, + "loss": 0.8128, + "num_input_tokens_seen": 71445416, + "step": 123050 + }, + { + "epoch": 18.32812034554662, + "grad_norm": 0.0242919921875, + "learning_rate": 0.0006341896076955361, + "loss": 0.8085, + "num_input_tokens_seen": 71448264, + "step": 123055 + }, + { + "epoch": 18.32886505808758, + "grad_norm": 0.053466796875, + "learning_rate": 0.0006336288146434471, + "loss": 0.8197, + "num_input_tokens_seen": 71451336, + "step": 123060 + }, + { + "epoch": 18.329609770628537, + "grad_norm": 0.0302734375, + "learning_rate": 0.0006330682642966006, + "loss": 0.7899, + "num_input_tokens_seen": 71454184, + "step": 123065 + }, + { + "epoch": 18.330354483169497, + "grad_norm": 0.10595703125, + "learning_rate": 0.0006325079566644643, + "loss": 0.7981, + "num_input_tokens_seen": 71457128, + "step": 123070 + }, + { + "epoch": 18.331099195710454, + "grad_norm": 0.041015625, + "learning_rate": 0.0006319478917565019, + "loss": 0.7873, + "num_input_tokens_seen": 71460040, + "step": 123075 + }, + { + "epoch": 18.331843908251415, + "grad_norm": 0.047119140625, + "learning_rate": 0.0006313880695821777, + "loss": 0.8003, + "num_input_tokens_seen": 71463176, + "step": 123080 + }, + { + "epoch": 18.332588620792375, + "grad_norm": 0.039794921875, + "learning_rate": 0.0006308284901509458, + "loss": 0.7708, + "num_input_tokens_seen": 71466120, + "step": 123085 + }, + { + "epoch": 18.333333333333332, + "grad_norm": 0.0517578125, + "learning_rate": 0.000630269153472267, + "loss": 0.8131, + "num_input_tokens_seen": 71468904, + "step": 123090 + }, + { + "epoch": 18.334078045874293, + "grad_norm": 0.046630859375, + "learning_rate": 0.000629710059555582, + "loss": 0.7944, + "num_input_tokens_seen": 71471624, + "step": 123095 + }, + { + "epoch": 18.334822758415253, + "grad_norm": 0.0361328125, + "learning_rate": 0.0006291512084103434, + "loss": 0.8132, + "num_input_tokens_seen": 71474408, + "step": 123100 + }, + { + "epoch": 18.33556747095621, + "grad_norm": 0.062255859375, + "learning_rate": 0.0006285926000459868, + "loss": 0.8003, + "num_input_tokens_seen": 71477480, + "step": 123105 + }, + { + "epoch": 18.33631218349717, + "grad_norm": 0.06201171875, + "learning_rate": 0.000628034234471953, + "loss": 0.798, + "num_input_tokens_seen": 71480296, + "step": 123110 + }, + { + "epoch": 18.337056896038128, + "grad_norm": 0.058349609375, + "learning_rate": 0.0006274761116976745, + "loss": 0.7738, + "num_input_tokens_seen": 71483208, + "step": 123115 + }, + { + "epoch": 18.33780160857909, + "grad_norm": 0.0712890625, + "learning_rate": 0.0006269182317325788, + "loss": 0.784, + "num_input_tokens_seen": 71486280, + "step": 123120 + }, + { + "epoch": 18.33854632112005, + "grad_norm": 0.034912109375, + "learning_rate": 0.0006263605945860933, + "loss": 0.799, + "num_input_tokens_seen": 71489224, + "step": 123125 + }, + { + "epoch": 18.339291033661006, + "grad_norm": 0.044189453125, + "learning_rate": 0.0006258032002676339, + "loss": 0.8016, + "num_input_tokens_seen": 71492264, + "step": 123130 + }, + { + "epoch": 18.340035746201966, + "grad_norm": 0.041259765625, + "learning_rate": 0.0006252460487866212, + "loss": 0.8016, + "num_input_tokens_seen": 71494920, + "step": 123135 + }, + { + "epoch": 18.340780458742927, + "grad_norm": 0.040283203125, + "learning_rate": 0.0006246891401524662, + "loss": 0.8185, + "num_input_tokens_seen": 71497672, + "step": 123140 + }, + { + "epoch": 18.341525171283884, + "grad_norm": 0.07470703125, + "learning_rate": 0.0006241324743745779, + "loss": 0.8028, + "num_input_tokens_seen": 71500744, + "step": 123145 + }, + { + "epoch": 18.342269883824844, + "grad_norm": 0.11083984375, + "learning_rate": 0.0006235760514623623, + "loss": 0.8058, + "num_input_tokens_seen": 71503624, + "step": 123150 + }, + { + "epoch": 18.3430145963658, + "grad_norm": 0.045166015625, + "learning_rate": 0.0006230198714252149, + "loss": 0.8079, + "num_input_tokens_seen": 71506472, + "step": 123155 + }, + { + "epoch": 18.343759308906762, + "grad_norm": 0.048583984375, + "learning_rate": 0.0006224639342725369, + "loss": 0.792, + "num_input_tokens_seen": 71509576, + "step": 123160 + }, + { + "epoch": 18.344504021447722, + "grad_norm": 0.0322265625, + "learning_rate": 0.0006219082400137154, + "loss": 0.8006, + "num_input_tokens_seen": 71512584, + "step": 123165 + }, + { + "epoch": 18.34524873398868, + "grad_norm": 0.038818359375, + "learning_rate": 0.0006213527886581433, + "loss": 0.7905, + "num_input_tokens_seen": 71515720, + "step": 123170 + }, + { + "epoch": 18.34599344652964, + "grad_norm": 0.0279541015625, + "learning_rate": 0.000620797580215201, + "loss": 0.7967, + "num_input_tokens_seen": 71518696, + "step": 123175 + }, + { + "epoch": 18.3467381590706, + "grad_norm": 0.04345703125, + "learning_rate": 0.0006202426146942697, + "loss": 0.7954, + "num_input_tokens_seen": 71521640, + "step": 123180 + }, + { + "epoch": 18.347482871611557, + "grad_norm": 0.041748046875, + "learning_rate": 0.0006196878921047249, + "loss": 0.7968, + "num_input_tokens_seen": 71524360, + "step": 123185 + }, + { + "epoch": 18.348227584152518, + "grad_norm": 0.07861328125, + "learning_rate": 0.0006191334124559378, + "loss": 0.8082, + "num_input_tokens_seen": 71527176, + "step": 123190 + }, + { + "epoch": 18.348972296693475, + "grad_norm": 0.044189453125, + "learning_rate": 0.0006185791757572756, + "loss": 0.8007, + "num_input_tokens_seen": 71530440, + "step": 123195 + }, + { + "epoch": 18.349717009234435, + "grad_norm": 0.0732421875, + "learning_rate": 0.0006180251820181009, + "loss": 0.7986, + "num_input_tokens_seen": 71533448, + "step": 123200 + }, + { + "epoch": 18.350461721775396, + "grad_norm": 0.043701171875, + "learning_rate": 0.0006174714312477747, + "loss": 0.8039, + "num_input_tokens_seen": 71536168, + "step": 123205 + }, + { + "epoch": 18.351206434316353, + "grad_norm": 0.04296875, + "learning_rate": 0.0006169179234556476, + "loss": 0.7865, + "num_input_tokens_seen": 71538888, + "step": 123210 + }, + { + "epoch": 18.351951146857314, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0006163646586510774, + "loss": 0.7847, + "num_input_tokens_seen": 71541736, + "step": 123215 + }, + { + "epoch": 18.35269585939827, + "grad_norm": 0.048095703125, + "learning_rate": 0.0006158116368434047, + "loss": 0.8162, + "num_input_tokens_seen": 71544648, + "step": 123220 + }, + { + "epoch": 18.35344057193923, + "grad_norm": 0.061279296875, + "learning_rate": 0.0006152588580419771, + "loss": 0.808, + "num_input_tokens_seen": 71547368, + "step": 123225 + }, + { + "epoch": 18.35418528448019, + "grad_norm": 0.037841796875, + "learning_rate": 0.0006147063222561322, + "loss": 0.7757, + "num_input_tokens_seen": 71550408, + "step": 123230 + }, + { + "epoch": 18.35492999702115, + "grad_norm": 0.04296875, + "learning_rate": 0.0006141540294951991, + "loss": 0.7791, + "num_input_tokens_seen": 71552968, + "step": 123235 + }, + { + "epoch": 18.35567470956211, + "grad_norm": 0.037841796875, + "learning_rate": 0.0006136019797685172, + "loss": 0.8113, + "num_input_tokens_seen": 71555880, + "step": 123240 + }, + { + "epoch": 18.35641942210307, + "grad_norm": 0.0634765625, + "learning_rate": 0.0006130501730854037, + "loss": 0.8037, + "num_input_tokens_seen": 71558824, + "step": 123245 + }, + { + "epoch": 18.357164134644027, + "grad_norm": 0.035888671875, + "learning_rate": 0.0006124986094551898, + "loss": 0.8052, + "num_input_tokens_seen": 71561448, + "step": 123250 + }, + { + "epoch": 18.357908847184987, + "grad_norm": 0.0869140625, + "learning_rate": 0.0006119472888871863, + "loss": 0.7992, + "num_input_tokens_seen": 71564040, + "step": 123255 + }, + { + "epoch": 18.358653559725944, + "grad_norm": 0.0478515625, + "learning_rate": 0.0006113962113907123, + "loss": 0.7914, + "num_input_tokens_seen": 71567208, + "step": 123260 + }, + { + "epoch": 18.359398272266905, + "grad_norm": 0.043212890625, + "learning_rate": 0.0006108453769750721, + "loss": 0.7887, + "num_input_tokens_seen": 71569992, + "step": 123265 + }, + { + "epoch": 18.360142984807865, + "grad_norm": 0.0272216796875, + "learning_rate": 0.0006102947856495766, + "loss": 0.7866, + "num_input_tokens_seen": 71572840, + "step": 123270 + }, + { + "epoch": 18.360887697348822, + "grad_norm": 0.0380859375, + "learning_rate": 0.0006097444374235267, + "loss": 0.8439, + "num_input_tokens_seen": 71575720, + "step": 123275 + }, + { + "epoch": 18.361632409889783, + "grad_norm": 0.03271484375, + "learning_rate": 0.0006091943323062166, + "loss": 0.7811, + "num_input_tokens_seen": 71578344, + "step": 123280 + }, + { + "epoch": 18.362377122430743, + "grad_norm": 0.046630859375, + "learning_rate": 0.0006086444703069438, + "loss": 0.7924, + "num_input_tokens_seen": 71581224, + "step": 123285 + }, + { + "epoch": 18.3631218349717, + "grad_norm": 0.056396484375, + "learning_rate": 0.0006080948514349943, + "loss": 0.7771, + "num_input_tokens_seen": 71584232, + "step": 123290 + }, + { + "epoch": 18.36386654751266, + "grad_norm": 0.06298828125, + "learning_rate": 0.0006075454756996573, + "loss": 0.7735, + "num_input_tokens_seen": 71587176, + "step": 123295 + }, + { + "epoch": 18.364611260053618, + "grad_norm": 0.03271484375, + "learning_rate": 0.0006069963431102088, + "loss": 0.7878, + "num_input_tokens_seen": 71590120, + "step": 123300 + }, + { + "epoch": 18.36535597259458, + "grad_norm": 0.0634765625, + "learning_rate": 0.0006064474536759312, + "loss": 0.7997, + "num_input_tokens_seen": 71593256, + "step": 123305 + }, + { + "epoch": 18.36610068513554, + "grad_norm": 0.04541015625, + "learning_rate": 0.0006058988074060939, + "loss": 0.8019, + "num_input_tokens_seen": 71596200, + "step": 123310 + }, + { + "epoch": 18.366845397676496, + "grad_norm": 0.029541015625, + "learning_rate": 0.0006053504043099677, + "loss": 0.8409, + "num_input_tokens_seen": 71598952, + "step": 123315 + }, + { + "epoch": 18.367590110217456, + "grad_norm": 0.049072265625, + "learning_rate": 0.0006048022443968154, + "loss": 0.7957, + "num_input_tokens_seen": 71601928, + "step": 123320 + }, + { + "epoch": 18.368334822758417, + "grad_norm": 0.05517578125, + "learning_rate": 0.0006042543276758994, + "loss": 0.7987, + "num_input_tokens_seen": 71605192, + "step": 123325 + }, + { + "epoch": 18.369079535299374, + "grad_norm": 0.038330078125, + "learning_rate": 0.0006037066541564756, + "loss": 0.8071, + "num_input_tokens_seen": 71608168, + "step": 123330 + }, + { + "epoch": 18.369824247840334, + "grad_norm": 0.045166015625, + "learning_rate": 0.0006031592238477934, + "loss": 0.8056, + "num_input_tokens_seen": 71611400, + "step": 123335 + }, + { + "epoch": 18.37056896038129, + "grad_norm": 0.049072265625, + "learning_rate": 0.0006026120367591053, + "loss": 0.8002, + "num_input_tokens_seen": 71614568, + "step": 123340 + }, + { + "epoch": 18.371313672922252, + "grad_norm": 0.052490234375, + "learning_rate": 0.000602065092899654, + "loss": 0.804, + "num_input_tokens_seen": 71617352, + "step": 123345 + }, + { + "epoch": 18.372058385463212, + "grad_norm": 0.037841796875, + "learning_rate": 0.0006015183922786805, + "loss": 0.791, + "num_input_tokens_seen": 71620424, + "step": 123350 + }, + { + "epoch": 18.37280309800417, + "grad_norm": 0.053955078125, + "learning_rate": 0.0006009719349054204, + "loss": 0.7875, + "num_input_tokens_seen": 71623336, + "step": 123355 + }, + { + "epoch": 18.37354781054513, + "grad_norm": 0.07568359375, + "learning_rate": 0.0006004257207891017, + "loss": 0.8022, + "num_input_tokens_seen": 71626312, + "step": 123360 + }, + { + "epoch": 18.37429252308609, + "grad_norm": 0.04931640625, + "learning_rate": 0.0005998797499389585, + "loss": 0.8005, + "num_input_tokens_seen": 71629192, + "step": 123365 + }, + { + "epoch": 18.375037235627047, + "grad_norm": 0.05859375, + "learning_rate": 0.0005993340223642084, + "loss": 0.7892, + "num_input_tokens_seen": 71632200, + "step": 123370 + }, + { + "epoch": 18.375781948168008, + "grad_norm": 0.032958984375, + "learning_rate": 0.0005987885380740759, + "loss": 0.8072, + "num_input_tokens_seen": 71635080, + "step": 123375 + }, + { + "epoch": 18.376526660708965, + "grad_norm": 0.05126953125, + "learning_rate": 0.0005982432970777751, + "loss": 0.7955, + "num_input_tokens_seen": 71637896, + "step": 123380 + }, + { + "epoch": 18.377271373249926, + "grad_norm": 0.052490234375, + "learning_rate": 0.000597698299384517, + "loss": 0.8097, + "num_input_tokens_seen": 71640840, + "step": 123385 + }, + { + "epoch": 18.378016085790886, + "grad_norm": 0.035400390625, + "learning_rate": 0.0005971535450035059, + "loss": 0.7819, + "num_input_tokens_seen": 71643720, + "step": 123390 + }, + { + "epoch": 18.378760798331843, + "grad_norm": 0.0390625, + "learning_rate": 0.0005966090339439478, + "loss": 0.7908, + "num_input_tokens_seen": 71646632, + "step": 123395 + }, + { + "epoch": 18.379505510872804, + "grad_norm": 0.0732421875, + "learning_rate": 0.000596064766215042, + "loss": 0.8327, + "num_input_tokens_seen": 71649800, + "step": 123400 + }, + { + "epoch": 18.38025022341376, + "grad_norm": 0.05712890625, + "learning_rate": 0.0005955207418259812, + "loss": 0.8165, + "num_input_tokens_seen": 71652488, + "step": 123405 + }, + { + "epoch": 18.38099493595472, + "grad_norm": 0.054443359375, + "learning_rate": 0.0005949769607859578, + "loss": 0.8007, + "num_input_tokens_seen": 71655336, + "step": 123410 + }, + { + "epoch": 18.38173964849568, + "grad_norm": 0.0634765625, + "learning_rate": 0.0005944334231041564, + "loss": 0.7807, + "num_input_tokens_seen": 71658216, + "step": 123415 + }, + { + "epoch": 18.38248436103664, + "grad_norm": 0.038330078125, + "learning_rate": 0.0005938901287897647, + "loss": 0.7997, + "num_input_tokens_seen": 71661128, + "step": 123420 + }, + { + "epoch": 18.3832290735776, + "grad_norm": 0.033935546875, + "learning_rate": 0.0005933470778519551, + "loss": 0.7776, + "num_input_tokens_seen": 71664232, + "step": 123425 + }, + { + "epoch": 18.38397378611856, + "grad_norm": 0.0439453125, + "learning_rate": 0.0005928042702999053, + "loss": 0.8001, + "num_input_tokens_seen": 71667336, + "step": 123430 + }, + { + "epoch": 18.384718498659517, + "grad_norm": 0.052001953125, + "learning_rate": 0.0005922617061427848, + "loss": 0.7902, + "num_input_tokens_seen": 71670344, + "step": 123435 + }, + { + "epoch": 18.385463211200477, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005917193853897579, + "loss": 0.8024, + "num_input_tokens_seen": 71673064, + "step": 123440 + }, + { + "epoch": 18.386207923741434, + "grad_norm": 0.053955078125, + "learning_rate": 0.0005911773080499888, + "loss": 0.8092, + "num_input_tokens_seen": 71676296, + "step": 123445 + }, + { + "epoch": 18.386952636282395, + "grad_norm": 0.044677734375, + "learning_rate": 0.000590635474132637, + "loss": 0.8028, + "num_input_tokens_seen": 71679208, + "step": 123450 + }, + { + "epoch": 18.387697348823355, + "grad_norm": 0.039306640625, + "learning_rate": 0.0005900938836468516, + "loss": 0.8071, + "num_input_tokens_seen": 71682344, + "step": 123455 + }, + { + "epoch": 18.388442061364312, + "grad_norm": 0.037841796875, + "learning_rate": 0.000589552536601784, + "loss": 0.7959, + "num_input_tokens_seen": 71685096, + "step": 123460 + }, + { + "epoch": 18.389186773905273, + "grad_norm": 0.038330078125, + "learning_rate": 0.0005890114330065815, + "loss": 0.7926, + "num_input_tokens_seen": 71687560, + "step": 123465 + }, + { + "epoch": 18.389931486446233, + "grad_norm": 0.038818359375, + "learning_rate": 0.000588470572870382, + "loss": 0.7912, + "num_input_tokens_seen": 71690536, + "step": 123470 + }, + { + "epoch": 18.39067619898719, + "grad_norm": 0.0380859375, + "learning_rate": 0.0005879299562023266, + "loss": 0.7993, + "num_input_tokens_seen": 71693096, + "step": 123475 + }, + { + "epoch": 18.39142091152815, + "grad_norm": 0.0400390625, + "learning_rate": 0.0005873895830115477, + "loss": 0.7857, + "num_input_tokens_seen": 71696168, + "step": 123480 + }, + { + "epoch": 18.392165624069108, + "grad_norm": 0.041015625, + "learning_rate": 0.0005868494533071716, + "loss": 0.779, + "num_input_tokens_seen": 71699016, + "step": 123485 + }, + { + "epoch": 18.39291033661007, + "grad_norm": 0.023681640625, + "learning_rate": 0.0005863095670983259, + "loss": 0.8052, + "num_input_tokens_seen": 71702184, + "step": 123490 + }, + { + "epoch": 18.39365504915103, + "grad_norm": 0.059326171875, + "learning_rate": 0.00058576992439413, + "loss": 0.7929, + "num_input_tokens_seen": 71705000, + "step": 123495 + }, + { + "epoch": 18.394399761691986, + "grad_norm": 0.0439453125, + "learning_rate": 0.0005852305252037016, + "loss": 0.7953, + "num_input_tokens_seen": 71707848, + "step": 123500 + }, + { + "epoch": 18.395144474232946, + "grad_norm": 0.041259765625, + "learning_rate": 0.0005846913695361533, + "loss": 0.792, + "num_input_tokens_seen": 71710600, + "step": 123505 + }, + { + "epoch": 18.395889186773907, + "grad_norm": 0.05517578125, + "learning_rate": 0.000584152457400593, + "loss": 0.7917, + "num_input_tokens_seen": 71713672, + "step": 123510 + }, + { + "epoch": 18.396633899314864, + "grad_norm": 0.04248046875, + "learning_rate": 0.0005836137888061265, + "loss": 0.8282, + "num_input_tokens_seen": 71716520, + "step": 123515 + }, + { + "epoch": 18.397378611855824, + "grad_norm": 0.03466796875, + "learning_rate": 0.0005830753637618519, + "loss": 0.8089, + "num_input_tokens_seen": 71719560, + "step": 123520 + }, + { + "epoch": 18.39812332439678, + "grad_norm": 0.04248046875, + "learning_rate": 0.0005825371822768682, + "loss": 0.7918, + "num_input_tokens_seen": 71722408, + "step": 123525 + }, + { + "epoch": 18.398868036937742, + "grad_norm": 0.1103515625, + "learning_rate": 0.0005819992443602617, + "loss": 0.8117, + "num_input_tokens_seen": 71725096, + "step": 123530 + }, + { + "epoch": 18.399612749478703, + "grad_norm": 0.037109375, + "learning_rate": 0.0005814615500211268, + "loss": 0.7837, + "num_input_tokens_seen": 71727912, + "step": 123535 + }, + { + "epoch": 18.40035746201966, + "grad_norm": 0.04931640625, + "learning_rate": 0.0005809240992685427, + "loss": 0.7932, + "num_input_tokens_seen": 71730632, + "step": 123540 + }, + { + "epoch": 18.40110217456062, + "grad_norm": 0.06396484375, + "learning_rate": 0.000580386892111594, + "loss": 0.8025, + "num_input_tokens_seen": 71733288, + "step": 123545 + }, + { + "epoch": 18.401846887101577, + "grad_norm": 0.040283203125, + "learning_rate": 0.00057984992855935, + "loss": 0.7977, + "num_input_tokens_seen": 71736072, + "step": 123550 + }, + { + "epoch": 18.402591599642538, + "grad_norm": 0.046875, + "learning_rate": 0.0005793132086208885, + "loss": 0.8011, + "num_input_tokens_seen": 71739240, + "step": 123555 + }, + { + "epoch": 18.403336312183498, + "grad_norm": 0.056640625, + "learning_rate": 0.0005787767323052722, + "loss": 0.7779, + "num_input_tokens_seen": 71742024, + "step": 123560 + }, + { + "epoch": 18.404081024724455, + "grad_norm": 0.060302734375, + "learning_rate": 0.0005782404996215656, + "loss": 0.8027, + "num_input_tokens_seen": 71744680, + "step": 123565 + }, + { + "epoch": 18.404825737265416, + "grad_norm": 0.052734375, + "learning_rate": 0.0005777045105788297, + "loss": 0.8004, + "num_input_tokens_seen": 71747592, + "step": 123570 + }, + { + "epoch": 18.405570449806376, + "grad_norm": 0.0517578125, + "learning_rate": 0.0005771687651861157, + "loss": 0.8045, + "num_input_tokens_seen": 71750664, + "step": 123575 + }, + { + "epoch": 18.406315162347333, + "grad_norm": 0.05859375, + "learning_rate": 0.0005766332634524778, + "loss": 0.7945, + "num_input_tokens_seen": 71753416, + "step": 123580 + }, + { + "epoch": 18.407059874888294, + "grad_norm": 0.043701171875, + "learning_rate": 0.0005760980053869624, + "loss": 0.7993, + "num_input_tokens_seen": 71756648, + "step": 123585 + }, + { + "epoch": 18.40780458742925, + "grad_norm": 0.33984375, + "learning_rate": 0.000575562990998612, + "loss": 0.795, + "num_input_tokens_seen": 71759432, + "step": 123590 + }, + { + "epoch": 18.40854929997021, + "grad_norm": 0.052978515625, + "learning_rate": 0.0005750282202964646, + "loss": 0.8028, + "num_input_tokens_seen": 71762088, + "step": 123595 + }, + { + "epoch": 18.40929401251117, + "grad_norm": 0.048095703125, + "learning_rate": 0.0005744936932895511, + "loss": 0.7766, + "num_input_tokens_seen": 71765160, + "step": 123600 + }, + { + "epoch": 18.41003872505213, + "grad_norm": 0.042724609375, + "learning_rate": 0.0005739594099869078, + "loss": 0.7865, + "num_input_tokens_seen": 71767912, + "step": 123605 + }, + { + "epoch": 18.41078343759309, + "grad_norm": 0.1572265625, + "learning_rate": 0.0005734253703975573, + "loss": 0.8147, + "num_input_tokens_seen": 71771016, + "step": 123610 + }, + { + "epoch": 18.41152815013405, + "grad_norm": 0.04638671875, + "learning_rate": 0.000572891574530524, + "loss": 0.7975, + "num_input_tokens_seen": 71774184, + "step": 123615 + }, + { + "epoch": 18.412272862675007, + "grad_norm": 0.0419921875, + "learning_rate": 0.0005723580223948227, + "loss": 0.8061, + "num_input_tokens_seen": 71777256, + "step": 123620 + }, + { + "epoch": 18.413017575215967, + "grad_norm": 0.05517578125, + "learning_rate": 0.0005718247139994708, + "loss": 0.7806, + "num_input_tokens_seen": 71780584, + "step": 123625 + }, + { + "epoch": 18.413762287756924, + "grad_norm": 0.038818359375, + "learning_rate": 0.000571291649353473, + "loss": 0.7986, + "num_input_tokens_seen": 71783560, + "step": 123630 + }, + { + "epoch": 18.414507000297885, + "grad_norm": 0.044921875, + "learning_rate": 0.0005707588284658421, + "loss": 0.8182, + "num_input_tokens_seen": 71786632, + "step": 123635 + }, + { + "epoch": 18.415251712838845, + "grad_norm": 0.107421875, + "learning_rate": 0.0005702262513455741, + "loss": 0.7822, + "num_input_tokens_seen": 71789736, + "step": 123640 + }, + { + "epoch": 18.415996425379802, + "grad_norm": 0.0289306640625, + "learning_rate": 0.0005696939180016685, + "loss": 0.7908, + "num_input_tokens_seen": 71792584, + "step": 123645 + }, + { + "epoch": 18.416741137920763, + "grad_norm": 0.029541015625, + "learning_rate": 0.0005691618284431165, + "loss": 0.791, + "num_input_tokens_seen": 71795176, + "step": 123650 + }, + { + "epoch": 18.417485850461723, + "grad_norm": 0.0458984375, + "learning_rate": 0.0005686299826789092, + "loss": 0.7842, + "num_input_tokens_seen": 71798344, + "step": 123655 + }, + { + "epoch": 18.41823056300268, + "grad_norm": 0.037841796875, + "learning_rate": 0.0005680983807180329, + "loss": 0.7999, + "num_input_tokens_seen": 71801064, + "step": 123660 + }, + { + "epoch": 18.41897527554364, + "grad_norm": 0.039794921875, + "learning_rate": 0.0005675670225694618, + "loss": 0.7949, + "num_input_tokens_seen": 71804040, + "step": 123665 + }, + { + "epoch": 18.419719988084598, + "grad_norm": 0.06591796875, + "learning_rate": 0.0005670359082421822, + "loss": 0.7918, + "num_input_tokens_seen": 71806888, + "step": 123670 + }, + { + "epoch": 18.42046470062556, + "grad_norm": 0.040771484375, + "learning_rate": 0.0005665050377451602, + "loss": 0.7941, + "num_input_tokens_seen": 71809800, + "step": 123675 + }, + { + "epoch": 18.42120941316652, + "grad_norm": 0.0576171875, + "learning_rate": 0.0005659744110873638, + "loss": 0.7968, + "num_input_tokens_seen": 71812808, + "step": 123680 + }, + { + "epoch": 18.421954125707476, + "grad_norm": 0.047119140625, + "learning_rate": 0.0005654440282777623, + "loss": 0.7997, + "num_input_tokens_seen": 71815688, + "step": 123685 + }, + { + "epoch": 18.422698838248436, + "grad_norm": 0.05859375, + "learning_rate": 0.000564913889325312, + "loss": 0.8037, + "num_input_tokens_seen": 71818440, + "step": 123690 + }, + { + "epoch": 18.423443550789397, + "grad_norm": 0.0390625, + "learning_rate": 0.0005643839942389706, + "loss": 0.8174, + "num_input_tokens_seen": 71821512, + "step": 123695 + }, + { + "epoch": 18.424188263330354, + "grad_norm": 0.053955078125, + "learning_rate": 0.0005638543430276893, + "loss": 0.8029, + "num_input_tokens_seen": 71824520, + "step": 123700 + }, + { + "epoch": 18.424932975871315, + "grad_norm": 0.03466796875, + "learning_rate": 0.0005633249357004177, + "loss": 0.7963, + "num_input_tokens_seen": 71827464, + "step": 123705 + }, + { + "epoch": 18.42567768841227, + "grad_norm": 0.0380859375, + "learning_rate": 0.0005627957722661003, + "loss": 0.7934, + "num_input_tokens_seen": 71830184, + "step": 123710 + }, + { + "epoch": 18.426422400953232, + "grad_norm": 0.0458984375, + "learning_rate": 0.0005622668527336732, + "loss": 0.7775, + "num_input_tokens_seen": 71832904, + "step": 123715 + }, + { + "epoch": 18.427167113494193, + "grad_norm": 0.041259765625, + "learning_rate": 0.0005617381771120743, + "loss": 0.7973, + "num_input_tokens_seen": 71835912, + "step": 123720 + }, + { + "epoch": 18.42791182603515, + "grad_norm": 0.04150390625, + "learning_rate": 0.0005612097454102332, + "loss": 0.7903, + "num_input_tokens_seen": 71838792, + "step": 123725 + }, + { + "epoch": 18.42865653857611, + "grad_norm": 0.041015625, + "learning_rate": 0.0005606815576370794, + "loss": 0.7857, + "num_input_tokens_seen": 71841320, + "step": 123730 + }, + { + "epoch": 18.42940125111707, + "grad_norm": 0.0361328125, + "learning_rate": 0.0005601536138015339, + "loss": 0.7882, + "num_input_tokens_seen": 71844200, + "step": 123735 + }, + { + "epoch": 18.430145963658028, + "grad_norm": 0.0419921875, + "learning_rate": 0.0005596259139125182, + "loss": 0.7941, + "num_input_tokens_seen": 71846984, + "step": 123740 + }, + { + "epoch": 18.430890676198988, + "grad_norm": 0.036376953125, + "learning_rate": 0.0005590984579789449, + "loss": 0.8008, + "num_input_tokens_seen": 71849608, + "step": 123745 + }, + { + "epoch": 18.431635388739945, + "grad_norm": 0.04248046875, + "learning_rate": 0.0005585712460097286, + "loss": 0.8166, + "num_input_tokens_seen": 71852552, + "step": 123750 + }, + { + "epoch": 18.432380101280906, + "grad_norm": 0.047607421875, + "learning_rate": 0.0005580442780137723, + "loss": 0.7915, + "num_input_tokens_seen": 71855880, + "step": 123755 + }, + { + "epoch": 18.433124813821866, + "grad_norm": 0.03564453125, + "learning_rate": 0.0005575175539999788, + "loss": 0.7958, + "num_input_tokens_seen": 71858600, + "step": 123760 + }, + { + "epoch": 18.433869526362823, + "grad_norm": 0.061767578125, + "learning_rate": 0.0005569910739772493, + "loss": 0.7941, + "num_input_tokens_seen": 71861608, + "step": 123765 + }, + { + "epoch": 18.434614238903784, + "grad_norm": 0.06005859375, + "learning_rate": 0.0005564648379544751, + "loss": 0.7881, + "num_input_tokens_seen": 71864392, + "step": 123770 + }, + { + "epoch": 18.43535895144474, + "grad_norm": 0.052978515625, + "learning_rate": 0.0005559388459405473, + "loss": 0.796, + "num_input_tokens_seen": 71867208, + "step": 123775 + }, + { + "epoch": 18.4361036639857, + "grad_norm": 0.043212890625, + "learning_rate": 0.0005554130979443538, + "loss": 0.7805, + "num_input_tokens_seen": 71870088, + "step": 123780 + }, + { + "epoch": 18.43684837652666, + "grad_norm": 0.037353515625, + "learning_rate": 0.000554887593974776, + "loss": 0.7877, + "num_input_tokens_seen": 71872904, + "step": 123785 + }, + { + "epoch": 18.43759308906762, + "grad_norm": 0.04443359375, + "learning_rate": 0.0005543623340406883, + "loss": 0.7788, + "num_input_tokens_seen": 71875656, + "step": 123790 + }, + { + "epoch": 18.43833780160858, + "grad_norm": 0.0390625, + "learning_rate": 0.0005538373181509687, + "loss": 0.7816, + "num_input_tokens_seen": 71878312, + "step": 123795 + }, + { + "epoch": 18.43908251414954, + "grad_norm": 0.05859375, + "learning_rate": 0.000553312546314485, + "loss": 0.7905, + "num_input_tokens_seen": 71881512, + "step": 123800 + }, + { + "epoch": 18.439827226690497, + "grad_norm": 0.03369140625, + "learning_rate": 0.0005527880185401019, + "loss": 0.8103, + "num_input_tokens_seen": 71884424, + "step": 123805 + }, + { + "epoch": 18.440571939231457, + "grad_norm": 0.03857421875, + "learning_rate": 0.0005522637348366838, + "loss": 0.7983, + "num_input_tokens_seen": 71887240, + "step": 123810 + }, + { + "epoch": 18.441316651772414, + "grad_norm": 0.048828125, + "learning_rate": 0.0005517396952130837, + "loss": 0.8104, + "num_input_tokens_seen": 71890088, + "step": 123815 + }, + { + "epoch": 18.442061364313375, + "grad_norm": 0.03466796875, + "learning_rate": 0.000551215899678158, + "loss": 0.7892, + "num_input_tokens_seen": 71892968, + "step": 123820 + }, + { + "epoch": 18.442806076854335, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005506923482407527, + "loss": 0.7936, + "num_input_tokens_seen": 71895752, + "step": 123825 + }, + { + "epoch": 18.443550789395292, + "grad_norm": 0.049072265625, + "learning_rate": 0.0005501690409097177, + "loss": 0.7965, + "num_input_tokens_seen": 71898888, + "step": 123830 + }, + { + "epoch": 18.444295501936253, + "grad_norm": 0.052978515625, + "learning_rate": 0.0005496459776938889, + "loss": 0.7878, + "num_input_tokens_seen": 71901704, + "step": 123835 + }, + { + "epoch": 18.445040214477213, + "grad_norm": 0.046875, + "learning_rate": 0.0005491231586021044, + "loss": 0.7712, + "num_input_tokens_seen": 71904520, + "step": 123840 + }, + { + "epoch": 18.44578492701817, + "grad_norm": 0.0390625, + "learning_rate": 0.0005486005836431972, + "loss": 0.8122, + "num_input_tokens_seen": 71907336, + "step": 123845 + }, + { + "epoch": 18.44652963955913, + "grad_norm": 0.05712890625, + "learning_rate": 0.0005480782528259969, + "loss": 0.8023, + "num_input_tokens_seen": 71910536, + "step": 123850 + }, + { + "epoch": 18.447274352100088, + "grad_norm": 0.09326171875, + "learning_rate": 0.0005475561661593247, + "loss": 0.7994, + "num_input_tokens_seen": 71913320, + "step": 123855 + }, + { + "epoch": 18.44801906464105, + "grad_norm": 0.050048828125, + "learning_rate": 0.000547034323652002, + "loss": 0.7808, + "num_input_tokens_seen": 71916552, + "step": 123860 + }, + { + "epoch": 18.44876377718201, + "grad_norm": 0.03857421875, + "learning_rate": 0.0005465127253128465, + "loss": 0.7917, + "num_input_tokens_seen": 71919400, + "step": 123865 + }, + { + "epoch": 18.449508489722966, + "grad_norm": 0.03466796875, + "learning_rate": 0.0005459913711506664, + "loss": 0.7676, + "num_input_tokens_seen": 71922216, + "step": 123870 + }, + { + "epoch": 18.450253202263927, + "grad_norm": 0.03369140625, + "learning_rate": 0.0005454702611742729, + "loss": 0.804, + "num_input_tokens_seen": 71925064, + "step": 123875 + }, + { + "epoch": 18.450997914804887, + "grad_norm": 0.041015625, + "learning_rate": 0.0005449493953924689, + "loss": 0.8013, + "num_input_tokens_seen": 71928104, + "step": 123880 + }, + { + "epoch": 18.451742627345844, + "grad_norm": 0.05712890625, + "learning_rate": 0.0005444287738140523, + "loss": 0.8001, + "num_input_tokens_seen": 71930920, + "step": 123885 + }, + { + "epoch": 18.452487339886805, + "grad_norm": 0.03857421875, + "learning_rate": 0.0005439083964478214, + "loss": 0.8186, + "num_input_tokens_seen": 71933768, + "step": 123890 + }, + { + "epoch": 18.45323205242776, + "grad_norm": 0.044677734375, + "learning_rate": 0.0005433882633025638, + "loss": 0.7899, + "num_input_tokens_seen": 71936872, + "step": 123895 + }, + { + "epoch": 18.453976764968722, + "grad_norm": 0.04833984375, + "learning_rate": 0.0005428683743870694, + "loss": 0.7934, + "num_input_tokens_seen": 71939880, + "step": 123900 + }, + { + "epoch": 18.454721477509683, + "grad_norm": 0.0859375, + "learning_rate": 0.0005423487297101193, + "loss": 0.7908, + "num_input_tokens_seen": 71942664, + "step": 123905 + }, + { + "epoch": 18.45546619005064, + "grad_norm": 0.040771484375, + "learning_rate": 0.0005418293292804931, + "loss": 0.8246, + "num_input_tokens_seen": 71945544, + "step": 123910 + }, + { + "epoch": 18.4562109025916, + "grad_norm": 0.046630859375, + "learning_rate": 0.0005413101731069675, + "loss": 0.8131, + "num_input_tokens_seen": 71948200, + "step": 123915 + }, + { + "epoch": 18.456955615132557, + "grad_norm": 0.04833984375, + "learning_rate": 0.0005407912611983118, + "loss": 0.8037, + "num_input_tokens_seen": 71951240, + "step": 123920 + }, + { + "epoch": 18.457700327673518, + "grad_norm": 0.040283203125, + "learning_rate": 0.0005402725935632907, + "loss": 0.8035, + "num_input_tokens_seen": 71953832, + "step": 123925 + }, + { + "epoch": 18.458445040214478, + "grad_norm": 0.038330078125, + "learning_rate": 0.0005397541702106673, + "loss": 0.7833, + "num_input_tokens_seen": 71956456, + "step": 123930 + }, + { + "epoch": 18.459189752755435, + "grad_norm": 0.056396484375, + "learning_rate": 0.0005392359911492011, + "loss": 0.803, + "num_input_tokens_seen": 71959496, + "step": 123935 + }, + { + "epoch": 18.459934465296396, + "grad_norm": 0.048583984375, + "learning_rate": 0.0005387180563876453, + "loss": 0.8046, + "num_input_tokens_seen": 71962472, + "step": 123940 + }, + { + "epoch": 18.460679177837356, + "grad_norm": 0.08837890625, + "learning_rate": 0.0005382003659347511, + "loss": 0.7862, + "num_input_tokens_seen": 71965320, + "step": 123945 + }, + { + "epoch": 18.461423890378313, + "grad_norm": 0.040771484375, + "learning_rate": 0.0005376829197992617, + "loss": 0.8111, + "num_input_tokens_seen": 71968040, + "step": 123950 + }, + { + "epoch": 18.462168602919274, + "grad_norm": 0.039306640625, + "learning_rate": 0.0005371657179899214, + "loss": 0.782, + "num_input_tokens_seen": 71971176, + "step": 123955 + }, + { + "epoch": 18.46291331546023, + "grad_norm": 0.030029296875, + "learning_rate": 0.0005366487605154684, + "loss": 0.8068, + "num_input_tokens_seen": 71973800, + "step": 123960 + }, + { + "epoch": 18.46365802800119, + "grad_norm": 0.047119140625, + "learning_rate": 0.0005361320473846326, + "loss": 0.8091, + "num_input_tokens_seen": 71976616, + "step": 123965 + }, + { + "epoch": 18.464402740542152, + "grad_norm": 0.033447265625, + "learning_rate": 0.0005356155786061467, + "loss": 0.7821, + "num_input_tokens_seen": 71979656, + "step": 123970 + }, + { + "epoch": 18.46514745308311, + "grad_norm": 0.04150390625, + "learning_rate": 0.0005350993541887356, + "loss": 0.7995, + "num_input_tokens_seen": 71982600, + "step": 123975 + }, + { + "epoch": 18.46589216562407, + "grad_norm": 0.083984375, + "learning_rate": 0.0005345833741411155, + "loss": 0.8089, + "num_input_tokens_seen": 71985736, + "step": 123980 + }, + { + "epoch": 18.46663687816503, + "grad_norm": 0.052001953125, + "learning_rate": 0.0005340676384720111, + "loss": 0.7972, + "num_input_tokens_seen": 71988712, + "step": 123985 + }, + { + "epoch": 18.467381590705987, + "grad_norm": 0.04150390625, + "learning_rate": 0.0005335521471901305, + "loss": 0.8004, + "num_input_tokens_seen": 71991720, + "step": 123990 + }, + { + "epoch": 18.468126303246947, + "grad_norm": 0.048828125, + "learning_rate": 0.0005330369003041835, + "loss": 0.7798, + "num_input_tokens_seen": 71994600, + "step": 123995 + }, + { + "epoch": 18.468871015787904, + "grad_norm": 0.046630859375, + "learning_rate": 0.0005325218978228729, + "loss": 0.792, + "num_input_tokens_seen": 71997480, + "step": 124000 + }, + { + "epoch": 18.469615728328865, + "grad_norm": 0.038330078125, + "learning_rate": 0.0005320071397549036, + "loss": 0.7975, + "num_input_tokens_seen": 72000520, + "step": 124005 + }, + { + "epoch": 18.470360440869825, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005314926261089653, + "loss": 0.805, + "num_input_tokens_seen": 72003208, + "step": 124010 + }, + { + "epoch": 18.471105153410782, + "grad_norm": 0.07958984375, + "learning_rate": 0.0005309783568937576, + "loss": 0.7977, + "num_input_tokens_seen": 72006088, + "step": 124015 + }, + { + "epoch": 18.471849865951743, + "grad_norm": 0.042236328125, + "learning_rate": 0.000530464332117962, + "loss": 0.8024, + "num_input_tokens_seen": 72009128, + "step": 124020 + }, + { + "epoch": 18.472594578492703, + "grad_norm": 0.049560546875, + "learning_rate": 0.0005299505517902664, + "loss": 0.7852, + "num_input_tokens_seen": 72011816, + "step": 124025 + }, + { + "epoch": 18.47333929103366, + "grad_norm": 0.05224609375, + "learning_rate": 0.000529437015919349, + "loss": 0.7983, + "num_input_tokens_seen": 72014536, + "step": 124030 + }, + { + "epoch": 18.47408400357462, + "grad_norm": 0.052490234375, + "learning_rate": 0.0005289237245138877, + "loss": 0.7949, + "num_input_tokens_seen": 72017192, + "step": 124035 + }, + { + "epoch": 18.474828716115578, + "grad_norm": 0.0830078125, + "learning_rate": 0.0005284106775825508, + "loss": 0.8219, + "num_input_tokens_seen": 72019976, + "step": 124040 + }, + { + "epoch": 18.47557342865654, + "grad_norm": 0.05859375, + "learning_rate": 0.0005278978751340097, + "loss": 0.8007, + "num_input_tokens_seen": 72022600, + "step": 124045 + }, + { + "epoch": 18.4763181411975, + "grad_norm": 0.041748046875, + "learning_rate": 0.0005273853171769238, + "loss": 0.8033, + "num_input_tokens_seen": 72025448, + "step": 124050 + }, + { + "epoch": 18.477062853738456, + "grad_norm": 0.057373046875, + "learning_rate": 0.0005268730037199515, + "loss": 0.8003, + "num_input_tokens_seen": 72028424, + "step": 124055 + }, + { + "epoch": 18.477807566279417, + "grad_norm": 0.0654296875, + "learning_rate": 0.0005263609347717523, + "loss": 0.7895, + "num_input_tokens_seen": 72031400, + "step": 124060 + }, + { + "epoch": 18.478552278820374, + "grad_norm": 0.046142578125, + "learning_rate": 0.0005258491103409729, + "loss": 0.8165, + "num_input_tokens_seen": 72034216, + "step": 124065 + }, + { + "epoch": 18.479296991361334, + "grad_norm": 0.029541015625, + "learning_rate": 0.0005253375304362645, + "loss": 0.8089, + "num_input_tokens_seen": 72037224, + "step": 124070 + }, + { + "epoch": 18.480041703902295, + "grad_norm": 0.040283203125, + "learning_rate": 0.0005248261950662635, + "loss": 0.7962, + "num_input_tokens_seen": 72040136, + "step": 124075 + }, + { + "epoch": 18.48078641644325, + "grad_norm": 0.036865234375, + "learning_rate": 0.0005243151042396149, + "loss": 0.7838, + "num_input_tokens_seen": 72042920, + "step": 124080 + }, + { + "epoch": 18.481531128984212, + "grad_norm": 0.0517578125, + "learning_rate": 0.0005238042579649499, + "loss": 0.7828, + "num_input_tokens_seen": 72045864, + "step": 124085 + }, + { + "epoch": 18.482275841525173, + "grad_norm": 0.05517578125, + "learning_rate": 0.0005232936562508983, + "loss": 0.8077, + "num_input_tokens_seen": 72048648, + "step": 124090 + }, + { + "epoch": 18.48302055406613, + "grad_norm": 0.0546875, + "learning_rate": 0.0005227832991060882, + "loss": 0.8009, + "num_input_tokens_seen": 72051624, + "step": 124095 + }, + { + "epoch": 18.48376526660709, + "grad_norm": 0.045166015625, + "learning_rate": 0.0005222731865391377, + "loss": 0.7993, + "num_input_tokens_seen": 72054472, + "step": 124100 + }, + { + "epoch": 18.484509979148047, + "grad_norm": 0.0517578125, + "learning_rate": 0.0005217633185586701, + "loss": 0.7855, + "num_input_tokens_seen": 72057320, + "step": 124105 + }, + { + "epoch": 18.485254691689008, + "grad_norm": 0.0791015625, + "learning_rate": 0.0005212536951732949, + "loss": 0.8009, + "num_input_tokens_seen": 72060200, + "step": 124110 + }, + { + "epoch": 18.48599940422997, + "grad_norm": 0.053466796875, + "learning_rate": 0.0005207443163916237, + "loss": 0.7897, + "num_input_tokens_seen": 72063048, + "step": 124115 + }, + { + "epoch": 18.486744116770925, + "grad_norm": 0.06298828125, + "learning_rate": 0.0005202351822222611, + "loss": 0.7997, + "num_input_tokens_seen": 72065960, + "step": 124120 + }, + { + "epoch": 18.487488829311886, + "grad_norm": 0.04150390625, + "learning_rate": 0.0005197262926738072, + "loss": 0.7896, + "num_input_tokens_seen": 72068680, + "step": 124125 + }, + { + "epoch": 18.488233541852846, + "grad_norm": 0.05078125, + "learning_rate": 0.0005192176477548616, + "loss": 0.8036, + "num_input_tokens_seen": 72071912, + "step": 124130 + }, + { + "epoch": 18.488978254393803, + "grad_norm": 0.04541015625, + "learning_rate": 0.0005187092474740157, + "loss": 0.7969, + "num_input_tokens_seen": 72074696, + "step": 124135 + }, + { + "epoch": 18.489722966934764, + "grad_norm": 0.054443359375, + "learning_rate": 0.0005182010918398594, + "loss": 0.7762, + "num_input_tokens_seen": 72077704, + "step": 124140 + }, + { + "epoch": 18.49046767947572, + "grad_norm": 0.033203125, + "learning_rate": 0.0005176931808609758, + "loss": 0.7935, + "num_input_tokens_seen": 72080648, + "step": 124145 + }, + { + "epoch": 18.49121239201668, + "grad_norm": 0.08349609375, + "learning_rate": 0.000517185514545948, + "loss": 0.7826, + "num_input_tokens_seen": 72083560, + "step": 124150 + }, + { + "epoch": 18.491957104557642, + "grad_norm": 0.025634765625, + "learning_rate": 0.0005166780929033493, + "loss": 0.8007, + "num_input_tokens_seen": 72086472, + "step": 124155 + }, + { + "epoch": 18.4927018170986, + "grad_norm": 0.06396484375, + "learning_rate": 0.000516170915941756, + "loss": 0.8005, + "num_input_tokens_seen": 72089000, + "step": 124160 + }, + { + "epoch": 18.49344652963956, + "grad_norm": 0.0625, + "learning_rate": 0.0005156639836697363, + "loss": 0.7986, + "num_input_tokens_seen": 72092072, + "step": 124165 + }, + { + "epoch": 18.49419124218052, + "grad_norm": 0.05712890625, + "learning_rate": 0.0005151572960958484, + "loss": 0.8088, + "num_input_tokens_seen": 72094696, + "step": 124170 + }, + { + "epoch": 18.494935954721477, + "grad_norm": 0.0537109375, + "learning_rate": 0.0005146508532286587, + "loss": 0.7941, + "num_input_tokens_seen": 72097576, + "step": 124175 + }, + { + "epoch": 18.495680667262437, + "grad_norm": 0.0654296875, + "learning_rate": 0.0005141446550767203, + "loss": 0.7934, + "num_input_tokens_seen": 72100616, + "step": 124180 + }, + { + "epoch": 18.496425379803394, + "grad_norm": 0.0966796875, + "learning_rate": 0.0005136387016485849, + "loss": 0.8033, + "num_input_tokens_seen": 72103656, + "step": 124185 + }, + { + "epoch": 18.497170092344355, + "grad_norm": 0.044189453125, + "learning_rate": 0.0005131329929527989, + "loss": 0.7881, + "num_input_tokens_seen": 72106856, + "step": 124190 + }, + { + "epoch": 18.497914804885315, + "grad_norm": 0.037841796875, + "learning_rate": 0.0005126275289979105, + "loss": 0.7981, + "num_input_tokens_seen": 72109672, + "step": 124195 + }, + { + "epoch": 18.498659517426272, + "grad_norm": 0.0439453125, + "learning_rate": 0.0005121223097924543, + "loss": 0.8018, + "num_input_tokens_seen": 72112392, + "step": 124200 + }, + { + "epoch": 18.499404229967233, + "grad_norm": 0.052001953125, + "learning_rate": 0.0005116173353449637, + "loss": 0.8199, + "num_input_tokens_seen": 72115112, + "step": 124205 + }, + { + "epoch": 18.500148942508194, + "grad_norm": 0.036865234375, + "learning_rate": 0.000511112605663977, + "loss": 0.7939, + "num_input_tokens_seen": 72118088, + "step": 124210 + }, + { + "epoch": 18.50089365504915, + "grad_norm": 0.064453125, + "learning_rate": 0.0005106081207580138, + "loss": 0.7991, + "num_input_tokens_seen": 72121704, + "step": 124215 + }, + { + "epoch": 18.50163836759011, + "grad_norm": 0.040283203125, + "learning_rate": 0.0005101038806356022, + "loss": 0.8297, + "num_input_tokens_seen": 72124744, + "step": 124220 + }, + { + "epoch": 18.502383080131068, + "grad_norm": 0.08740234375, + "learning_rate": 0.0005095998853052574, + "loss": 0.8205, + "num_input_tokens_seen": 72127784, + "step": 124225 + }, + { + "epoch": 18.50312779267203, + "grad_norm": 0.038818359375, + "learning_rate": 0.0005090961347754958, + "loss": 0.793, + "num_input_tokens_seen": 72130568, + "step": 124230 + }, + { + "epoch": 18.50387250521299, + "grad_norm": 0.049560546875, + "learning_rate": 0.000508592629054827, + "loss": 0.7963, + "num_input_tokens_seen": 72133448, + "step": 124235 + }, + { + "epoch": 18.504617217753946, + "grad_norm": 0.052490234375, + "learning_rate": 0.0005080893681517579, + "loss": 0.7866, + "num_input_tokens_seen": 72136488, + "step": 124240 + }, + { + "epoch": 18.505361930294907, + "grad_norm": 0.055908203125, + "learning_rate": 0.0005075863520747898, + "loss": 0.7711, + "num_input_tokens_seen": 72139208, + "step": 124245 + }, + { + "epoch": 18.506106642835867, + "grad_norm": 0.035888671875, + "learning_rate": 0.0005070835808324209, + "loss": 0.7863, + "num_input_tokens_seen": 72141960, + "step": 124250 + }, + { + "epoch": 18.506851355376824, + "grad_norm": 0.058837890625, + "learning_rate": 0.0005065810544331461, + "loss": 0.8207, + "num_input_tokens_seen": 72144648, + "step": 124255 + }, + { + "epoch": 18.507596067917785, + "grad_norm": 0.0771484375, + "learning_rate": 0.0005060787728854521, + "loss": 0.7997, + "num_input_tokens_seen": 72147816, + "step": 124260 + }, + { + "epoch": 18.50834078045874, + "grad_norm": 0.064453125, + "learning_rate": 0.000505576736197827, + "loss": 0.8059, + "num_input_tokens_seen": 72150600, + "step": 124265 + }, + { + "epoch": 18.509085492999702, + "grad_norm": 0.034423828125, + "learning_rate": 0.0005050749443787489, + "loss": 0.8117, + "num_input_tokens_seen": 72153352, + "step": 124270 + }, + { + "epoch": 18.509830205540663, + "grad_norm": 0.04541015625, + "learning_rate": 0.0005045733974367011, + "loss": 0.8207, + "num_input_tokens_seen": 72156168, + "step": 124275 + }, + { + "epoch": 18.51057491808162, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0005040720953801536, + "loss": 0.7942, + "num_input_tokens_seen": 72159016, + "step": 124280 + }, + { + "epoch": 18.51131963062258, + "grad_norm": 0.052490234375, + "learning_rate": 0.0005035710382175712, + "loss": 0.7925, + "num_input_tokens_seen": 72161768, + "step": 124285 + }, + { + "epoch": 18.512064343163537, + "grad_norm": 0.038818359375, + "learning_rate": 0.0005030702259574254, + "loss": 0.8048, + "num_input_tokens_seen": 72164680, + "step": 124290 + }, + { + "epoch": 18.512809055704498, + "grad_norm": 0.061767578125, + "learning_rate": 0.0005025696586081729, + "loss": 0.7747, + "num_input_tokens_seen": 72167272, + "step": 124295 + }, + { + "epoch": 18.51355376824546, + "grad_norm": 0.0546875, + "learning_rate": 0.0005020693361782735, + "loss": 0.7906, + "num_input_tokens_seen": 72170216, + "step": 124300 + }, + { + "epoch": 18.514298480786415, + "grad_norm": 0.0654296875, + "learning_rate": 0.0005015692586761772, + "loss": 0.8037, + "num_input_tokens_seen": 72173416, + "step": 124305 + }, + { + "epoch": 18.515043193327376, + "grad_norm": 0.037353515625, + "learning_rate": 0.0005010694261103304, + "loss": 0.7943, + "num_input_tokens_seen": 72176264, + "step": 124310 + }, + { + "epoch": 18.515787905868336, + "grad_norm": 0.0478515625, + "learning_rate": 0.0005005698384891816, + "loss": 0.8021, + "num_input_tokens_seen": 72179272, + "step": 124315 + }, + { + "epoch": 18.516532618409293, + "grad_norm": 0.054931640625, + "learning_rate": 0.0005000704958211688, + "loss": 0.7908, + "num_input_tokens_seen": 72182248, + "step": 124320 + }, + { + "epoch": 18.517277330950254, + "grad_norm": 0.0546875, + "learning_rate": 0.000499571398114727, + "loss": 0.7874, + "num_input_tokens_seen": 72185320, + "step": 124325 + }, + { + "epoch": 18.51802204349121, + "grad_norm": 0.03955078125, + "learning_rate": 0.0004990725453782895, + "loss": 0.8019, + "num_input_tokens_seen": 72188456, + "step": 124330 + }, + { + "epoch": 18.51876675603217, + "grad_norm": 0.08642578125, + "learning_rate": 0.000498573937620283, + "loss": 0.8018, + "num_input_tokens_seen": 72190984, + "step": 124335 + }, + { + "epoch": 18.519511468573132, + "grad_norm": 0.0380859375, + "learning_rate": 0.0004980755748491306, + "loss": 0.8204, + "num_input_tokens_seen": 72194024, + "step": 124340 + }, + { + "epoch": 18.52025618111409, + "grad_norm": 0.1005859375, + "learning_rate": 0.0004975774570732522, + "loss": 0.7865, + "num_input_tokens_seen": 72196872, + "step": 124345 + }, + { + "epoch": 18.52100089365505, + "grad_norm": 0.03857421875, + "learning_rate": 0.0004970795843010628, + "loss": 0.8095, + "num_input_tokens_seen": 72199784, + "step": 124350 + }, + { + "epoch": 18.52174560619601, + "grad_norm": 0.04931640625, + "learning_rate": 0.0004965819565409757, + "loss": 0.7989, + "num_input_tokens_seen": 72202536, + "step": 124355 + }, + { + "epoch": 18.522490318736967, + "grad_norm": 0.03173828125, + "learning_rate": 0.0004960845738013958, + "loss": 0.7937, + "num_input_tokens_seen": 72205192, + "step": 124360 + }, + { + "epoch": 18.523235031277927, + "grad_norm": 0.049560546875, + "learning_rate": 0.000495587436090723, + "loss": 0.7934, + "num_input_tokens_seen": 72208040, + "step": 124365 + }, + { + "epoch": 18.523979743818884, + "grad_norm": 0.031494140625, + "learning_rate": 0.0004950905434173624, + "loss": 0.8013, + "num_input_tokens_seen": 72210856, + "step": 124370 + }, + { + "epoch": 18.524724456359845, + "grad_norm": 0.0546875, + "learning_rate": 0.0004945938957897039, + "loss": 0.7993, + "num_input_tokens_seen": 72213768, + "step": 124375 + }, + { + "epoch": 18.525469168900806, + "grad_norm": 0.058349609375, + "learning_rate": 0.000494097493216139, + "loss": 0.8053, + "num_input_tokens_seen": 72216808, + "step": 124380 + }, + { + "epoch": 18.526213881441763, + "grad_norm": 0.042236328125, + "learning_rate": 0.0004936013357050512, + "loss": 0.8138, + "num_input_tokens_seen": 72219368, + "step": 124385 + }, + { + "epoch": 18.526958593982723, + "grad_norm": 0.03759765625, + "learning_rate": 0.0004931054232648285, + "loss": 0.8124, + "num_input_tokens_seen": 72222024, + "step": 124390 + }, + { + "epoch": 18.527703306523684, + "grad_norm": 0.038818359375, + "learning_rate": 0.0004926097559038427, + "loss": 0.7863, + "num_input_tokens_seen": 72224968, + "step": 124395 + }, + { + "epoch": 18.52844801906464, + "grad_norm": 0.034423828125, + "learning_rate": 0.0004921143336304723, + "loss": 0.8048, + "num_input_tokens_seen": 72228008, + "step": 124400 + }, + { + "epoch": 18.5291927316056, + "grad_norm": 0.038330078125, + "learning_rate": 0.0004916191564530853, + "loss": 0.8104, + "num_input_tokens_seen": 72231112, + "step": 124405 + }, + { + "epoch": 18.529937444146558, + "grad_norm": 0.037109375, + "learning_rate": 0.000491124224380045, + "loss": 0.7991, + "num_input_tokens_seen": 72234056, + "step": 124410 + }, + { + "epoch": 18.53068215668752, + "grad_norm": 0.0439453125, + "learning_rate": 0.0004906295374197167, + "loss": 0.8033, + "num_input_tokens_seen": 72236968, + "step": 124415 + }, + { + "epoch": 18.53142686922848, + "grad_norm": 0.048583984375, + "learning_rate": 0.0004901350955804534, + "loss": 0.7933, + "num_input_tokens_seen": 72239976, + "step": 124420 + }, + { + "epoch": 18.532171581769436, + "grad_norm": 0.06396484375, + "learning_rate": 0.0004896408988706135, + "loss": 0.8174, + "num_input_tokens_seen": 72242760, + "step": 124425 + }, + { + "epoch": 18.532916294310397, + "grad_norm": 0.044677734375, + "learning_rate": 0.0004891469472985405, + "loss": 0.8314, + "num_input_tokens_seen": 72245608, + "step": 124430 + }, + { + "epoch": 18.533661006851354, + "grad_norm": 0.030029296875, + "learning_rate": 0.0004886532408725824, + "loss": 0.8026, + "num_input_tokens_seen": 72248456, + "step": 124435 + }, + { + "epoch": 18.534405719392314, + "grad_norm": 0.0791015625, + "learning_rate": 0.0004881597796010811, + "loss": 0.7945, + "num_input_tokens_seen": 72251368, + "step": 124440 + }, + { + "epoch": 18.535150431933275, + "grad_norm": 0.036376953125, + "learning_rate": 0.0004876665634923699, + "loss": 0.8063, + "num_input_tokens_seen": 72254152, + "step": 124445 + }, + { + "epoch": 18.53589514447423, + "grad_norm": 0.038330078125, + "learning_rate": 0.0004871735925547821, + "loss": 0.7884, + "num_input_tokens_seen": 72256840, + "step": 124450 + }, + { + "epoch": 18.536639857015192, + "grad_norm": 0.04150390625, + "learning_rate": 0.000486680866796646, + "loss": 0.8181, + "num_input_tokens_seen": 72259656, + "step": 124455 + }, + { + "epoch": 18.537384569556153, + "grad_norm": 0.05615234375, + "learning_rate": 0.00048618838622628666, + "loss": 0.7981, + "num_input_tokens_seen": 72262600, + "step": 124460 + }, + { + "epoch": 18.53812928209711, + "grad_norm": 0.06640625, + "learning_rate": 0.00048569615085202074, + "loss": 0.7816, + "num_input_tokens_seen": 72265992, + "step": 124465 + }, + { + "epoch": 18.53887399463807, + "grad_norm": 0.03662109375, + "learning_rate": 0.00048520416068217, + "loss": 0.7769, + "num_input_tokens_seen": 72268776, + "step": 124470 + }, + { + "epoch": 18.539618707179027, + "grad_norm": 0.040283203125, + "learning_rate": 0.00048471241572504097, + "loss": 0.7914, + "num_input_tokens_seen": 72271688, + "step": 124475 + }, + { + "epoch": 18.540363419719988, + "grad_norm": 0.050537109375, + "learning_rate": 0.00048422091598894376, + "loss": 0.8013, + "num_input_tokens_seen": 72274600, + "step": 124480 + }, + { + "epoch": 18.54110813226095, + "grad_norm": 0.05908203125, + "learning_rate": 0.00048372966148218176, + "loss": 0.805, + "num_input_tokens_seen": 72277288, + "step": 124485 + }, + { + "epoch": 18.541852844801905, + "grad_norm": 0.03662109375, + "learning_rate": 0.0004832386522130516, + "loss": 0.804, + "num_input_tokens_seen": 72280072, + "step": 124490 + }, + { + "epoch": 18.542597557342866, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0004827478881898517, + "loss": 0.8134, + "num_input_tokens_seen": 72282920, + "step": 124495 + }, + { + "epoch": 18.543342269883826, + "grad_norm": 0.037353515625, + "learning_rate": 0.00048225736942087035, + "loss": 0.8075, + "num_input_tokens_seen": 72285896, + "step": 124500 + }, + { + "epoch": 18.544086982424783, + "grad_norm": 0.03662109375, + "learning_rate": 0.000481767095914396, + "loss": 0.7862, + "num_input_tokens_seen": 72288488, + "step": 124505 + }, + { + "epoch": 18.544831694965744, + "grad_norm": 0.033203125, + "learning_rate": 0.00048127706767871203, + "loss": 0.7921, + "num_input_tokens_seen": 72291272, + "step": 124510 + }, + { + "epoch": 18.5455764075067, + "grad_norm": 0.05322265625, + "learning_rate": 0.00048078728472209507, + "loss": 0.8028, + "num_input_tokens_seen": 72293928, + "step": 124515 + }, + { + "epoch": 18.54632112004766, + "grad_norm": 0.046630859375, + "learning_rate": 0.00048029774705282024, + "loss": 0.7961, + "num_input_tokens_seen": 72296808, + "step": 124520 + }, + { + "epoch": 18.547065832588622, + "grad_norm": 0.0380859375, + "learning_rate": 0.0004798084546791575, + "loss": 0.7965, + "num_input_tokens_seen": 72299560, + "step": 124525 + }, + { + "epoch": 18.54781054512958, + "grad_norm": 0.06103515625, + "learning_rate": 0.00047931940760937374, + "loss": 0.7973, + "num_input_tokens_seen": 72302440, + "step": 124530 + }, + { + "epoch": 18.54855525767054, + "grad_norm": 0.042236328125, + "learning_rate": 0.00047883060585172883, + "loss": 0.8063, + "num_input_tokens_seen": 72305256, + "step": 124535 + }, + { + "epoch": 18.5492999702115, + "grad_norm": 0.054443359375, + "learning_rate": 0.00047834204941448464, + "loss": 0.7975, + "num_input_tokens_seen": 72307944, + "step": 124540 + }, + { + "epoch": 18.550044682752457, + "grad_norm": 0.048583984375, + "learning_rate": 0.0004778537383058895, + "loss": 0.8011, + "num_input_tokens_seen": 72310760, + "step": 124545 + }, + { + "epoch": 18.550789395293418, + "grad_norm": 0.0419921875, + "learning_rate": 0.00047736567253419846, + "loss": 0.8059, + "num_input_tokens_seen": 72313704, + "step": 124550 + }, + { + "epoch": 18.551534107834375, + "grad_norm": 0.04443359375, + "learning_rate": 0.0004768778521076533, + "loss": 0.7943, + "num_input_tokens_seen": 72316808, + "step": 124555 + }, + { + "epoch": 18.552278820375335, + "grad_norm": 0.050048828125, + "learning_rate": 0.00047639027703449575, + "loss": 0.762, + "num_input_tokens_seen": 72319592, + "step": 124560 + }, + { + "epoch": 18.553023532916296, + "grad_norm": 0.035888671875, + "learning_rate": 0.0004759029473229642, + "loss": 0.7831, + "num_input_tokens_seen": 72322536, + "step": 124565 + }, + { + "epoch": 18.553768245457253, + "grad_norm": 0.055908203125, + "learning_rate": 0.00047541586298129046, + "loss": 0.812, + "num_input_tokens_seen": 72325448, + "step": 124570 + }, + { + "epoch": 18.554512957998213, + "grad_norm": 0.040771484375, + "learning_rate": 0.00047492902401770283, + "loss": 0.7835, + "num_input_tokens_seen": 72328616, + "step": 124575 + }, + { + "epoch": 18.55525767053917, + "grad_norm": 0.040283203125, + "learning_rate": 0.0004744424304404282, + "loss": 0.8051, + "num_input_tokens_seen": 72331528, + "step": 124580 + }, + { + "epoch": 18.55600238308013, + "grad_norm": 0.053466796875, + "learning_rate": 0.0004739560822576849, + "loss": 0.7972, + "num_input_tokens_seen": 72334248, + "step": 124585 + }, + { + "epoch": 18.55674709562109, + "grad_norm": 0.06689453125, + "learning_rate": 0.00047346997947768973, + "loss": 0.8088, + "num_input_tokens_seen": 72336936, + "step": 124590 + }, + { + "epoch": 18.557491808162048, + "grad_norm": 0.04052734375, + "learning_rate": 0.0004729841221086561, + "loss": 0.8083, + "num_input_tokens_seen": 72339720, + "step": 124595 + }, + { + "epoch": 18.55823652070301, + "grad_norm": 0.03759765625, + "learning_rate": 0.0004724985101587908, + "loss": 0.7807, + "num_input_tokens_seen": 72342504, + "step": 124600 + }, + { + "epoch": 18.55898123324397, + "grad_norm": 0.0625, + "learning_rate": 0.00047201314363629564, + "loss": 0.7911, + "num_input_tokens_seen": 72345128, + "step": 124605 + }, + { + "epoch": 18.559725945784926, + "grad_norm": 0.037109375, + "learning_rate": 0.00047152802254937565, + "loss": 0.8008, + "num_input_tokens_seen": 72348392, + "step": 124610 + }, + { + "epoch": 18.560470658325887, + "grad_norm": 0.034423828125, + "learning_rate": 0.00047104314690622095, + "loss": 0.7732, + "num_input_tokens_seen": 72351112, + "step": 124615 + }, + { + "epoch": 18.561215370866844, + "grad_norm": 0.05224609375, + "learning_rate": 0.00047055851671502834, + "loss": 0.8259, + "num_input_tokens_seen": 72353960, + "step": 124620 + }, + { + "epoch": 18.561960083407804, + "grad_norm": 0.0546875, + "learning_rate": 0.00047007413198398127, + "loss": 0.8084, + "num_input_tokens_seen": 72356680, + "step": 124625 + }, + { + "epoch": 18.562704795948765, + "grad_norm": 0.05712890625, + "learning_rate": 0.00046958999272126486, + "loss": 0.8007, + "num_input_tokens_seen": 72359400, + "step": 124630 + }, + { + "epoch": 18.56344950848972, + "grad_norm": 0.06787109375, + "learning_rate": 0.0004691060989350576, + "loss": 0.7886, + "num_input_tokens_seen": 72362248, + "step": 124635 + }, + { + "epoch": 18.564194221030682, + "grad_norm": 0.039306640625, + "learning_rate": 0.0004686224506335329, + "loss": 0.79, + "num_input_tokens_seen": 72365000, + "step": 124640 + }, + { + "epoch": 18.564938933571643, + "grad_norm": 0.05859375, + "learning_rate": 0.00046813904782486425, + "loss": 0.7855, + "num_input_tokens_seen": 72367752, + "step": 124645 + }, + { + "epoch": 18.5656836461126, + "grad_norm": 0.09912109375, + "learning_rate": 0.00046765589051721675, + "loss": 0.7989, + "num_input_tokens_seen": 72370824, + "step": 124650 + }, + { + "epoch": 18.56642835865356, + "grad_norm": 0.03759765625, + "learning_rate": 0.00046717297871875396, + "loss": 0.7962, + "num_input_tokens_seen": 72373608, + "step": 124655 + }, + { + "epoch": 18.567173071194517, + "grad_norm": 0.045654296875, + "learning_rate": 0.0004666903124376309, + "loss": 0.7895, + "num_input_tokens_seen": 72376552, + "step": 124660 + }, + { + "epoch": 18.567917783735478, + "grad_norm": 0.041259765625, + "learning_rate": 0.0004662078916820045, + "loss": 0.7972, + "num_input_tokens_seen": 72379624, + "step": 124665 + }, + { + "epoch": 18.56866249627644, + "grad_norm": 0.0849609375, + "learning_rate": 0.0004657257164600248, + "loss": 0.8051, + "num_input_tokens_seen": 72382472, + "step": 124670 + }, + { + "epoch": 18.569407208817395, + "grad_norm": 0.087890625, + "learning_rate": 0.000465243786779837, + "loss": 0.8055, + "num_input_tokens_seen": 72385704, + "step": 124675 + }, + { + "epoch": 18.570151921358356, + "grad_norm": 0.03955078125, + "learning_rate": 0.0004647621026495813, + "loss": 0.7999, + "num_input_tokens_seen": 72388200, + "step": 124680 + }, + { + "epoch": 18.570896633899316, + "grad_norm": 0.041259765625, + "learning_rate": 0.00046428066407739943, + "loss": 0.8045, + "num_input_tokens_seen": 72390920, + "step": 124685 + }, + { + "epoch": 18.571641346440273, + "grad_norm": 0.061279296875, + "learning_rate": 0.0004637994710714233, + "loss": 0.79, + "num_input_tokens_seen": 72393768, + "step": 124690 + }, + { + "epoch": 18.572386058981234, + "grad_norm": 0.052001953125, + "learning_rate": 0.00046331852363977797, + "loss": 0.8179, + "num_input_tokens_seen": 72396616, + "step": 124695 + }, + { + "epoch": 18.57313077152219, + "grad_norm": 0.04150390625, + "learning_rate": 0.0004628378217905937, + "loss": 0.7819, + "num_input_tokens_seen": 72399784, + "step": 124700 + }, + { + "epoch": 18.57387548406315, + "grad_norm": 0.06494140625, + "learning_rate": 0.000462357365531989, + "loss": 0.7944, + "num_input_tokens_seen": 72402920, + "step": 124705 + }, + { + "epoch": 18.574620196604112, + "grad_norm": 0.0654296875, + "learning_rate": 0.00046187715487208236, + "loss": 0.8004, + "num_input_tokens_seen": 72405704, + "step": 124710 + }, + { + "epoch": 18.57536490914507, + "grad_norm": 0.03466796875, + "learning_rate": 0.00046139718981898226, + "loss": 0.7995, + "num_input_tokens_seen": 72408808, + "step": 124715 + }, + { + "epoch": 18.57610962168603, + "grad_norm": 0.0849609375, + "learning_rate": 0.0004609174703808039, + "loss": 0.8001, + "num_input_tokens_seen": 72412168, + "step": 124720 + }, + { + "epoch": 18.57685433422699, + "grad_norm": 0.056640625, + "learning_rate": 0.00046043799656564577, + "loss": 0.8069, + "num_input_tokens_seen": 72414760, + "step": 124725 + }, + { + "epoch": 18.577599046767947, + "grad_norm": 0.060546875, + "learning_rate": 0.00045995876838160975, + "loss": 0.8315, + "num_input_tokens_seen": 72417640, + "step": 124730 + }, + { + "epoch": 18.578343759308908, + "grad_norm": 0.037353515625, + "learning_rate": 0.00045947978583679434, + "loss": 0.7982, + "num_input_tokens_seen": 72420456, + "step": 124735 + }, + { + "epoch": 18.579088471849865, + "grad_norm": 0.03369140625, + "learning_rate": 0.0004590010489392882, + "loss": 0.822, + "num_input_tokens_seen": 72423464, + "step": 124740 + }, + { + "epoch": 18.579833184390825, + "grad_norm": 0.042724609375, + "learning_rate": 0.0004585225576971813, + "loss": 0.8093, + "num_input_tokens_seen": 72426472, + "step": 124745 + }, + { + "epoch": 18.580577896931786, + "grad_norm": 0.056640625, + "learning_rate": 0.00045804431211855567, + "loss": 0.7934, + "num_input_tokens_seen": 72429320, + "step": 124750 + }, + { + "epoch": 18.581322609472743, + "grad_norm": 0.068359375, + "learning_rate": 0.0004575663122114931, + "loss": 0.7883, + "num_input_tokens_seen": 72432360, + "step": 124755 + }, + { + "epoch": 18.582067322013703, + "grad_norm": 0.05419921875, + "learning_rate": 0.00045708855798406555, + "loss": 0.8003, + "num_input_tokens_seen": 72435112, + "step": 124760 + }, + { + "epoch": 18.582812034554664, + "grad_norm": 0.05419921875, + "learning_rate": 0.00045661104944434813, + "loss": 0.7968, + "num_input_tokens_seen": 72438152, + "step": 124765 + }, + { + "epoch": 18.58355674709562, + "grad_norm": 0.03466796875, + "learning_rate": 0.00045613378660040445, + "loss": 0.7915, + "num_input_tokens_seen": 72440968, + "step": 124770 + }, + { + "epoch": 18.58430145963658, + "grad_norm": 0.047607421875, + "learning_rate": 0.00045565676946029976, + "loss": 0.8033, + "num_input_tokens_seen": 72443688, + "step": 124775 + }, + { + "epoch": 18.585046172177538, + "grad_norm": 0.07958984375, + "learning_rate": 0.0004551799980320925, + "loss": 0.7905, + "num_input_tokens_seen": 72446600, + "step": 124780 + }, + { + "epoch": 18.5857908847185, + "grad_norm": 0.055908203125, + "learning_rate": 0.00045470347232383297, + "loss": 0.7906, + "num_input_tokens_seen": 72449960, + "step": 124785 + }, + { + "epoch": 18.58653559725946, + "grad_norm": 0.07666015625, + "learning_rate": 0.0004542271923435781, + "loss": 0.7933, + "num_input_tokens_seen": 72452648, + "step": 124790 + }, + { + "epoch": 18.587280309800416, + "grad_norm": 0.051025390625, + "learning_rate": 0.0004537511580993697, + "loss": 0.7946, + "num_input_tokens_seen": 72455624, + "step": 124795 + }, + { + "epoch": 18.588025022341377, + "grad_norm": 0.0673828125, + "learning_rate": 0.00045327536959925306, + "loss": 0.7954, + "num_input_tokens_seen": 72458312, + "step": 124800 + }, + { + "epoch": 18.588769734882334, + "grad_norm": 0.046630859375, + "learning_rate": 0.0004527998268512634, + "loss": 0.8027, + "num_input_tokens_seen": 72461000, + "step": 124805 + }, + { + "epoch": 18.589514447423294, + "grad_norm": 0.056884765625, + "learning_rate": 0.00045232452986343426, + "loss": 0.7842, + "num_input_tokens_seen": 72464040, + "step": 124810 + }, + { + "epoch": 18.590259159964255, + "grad_norm": 0.0289306640625, + "learning_rate": 0.00045184947864379766, + "loss": 0.7982, + "num_input_tokens_seen": 72466888, + "step": 124815 + }, + { + "epoch": 18.591003872505212, + "grad_norm": 0.050537109375, + "learning_rate": 0.00045137467320037705, + "loss": 0.7917, + "num_input_tokens_seen": 72470088, + "step": 124820 + }, + { + "epoch": 18.591748585046172, + "grad_norm": 0.0693359375, + "learning_rate": 0.00045090011354119604, + "loss": 0.7981, + "num_input_tokens_seen": 72472840, + "step": 124825 + }, + { + "epoch": 18.592493297587133, + "grad_norm": 0.041748046875, + "learning_rate": 0.00045042579967426996, + "loss": 0.7967, + "num_input_tokens_seen": 72475848, + "step": 124830 + }, + { + "epoch": 18.59323801012809, + "grad_norm": 0.051513671875, + "learning_rate": 0.00044995173160761404, + "loss": 0.8255, + "num_input_tokens_seen": 72478728, + "step": 124835 + }, + { + "epoch": 18.59398272266905, + "grad_norm": 0.045166015625, + "learning_rate": 0.0004494779093492351, + "loss": 0.8047, + "num_input_tokens_seen": 72481800, + "step": 124840 + }, + { + "epoch": 18.594727435210007, + "grad_norm": 0.052001953125, + "learning_rate": 0.0004490043329071369, + "loss": 0.7979, + "num_input_tokens_seen": 72484552, + "step": 124845 + }, + { + "epoch": 18.595472147750968, + "grad_norm": 0.0595703125, + "learning_rate": 0.0004485310022893229, + "loss": 0.7986, + "num_input_tokens_seen": 72487176, + "step": 124850 + }, + { + "epoch": 18.59621686029193, + "grad_norm": 0.060546875, + "learning_rate": 0.00044805791750378684, + "loss": 0.7865, + "num_input_tokens_seen": 72490120, + "step": 124855 + }, + { + "epoch": 18.596961572832885, + "grad_norm": 0.07958984375, + "learning_rate": 0.0004475850785585239, + "loss": 0.7879, + "num_input_tokens_seen": 72493064, + "step": 124860 + }, + { + "epoch": 18.597706285373846, + "grad_norm": 0.041015625, + "learning_rate": 0.00044711248546151934, + "loss": 0.809, + "num_input_tokens_seen": 72496264, + "step": 124865 + }, + { + "epoch": 18.598450997914806, + "grad_norm": 0.051513671875, + "learning_rate": 0.0004466401382207585, + "loss": 0.8088, + "num_input_tokens_seen": 72499336, + "step": 124870 + }, + { + "epoch": 18.599195710455763, + "grad_norm": 0.05712890625, + "learning_rate": 0.00044616803684422167, + "loss": 0.7811, + "num_input_tokens_seen": 72502440, + "step": 124875 + }, + { + "epoch": 18.599940422996724, + "grad_norm": 0.050048828125, + "learning_rate": 0.00044569618133988406, + "loss": 0.802, + "num_input_tokens_seen": 72505192, + "step": 124880 + }, + { + "epoch": 18.60068513553768, + "grad_norm": 0.0225830078125, + "learning_rate": 0.0004452245717157177, + "loss": 0.8044, + "num_input_tokens_seen": 72508264, + "step": 124885 + }, + { + "epoch": 18.60142984807864, + "grad_norm": 0.034912109375, + "learning_rate": 0.0004447532079796879, + "loss": 0.7903, + "num_input_tokens_seen": 72511176, + "step": 124890 + }, + { + "epoch": 18.602174560619602, + "grad_norm": 0.03076171875, + "learning_rate": 0.0004442820901397598, + "loss": 0.7939, + "num_input_tokens_seen": 72514248, + "step": 124895 + }, + { + "epoch": 18.60291927316056, + "grad_norm": 0.036865234375, + "learning_rate": 0.00044381121820389225, + "loss": 0.8042, + "num_input_tokens_seen": 72517192, + "step": 124900 + }, + { + "epoch": 18.60366398570152, + "grad_norm": 0.064453125, + "learning_rate": 0.0004433405921800404, + "loss": 0.8126, + "num_input_tokens_seen": 72520008, + "step": 124905 + }, + { + "epoch": 18.60440869824248, + "grad_norm": 0.03662109375, + "learning_rate": 0.0004428702120761546, + "loss": 0.7859, + "num_input_tokens_seen": 72522760, + "step": 124910 + }, + { + "epoch": 18.605153410783437, + "grad_norm": 0.043701171875, + "learning_rate": 0.0004424000779001819, + "loss": 0.7877, + "num_input_tokens_seen": 72525672, + "step": 124915 + }, + { + "epoch": 18.605898123324398, + "grad_norm": 0.052001953125, + "learning_rate": 0.0004419301896600608, + "loss": 0.794, + "num_input_tokens_seen": 72528712, + "step": 124920 + }, + { + "epoch": 18.606642835865355, + "grad_norm": 0.0537109375, + "learning_rate": 0.0004414605473637367, + "loss": 0.7655, + "num_input_tokens_seen": 72531432, + "step": 124925 + }, + { + "epoch": 18.607387548406315, + "grad_norm": 0.0537109375, + "learning_rate": 0.00044099115101913834, + "loss": 0.7938, + "num_input_tokens_seen": 72534568, + "step": 124930 + }, + { + "epoch": 18.608132260947276, + "grad_norm": 0.02587890625, + "learning_rate": 0.00044052200063419755, + "loss": 0.8105, + "num_input_tokens_seen": 72538024, + "step": 124935 + }, + { + "epoch": 18.608876973488233, + "grad_norm": 0.04345703125, + "learning_rate": 0.0004400530962168414, + "loss": 0.8022, + "num_input_tokens_seen": 72540808, + "step": 124940 + }, + { + "epoch": 18.609621686029193, + "grad_norm": 0.05615234375, + "learning_rate": 0.00043958443777498686, + "loss": 0.7991, + "num_input_tokens_seen": 72543656, + "step": 124945 + }, + { + "epoch": 18.61036639857015, + "grad_norm": 0.044921875, + "learning_rate": 0.0004391160253165577, + "loss": 0.8076, + "num_input_tokens_seen": 72546312, + "step": 124950 + }, + { + "epoch": 18.61111111111111, + "grad_norm": 0.045166015625, + "learning_rate": 0.00043864785884946243, + "loss": 0.81, + "num_input_tokens_seen": 72549032, + "step": 124955 + }, + { + "epoch": 18.61185582365207, + "grad_norm": 0.052001953125, + "learning_rate": 0.00043817993838161316, + "loss": 0.808, + "num_input_tokens_seen": 72551848, + "step": 124960 + }, + { + "epoch": 18.61260053619303, + "grad_norm": 0.048583984375, + "learning_rate": 0.00043771226392091354, + "loss": 0.7955, + "num_input_tokens_seen": 72554696, + "step": 124965 + }, + { + "epoch": 18.61334524873399, + "grad_norm": 0.0361328125, + "learning_rate": 0.00043724483547526394, + "loss": 0.8032, + "num_input_tokens_seen": 72557832, + "step": 124970 + }, + { + "epoch": 18.61408996127495, + "grad_norm": 0.029052734375, + "learning_rate": 0.000436777653052563, + "loss": 0.7792, + "num_input_tokens_seen": 72561064, + "step": 124975 + }, + { + "epoch": 18.614834673815906, + "grad_norm": 0.044677734375, + "learning_rate": 0.00043631071666070276, + "loss": 0.8141, + "num_input_tokens_seen": 72563816, + "step": 124980 + }, + { + "epoch": 18.615579386356867, + "grad_norm": 0.042724609375, + "learning_rate": 0.0004358440263075719, + "loss": 0.7973, + "num_input_tokens_seen": 72567144, + "step": 124985 + }, + { + "epoch": 18.616324098897824, + "grad_norm": 0.0712890625, + "learning_rate": 0.0004353775820010508, + "loss": 0.7909, + "num_input_tokens_seen": 72570024, + "step": 124990 + }, + { + "epoch": 18.617068811438784, + "grad_norm": 0.06787109375, + "learning_rate": 0.0004349113837490248, + "loss": 0.8036, + "num_input_tokens_seen": 72572680, + "step": 124995 + }, + { + "epoch": 18.617813523979745, + "grad_norm": 0.06787109375, + "learning_rate": 0.000434445431559366, + "loss": 0.7762, + "num_input_tokens_seen": 72575432, + "step": 125000 + }, + { + "epoch": 18.618558236520702, + "grad_norm": 0.053955078125, + "learning_rate": 0.0004339797254399497, + "loss": 0.8049, + "num_input_tokens_seen": 72578376, + "step": 125005 + }, + { + "epoch": 18.619302949061662, + "grad_norm": 0.064453125, + "learning_rate": 0.00043351426539864124, + "loss": 0.7782, + "num_input_tokens_seen": 72581416, + "step": 125010 + }, + { + "epoch": 18.620047661602623, + "grad_norm": 0.051025390625, + "learning_rate": 0.0004330490514433027, + "loss": 0.7957, + "num_input_tokens_seen": 72584424, + "step": 125015 + }, + { + "epoch": 18.62079237414358, + "grad_norm": 0.0419921875, + "learning_rate": 0.00043258408358179786, + "loss": 0.8191, + "num_input_tokens_seen": 72587432, + "step": 125020 + }, + { + "epoch": 18.62153708668454, + "grad_norm": 0.06982421875, + "learning_rate": 0.00043211936182197707, + "loss": 0.7887, + "num_input_tokens_seen": 72590184, + "step": 125025 + }, + { + "epoch": 18.622281799225497, + "grad_norm": 0.07080078125, + "learning_rate": 0.00043165488617169566, + "loss": 0.8088, + "num_input_tokens_seen": 72593096, + "step": 125030 + }, + { + "epoch": 18.623026511766458, + "grad_norm": 0.049560546875, + "learning_rate": 0.0004311906566387974, + "loss": 0.8068, + "num_input_tokens_seen": 72595912, + "step": 125035 + }, + { + "epoch": 18.62377122430742, + "grad_norm": 0.040771484375, + "learning_rate": 0.000430726673231126, + "loss": 0.8114, + "num_input_tokens_seen": 72598760, + "step": 125040 + }, + { + "epoch": 18.624515936848375, + "grad_norm": 0.05712890625, + "learning_rate": 0.0004302629359565202, + "loss": 0.8021, + "num_input_tokens_seen": 72601448, + "step": 125045 + }, + { + "epoch": 18.625260649389336, + "grad_norm": 0.08349609375, + "learning_rate": 0.0004297994448228137, + "loss": 0.7975, + "num_input_tokens_seen": 72604200, + "step": 125050 + }, + { + "epoch": 18.626005361930297, + "grad_norm": 0.044189453125, + "learning_rate": 0.000429336199837837, + "loss": 0.8105, + "num_input_tokens_seen": 72608168, + "step": 125055 + }, + { + "epoch": 18.626750074471254, + "grad_norm": 0.06103515625, + "learning_rate": 0.00042887320100941536, + "loss": 0.809, + "num_input_tokens_seen": 72610984, + "step": 125060 + }, + { + "epoch": 18.627494787012214, + "grad_norm": 0.052001953125, + "learning_rate": 0.00042841044834537265, + "loss": 0.7907, + "num_input_tokens_seen": 72613832, + "step": 125065 + }, + { + "epoch": 18.62823949955317, + "grad_norm": 0.05224609375, + "learning_rate": 0.00042794794185352424, + "loss": 0.8039, + "num_input_tokens_seen": 72617000, + "step": 125070 + }, + { + "epoch": 18.62898421209413, + "grad_norm": 0.038330078125, + "learning_rate": 0.0004274856815416872, + "loss": 0.812, + "num_input_tokens_seen": 72619560, + "step": 125075 + }, + { + "epoch": 18.629728924635092, + "grad_norm": 0.0322265625, + "learning_rate": 0.00042702366741766693, + "loss": 0.7873, + "num_input_tokens_seen": 72622536, + "step": 125080 + }, + { + "epoch": 18.63047363717605, + "grad_norm": 0.03955078125, + "learning_rate": 0.00042656189948927223, + "loss": 0.8085, + "num_input_tokens_seen": 72625416, + "step": 125085 + }, + { + "epoch": 18.63121834971701, + "grad_norm": 0.0400390625, + "learning_rate": 0.0004261003777643019, + "loss": 0.782, + "num_input_tokens_seen": 72628168, + "step": 125090 + }, + { + "epoch": 18.631963062257967, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00042563910225055124, + "loss": 0.7841, + "num_input_tokens_seen": 72631176, + "step": 125095 + }, + { + "epoch": 18.632707774798927, + "grad_norm": 0.0654296875, + "learning_rate": 0.00042517807295581745, + "loss": 0.781, + "num_input_tokens_seen": 72634024, + "step": 125100 + }, + { + "epoch": 18.633452487339888, + "grad_norm": 0.054443359375, + "learning_rate": 0.0004247172898878876, + "loss": 0.797, + "num_input_tokens_seen": 72636904, + "step": 125105 + }, + { + "epoch": 18.634197199880845, + "grad_norm": 0.049560546875, + "learning_rate": 0.00042425675305454377, + "loss": 0.7889, + "num_input_tokens_seen": 72639976, + "step": 125110 + }, + { + "epoch": 18.634941912421805, + "grad_norm": 0.058349609375, + "learning_rate": 0.00042379646246356814, + "loss": 0.8, + "num_input_tokens_seen": 72642888, + "step": 125115 + }, + { + "epoch": 18.635686624962766, + "grad_norm": 0.036376953125, + "learning_rate": 0.0004233364181227378, + "loss": 0.7978, + "num_input_tokens_seen": 72645800, + "step": 125120 + }, + { + "epoch": 18.636431337503723, + "grad_norm": 0.064453125, + "learning_rate": 0.0004228766200398232, + "loss": 0.7919, + "num_input_tokens_seen": 72648584, + "step": 125125 + }, + { + "epoch": 18.637176050044683, + "grad_norm": 0.04638671875, + "learning_rate": 0.0004224170682225914, + "loss": 0.7979, + "num_input_tokens_seen": 72651496, + "step": 125130 + }, + { + "epoch": 18.63792076258564, + "grad_norm": 0.03955078125, + "learning_rate": 0.0004219577626788079, + "loss": 0.8248, + "num_input_tokens_seen": 72654472, + "step": 125135 + }, + { + "epoch": 18.6386654751266, + "grad_norm": 0.0810546875, + "learning_rate": 0.00042149870341623153, + "loss": 0.7974, + "num_input_tokens_seen": 72657416, + "step": 125140 + }, + { + "epoch": 18.63941018766756, + "grad_norm": 0.0262451171875, + "learning_rate": 0.0004210398904426177, + "loss": 0.8017, + "num_input_tokens_seen": 72660424, + "step": 125145 + }, + { + "epoch": 18.64015490020852, + "grad_norm": 0.039306640625, + "learning_rate": 0.0004205813237657169, + "loss": 0.7956, + "num_input_tokens_seen": 72663080, + "step": 125150 + }, + { + "epoch": 18.64089961274948, + "grad_norm": 0.058837890625, + "learning_rate": 0.0004201230033932779, + "loss": 0.8098, + "num_input_tokens_seen": 72666216, + "step": 125155 + }, + { + "epoch": 18.64164432529044, + "grad_norm": 0.061767578125, + "learning_rate": 0.0004196649293330412, + "loss": 0.8113, + "num_input_tokens_seen": 72669032, + "step": 125160 + }, + { + "epoch": 18.642389037831396, + "grad_norm": 0.049560546875, + "learning_rate": 0.0004192071015927473, + "loss": 0.8126, + "num_input_tokens_seen": 72671592, + "step": 125165 + }, + { + "epoch": 18.643133750372357, + "grad_norm": 0.0830078125, + "learning_rate": 0.00041874952018013166, + "loss": 0.7891, + "num_input_tokens_seen": 72674472, + "step": 125170 + }, + { + "epoch": 18.643878462913314, + "grad_norm": 0.035400390625, + "learning_rate": 0.0004182921851029231, + "loss": 0.8008, + "num_input_tokens_seen": 72677448, + "step": 125175 + }, + { + "epoch": 18.644623175454274, + "grad_norm": 0.05029296875, + "learning_rate": 0.00041783509636884707, + "loss": 0.7826, + "num_input_tokens_seen": 72680136, + "step": 125180 + }, + { + "epoch": 18.645367887995235, + "grad_norm": 0.037353515625, + "learning_rate": 0.0004173782539856241, + "loss": 0.8064, + "num_input_tokens_seen": 72683272, + "step": 125185 + }, + { + "epoch": 18.646112600536192, + "grad_norm": 0.037353515625, + "learning_rate": 0.000416921657960978, + "loss": 0.7937, + "num_input_tokens_seen": 72686440, + "step": 125190 + }, + { + "epoch": 18.646857313077152, + "grad_norm": 0.047119140625, + "learning_rate": 0.00041646530830261594, + "loss": 0.8001, + "num_input_tokens_seen": 72689000, + "step": 125195 + }, + { + "epoch": 18.647602025618113, + "grad_norm": 0.07177734375, + "learning_rate": 0.0004160092050182518, + "loss": 0.8128, + "num_input_tokens_seen": 72691816, + "step": 125200 + }, + { + "epoch": 18.64834673815907, + "grad_norm": 0.076171875, + "learning_rate": 0.00041555334811559097, + "loss": 0.8139, + "num_input_tokens_seen": 72694888, + "step": 125205 + }, + { + "epoch": 18.64909145070003, + "grad_norm": 0.0291748046875, + "learning_rate": 0.0004150977376023307, + "loss": 0.8027, + "num_input_tokens_seen": 72697896, + "step": 125210 + }, + { + "epoch": 18.649836163240987, + "grad_norm": 0.03466796875, + "learning_rate": 0.00041464237348617315, + "loss": 0.8028, + "num_input_tokens_seen": 72700872, + "step": 125215 + }, + { + "epoch": 18.650580875781948, + "grad_norm": 0.052490234375, + "learning_rate": 0.0004141872557748055, + "loss": 0.8069, + "num_input_tokens_seen": 72703912, + "step": 125220 + }, + { + "epoch": 18.65132558832291, + "grad_norm": 0.06884765625, + "learning_rate": 0.0004137323844759233, + "loss": 0.7964, + "num_input_tokens_seen": 72706824, + "step": 125225 + }, + { + "epoch": 18.652070300863866, + "grad_norm": 0.039306640625, + "learning_rate": 0.0004132777595972054, + "loss": 0.8008, + "num_input_tokens_seen": 72709576, + "step": 125230 + }, + { + "epoch": 18.652815013404826, + "grad_norm": 0.03857421875, + "learning_rate": 0.0004128233811463355, + "loss": 0.7997, + "num_input_tokens_seen": 72712456, + "step": 125235 + }, + { + "epoch": 18.653559725945787, + "grad_norm": 0.0390625, + "learning_rate": 0.0004123692491309894, + "loss": 0.7858, + "num_input_tokens_seen": 72715304, + "step": 125240 + }, + { + "epoch": 18.654304438486744, + "grad_norm": 0.042724609375, + "learning_rate": 0.00041191536355883907, + "loss": 0.7924, + "num_input_tokens_seen": 72718120, + "step": 125245 + }, + { + "epoch": 18.655049151027704, + "grad_norm": 0.035400390625, + "learning_rate": 0.0004114617244375501, + "loss": 0.7993, + "num_input_tokens_seen": 72720872, + "step": 125250 + }, + { + "epoch": 18.65579386356866, + "grad_norm": 0.044677734375, + "learning_rate": 0.0004110083317747881, + "loss": 0.7899, + "num_input_tokens_seen": 72723656, + "step": 125255 + }, + { + "epoch": 18.65653857610962, + "grad_norm": 0.041748046875, + "learning_rate": 0.00041055518557821523, + "loss": 0.7892, + "num_input_tokens_seen": 72726504, + "step": 125260 + }, + { + "epoch": 18.657283288650582, + "grad_norm": 0.0546875, + "learning_rate": 0.000410102285855482, + "loss": 0.7889, + "num_input_tokens_seen": 72729832, + "step": 125265 + }, + { + "epoch": 18.65802800119154, + "grad_norm": 0.037841796875, + "learning_rate": 0.00040964963261424234, + "loss": 0.7846, + "num_input_tokens_seen": 72732584, + "step": 125270 + }, + { + "epoch": 18.6587727137325, + "grad_norm": 0.03857421875, + "learning_rate": 0.0004091972258621435, + "loss": 0.8111, + "num_input_tokens_seen": 72735208, + "step": 125275 + }, + { + "epoch": 18.65951742627346, + "grad_norm": 0.03271484375, + "learning_rate": 0.0004087450656068292, + "loss": 0.7929, + "num_input_tokens_seen": 72738152, + "step": 125280 + }, + { + "epoch": 18.660262138814417, + "grad_norm": 0.046142578125, + "learning_rate": 0.00040829315185593516, + "loss": 0.8054, + "num_input_tokens_seen": 72740936, + "step": 125285 + }, + { + "epoch": 18.661006851355378, + "grad_norm": 0.04443359375, + "learning_rate": 0.00040784148461710023, + "loss": 0.7982, + "num_input_tokens_seen": 72743752, + "step": 125290 + }, + { + "epoch": 18.661751563896335, + "grad_norm": 0.025634765625, + "learning_rate": 0.0004073900638979516, + "loss": 0.8152, + "num_input_tokens_seen": 72746472, + "step": 125295 + }, + { + "epoch": 18.662496276437295, + "grad_norm": 0.078125, + "learning_rate": 0.0004069388897061149, + "loss": 0.7971, + "num_input_tokens_seen": 72749320, + "step": 125300 + }, + { + "epoch": 18.663240988978256, + "grad_norm": 0.064453125, + "learning_rate": 0.00040648796204921734, + "loss": 0.8055, + "num_input_tokens_seen": 72752488, + "step": 125305 + }, + { + "epoch": 18.663985701519213, + "grad_norm": 0.056396484375, + "learning_rate": 0.0004060372809348711, + "loss": 0.8142, + "num_input_tokens_seen": 72755240, + "step": 125310 + }, + { + "epoch": 18.664730414060173, + "grad_norm": 0.0517578125, + "learning_rate": 0.0004055868463706935, + "loss": 0.7843, + "num_input_tokens_seen": 72758152, + "step": 125315 + }, + { + "epoch": 18.66547512660113, + "grad_norm": 0.035888671875, + "learning_rate": 0.0004051366583642918, + "loss": 0.8039, + "num_input_tokens_seen": 72761128, + "step": 125320 + }, + { + "epoch": 18.66621983914209, + "grad_norm": 0.0712890625, + "learning_rate": 0.00040468671692327484, + "loss": 0.8076, + "num_input_tokens_seen": 72763816, + "step": 125325 + }, + { + "epoch": 18.66696455168305, + "grad_norm": 0.056884765625, + "learning_rate": 0.0004042370220552399, + "loss": 0.8137, + "num_input_tokens_seen": 72766376, + "step": 125330 + }, + { + "epoch": 18.66770926422401, + "grad_norm": 0.055908203125, + "learning_rate": 0.00040378757376778596, + "loss": 0.8127, + "num_input_tokens_seen": 72769160, + "step": 125335 + }, + { + "epoch": 18.66845397676497, + "grad_norm": 0.03857421875, + "learning_rate": 0.00040333837206850686, + "loss": 0.8063, + "num_input_tokens_seen": 72771944, + "step": 125340 + }, + { + "epoch": 18.66919868930593, + "grad_norm": 0.044921875, + "learning_rate": 0.0004028894169649899, + "loss": 0.8106, + "num_input_tokens_seen": 72774760, + "step": 125345 + }, + { + "epoch": 18.669943401846886, + "grad_norm": 0.0576171875, + "learning_rate": 0.0004024407084648207, + "loss": 0.7911, + "num_input_tokens_seen": 72777480, + "step": 125350 + }, + { + "epoch": 18.670688114387847, + "grad_norm": 0.08984375, + "learning_rate": 0.0004019922465755782, + "loss": 0.7826, + "num_input_tokens_seen": 72780328, + "step": 125355 + }, + { + "epoch": 18.671432826928804, + "grad_norm": 0.0272216796875, + "learning_rate": 0.00040154403130484125, + "loss": 0.7843, + "num_input_tokens_seen": 72783048, + "step": 125360 + }, + { + "epoch": 18.672177539469764, + "grad_norm": 0.06640625, + "learning_rate": 0.00040109606266018226, + "loss": 0.8011, + "num_input_tokens_seen": 72786056, + "step": 125365 + }, + { + "epoch": 18.672922252010725, + "grad_norm": 0.0859375, + "learning_rate": 0.00040064834064916673, + "loss": 0.7967, + "num_input_tokens_seen": 72788744, + "step": 125370 + }, + { + "epoch": 18.673666964551682, + "grad_norm": 0.041748046875, + "learning_rate": 0.00040020086527935703, + "loss": 0.8092, + "num_input_tokens_seen": 72791560, + "step": 125375 + }, + { + "epoch": 18.674411677092642, + "grad_norm": 0.038330078125, + "learning_rate": 0.00039975363655831705, + "loss": 0.7938, + "num_input_tokens_seen": 72794600, + "step": 125380 + }, + { + "epoch": 18.675156389633603, + "grad_norm": 0.09423828125, + "learning_rate": 0.0003993066544936008, + "loss": 0.8152, + "num_input_tokens_seen": 72797416, + "step": 125385 + }, + { + "epoch": 18.67590110217456, + "grad_norm": 0.047119140625, + "learning_rate": 0.0003988599190927572, + "loss": 0.793, + "num_input_tokens_seen": 72800296, + "step": 125390 + }, + { + "epoch": 18.67664581471552, + "grad_norm": 0.123046875, + "learning_rate": 0.00039841343036333684, + "loss": 0.7909, + "num_input_tokens_seen": 72803144, + "step": 125395 + }, + { + "epoch": 18.677390527256478, + "grad_norm": 0.0498046875, + "learning_rate": 0.0003979671883128805, + "loss": 0.8118, + "num_input_tokens_seen": 72806184, + "step": 125400 + }, + { + "epoch": 18.678135239797438, + "grad_norm": 0.059814453125, + "learning_rate": 0.0003975211929489286, + "loss": 0.7884, + "num_input_tokens_seen": 72809032, + "step": 125405 + }, + { + "epoch": 18.6788799523384, + "grad_norm": 0.044189453125, + "learning_rate": 0.00039707544427901363, + "loss": 0.783, + "num_input_tokens_seen": 72811816, + "step": 125410 + }, + { + "epoch": 18.679624664879356, + "grad_norm": 0.052001953125, + "learning_rate": 0.0003966299423106678, + "loss": 0.7922, + "num_input_tokens_seen": 72814568, + "step": 125415 + }, + { + "epoch": 18.680369377420316, + "grad_norm": 0.048828125, + "learning_rate": 0.00039618468705141684, + "loss": 0.7985, + "num_input_tokens_seen": 72817192, + "step": 125420 + }, + { + "epoch": 18.681114089961277, + "grad_norm": 0.0830078125, + "learning_rate": 0.000395739678508783, + "loss": 0.7981, + "num_input_tokens_seen": 72819880, + "step": 125425 + }, + { + "epoch": 18.681858802502234, + "grad_norm": 0.049560546875, + "learning_rate": 0.00039529491669028523, + "loss": 0.8099, + "num_input_tokens_seen": 72822760, + "step": 125430 + }, + { + "epoch": 18.682603515043194, + "grad_norm": 0.0498046875, + "learning_rate": 0.000394850401603436, + "loss": 0.7905, + "num_input_tokens_seen": 72825704, + "step": 125435 + }, + { + "epoch": 18.68334822758415, + "grad_norm": 0.0419921875, + "learning_rate": 0.00039440613325574414, + "loss": 0.7953, + "num_input_tokens_seen": 72828520, + "step": 125440 + }, + { + "epoch": 18.68409294012511, + "grad_norm": 0.07421875, + "learning_rate": 0.0003939621116547154, + "loss": 0.8125, + "num_input_tokens_seen": 72831208, + "step": 125445 + }, + { + "epoch": 18.684837652666072, + "grad_norm": 0.0732421875, + "learning_rate": 0.0003935183368078537, + "loss": 0.8121, + "num_input_tokens_seen": 72834152, + "step": 125450 + }, + { + "epoch": 18.68558236520703, + "grad_norm": 0.0498046875, + "learning_rate": 0.00039307480872265485, + "loss": 0.8111, + "num_input_tokens_seen": 72837096, + "step": 125455 + }, + { + "epoch": 18.68632707774799, + "grad_norm": 0.046630859375, + "learning_rate": 0.00039263152740660944, + "loss": 0.7924, + "num_input_tokens_seen": 72839976, + "step": 125460 + }, + { + "epoch": 18.687071790288947, + "grad_norm": 0.041748046875, + "learning_rate": 0.0003921884928672098, + "loss": 0.8016, + "num_input_tokens_seen": 72842664, + "step": 125465 + }, + { + "epoch": 18.687816502829907, + "grad_norm": 0.028076171875, + "learning_rate": 0.0003917457051119366, + "loss": 0.8061, + "num_input_tokens_seen": 72845832, + "step": 125470 + }, + { + "epoch": 18.688561215370868, + "grad_norm": 0.026611328125, + "learning_rate": 0.00039130316414827393, + "loss": 0.7858, + "num_input_tokens_seen": 72848648, + "step": 125475 + }, + { + "epoch": 18.689305927911825, + "grad_norm": 0.060546875, + "learning_rate": 0.00039086086998369574, + "loss": 0.8059, + "num_input_tokens_seen": 72851624, + "step": 125480 + }, + { + "epoch": 18.690050640452785, + "grad_norm": 0.10888671875, + "learning_rate": 0.00039041882262567603, + "loss": 0.783, + "num_input_tokens_seen": 72854504, + "step": 125485 + }, + { + "epoch": 18.690795352993746, + "grad_norm": 0.06982421875, + "learning_rate": 0.00038997702208168225, + "loss": 0.7949, + "num_input_tokens_seen": 72857544, + "step": 125490 + }, + { + "epoch": 18.691540065534703, + "grad_norm": 0.054443359375, + "learning_rate": 0.00038953546835917674, + "loss": 0.8029, + "num_input_tokens_seen": 72860840, + "step": 125495 + }, + { + "epoch": 18.692284778075663, + "grad_norm": 0.047607421875, + "learning_rate": 0.00038909416146562016, + "loss": 0.7884, + "num_input_tokens_seen": 72863944, + "step": 125500 + }, + { + "epoch": 18.69302949061662, + "grad_norm": 0.08837890625, + "learning_rate": 0.00038865310140846987, + "loss": 0.7909, + "num_input_tokens_seen": 72866600, + "step": 125505 + }, + { + "epoch": 18.69377420315758, + "grad_norm": 0.03515625, + "learning_rate": 0.00038821228819517327, + "loss": 0.8217, + "num_input_tokens_seen": 72869352, + "step": 125510 + }, + { + "epoch": 18.69451891569854, + "grad_norm": 0.03955078125, + "learning_rate": 0.0003877717218331794, + "loss": 0.7902, + "num_input_tokens_seen": 72872168, + "step": 125515 + }, + { + "epoch": 18.6952636282395, + "grad_norm": 0.0595703125, + "learning_rate": 0.0003873314023299307, + "loss": 0.8026, + "num_input_tokens_seen": 72874952, + "step": 125520 + }, + { + "epoch": 18.69600834078046, + "grad_norm": 0.0400390625, + "learning_rate": 0.0003868913296928661, + "loss": 0.8057, + "num_input_tokens_seen": 72878280, + "step": 125525 + }, + { + "epoch": 18.69675305332142, + "grad_norm": 0.0947265625, + "learning_rate": 0.00038645150392942136, + "loss": 0.8001, + "num_input_tokens_seen": 72881480, + "step": 125530 + }, + { + "epoch": 18.697497765862376, + "grad_norm": 0.0703125, + "learning_rate": 0.0003860119250470273, + "loss": 0.7808, + "num_input_tokens_seen": 72884456, + "step": 125535 + }, + { + "epoch": 18.698242478403337, + "grad_norm": 0.06591796875, + "learning_rate": 0.0003855725930531062, + "loss": 0.7831, + "num_input_tokens_seen": 72887560, + "step": 125540 + }, + { + "epoch": 18.698987190944294, + "grad_norm": 0.04931640625, + "learning_rate": 0.0003851335079550838, + "loss": 0.8356, + "num_input_tokens_seen": 72890376, + "step": 125545 + }, + { + "epoch": 18.699731903485254, + "grad_norm": 0.030517578125, + "learning_rate": 0.0003846946697603759, + "loss": 0.7995, + "num_input_tokens_seen": 72893288, + "step": 125550 + }, + { + "epoch": 18.700476616026215, + "grad_norm": 0.053955078125, + "learning_rate": 0.0003842560784763982, + "loss": 0.7828, + "num_input_tokens_seen": 72896168, + "step": 125555 + }, + { + "epoch": 18.701221328567172, + "grad_norm": 0.06591796875, + "learning_rate": 0.00038381773411055804, + "loss": 0.791, + "num_input_tokens_seen": 72899432, + "step": 125560 + }, + { + "epoch": 18.701966041108133, + "grad_norm": 0.07666015625, + "learning_rate": 0.00038337963667026294, + "loss": 0.7918, + "num_input_tokens_seen": 72902344, + "step": 125565 + }, + { + "epoch": 18.702710753649093, + "grad_norm": 0.037353515625, + "learning_rate": 0.0003829417861629153, + "loss": 0.808, + "num_input_tokens_seen": 72905128, + "step": 125570 + }, + { + "epoch": 18.70345546619005, + "grad_norm": 0.0361328125, + "learning_rate": 0.00038250418259590745, + "loss": 0.799, + "num_input_tokens_seen": 72907784, + "step": 125575 + }, + { + "epoch": 18.70420017873101, + "grad_norm": 0.05078125, + "learning_rate": 0.0003820668259766369, + "loss": 0.8196, + "num_input_tokens_seen": 72910536, + "step": 125580 + }, + { + "epoch": 18.704944891271968, + "grad_norm": 0.0390625, + "learning_rate": 0.00038162971631248765, + "loss": 0.8039, + "num_input_tokens_seen": 72913256, + "step": 125585 + }, + { + "epoch": 18.705689603812928, + "grad_norm": 0.06982421875, + "learning_rate": 0.00038119285361084887, + "loss": 0.8045, + "num_input_tokens_seen": 72916328, + "step": 125590 + }, + { + "epoch": 18.70643431635389, + "grad_norm": 0.078125, + "learning_rate": 0.00038075623787909793, + "loss": 0.8043, + "num_input_tokens_seen": 72919432, + "step": 125595 + }, + { + "epoch": 18.707179028894846, + "grad_norm": 0.10546875, + "learning_rate": 0.00038031986912461234, + "loss": 0.7919, + "num_input_tokens_seen": 72922344, + "step": 125600 + }, + { + "epoch": 18.707923741435806, + "grad_norm": 0.029541015625, + "learning_rate": 0.0003798837473547628, + "loss": 0.8119, + "num_input_tokens_seen": 72925320, + "step": 125605 + }, + { + "epoch": 18.708668453976763, + "grad_norm": 0.054443359375, + "learning_rate": 0.0003794478725769201, + "loss": 0.8152, + "num_input_tokens_seen": 72928264, + "step": 125610 + }, + { + "epoch": 18.709413166517724, + "grad_norm": 0.05419921875, + "learning_rate": 0.00037901224479844505, + "loss": 0.8, + "num_input_tokens_seen": 72930728, + "step": 125615 + }, + { + "epoch": 18.710157879058684, + "grad_norm": 0.0693359375, + "learning_rate": 0.00037857686402669675, + "loss": 0.8012, + "num_input_tokens_seen": 72933672, + "step": 125620 + }, + { + "epoch": 18.71090259159964, + "grad_norm": 0.03564453125, + "learning_rate": 0.0003781417302690343, + "loss": 0.7856, + "num_input_tokens_seen": 72936552, + "step": 125625 + }, + { + "epoch": 18.7116473041406, + "grad_norm": 0.0284423828125, + "learning_rate": 0.00037770684353280345, + "loss": 0.7938, + "num_input_tokens_seen": 72939944, + "step": 125630 + }, + { + "epoch": 18.712392016681562, + "grad_norm": 0.08740234375, + "learning_rate": 0.00037727220382535677, + "loss": 0.7841, + "num_input_tokens_seen": 72942632, + "step": 125635 + }, + { + "epoch": 18.71313672922252, + "grad_norm": 0.043701171875, + "learning_rate": 0.00037683781115403325, + "loss": 0.7985, + "num_input_tokens_seen": 72945672, + "step": 125640 + }, + { + "epoch": 18.71388144176348, + "grad_norm": 0.05615234375, + "learning_rate": 0.0003764036655261721, + "loss": 0.7865, + "num_input_tokens_seen": 72948520, + "step": 125645 + }, + { + "epoch": 18.714626154304437, + "grad_norm": 0.043701171875, + "learning_rate": 0.00037596976694911075, + "loss": 0.8211, + "num_input_tokens_seen": 72951400, + "step": 125650 + }, + { + "epoch": 18.715370866845397, + "grad_norm": 0.0537109375, + "learning_rate": 0.0003755361154301734, + "loss": 0.816, + "num_input_tokens_seen": 72954408, + "step": 125655 + }, + { + "epoch": 18.716115579386358, + "grad_norm": 0.04052734375, + "learning_rate": 0.00037510271097669076, + "loss": 0.8048, + "num_input_tokens_seen": 72957608, + "step": 125660 + }, + { + "epoch": 18.716860291927315, + "grad_norm": 0.1181640625, + "learning_rate": 0.0003746695535959837, + "loss": 0.7909, + "num_input_tokens_seen": 72960328, + "step": 125665 + }, + { + "epoch": 18.717605004468275, + "grad_norm": 0.072265625, + "learning_rate": 0.0003742366432953714, + "loss": 0.8073, + "num_input_tokens_seen": 72963304, + "step": 125670 + }, + { + "epoch": 18.718349717009236, + "grad_norm": 0.0390625, + "learning_rate": 0.00037380398008216465, + "loss": 0.7938, + "num_input_tokens_seen": 72966056, + "step": 125675 + }, + { + "epoch": 18.719094429550193, + "grad_norm": 0.05224609375, + "learning_rate": 0.0003733715639636742, + "loss": 0.8041, + "num_input_tokens_seen": 72968744, + "step": 125680 + }, + { + "epoch": 18.719839142091153, + "grad_norm": 0.0400390625, + "learning_rate": 0.00037293939494720594, + "loss": 0.8112, + "num_input_tokens_seen": 72971720, + "step": 125685 + }, + { + "epoch": 18.72058385463211, + "grad_norm": 0.06298828125, + "learning_rate": 0.0003725074730400607, + "loss": 0.7993, + "num_input_tokens_seen": 72974600, + "step": 125690 + }, + { + "epoch": 18.72132856717307, + "grad_norm": 0.05126953125, + "learning_rate": 0.00037207579824953427, + "loss": 0.7957, + "num_input_tokens_seen": 72977320, + "step": 125695 + }, + { + "epoch": 18.72207327971403, + "grad_norm": 0.052734375, + "learning_rate": 0.00037164437058292087, + "loss": 0.7953, + "num_input_tokens_seen": 72980200, + "step": 125700 + }, + { + "epoch": 18.72281799225499, + "grad_norm": 0.06591796875, + "learning_rate": 0.00037121319004750796, + "loss": 0.7831, + "num_input_tokens_seen": 72983208, + "step": 125705 + }, + { + "epoch": 18.72356270479595, + "grad_norm": 0.054931640625, + "learning_rate": 0.0003707822566505797, + "loss": 0.829, + "num_input_tokens_seen": 72986056, + "step": 125710 + }, + { + "epoch": 18.72430741733691, + "grad_norm": 0.06005859375, + "learning_rate": 0.000370351570399417, + "loss": 0.8143, + "num_input_tokens_seen": 72989000, + "step": 125715 + }, + { + "epoch": 18.725052129877866, + "grad_norm": 0.201171875, + "learning_rate": 0.0003699211313012957, + "loss": 0.8189, + "num_input_tokens_seen": 72991976, + "step": 125720 + }, + { + "epoch": 18.725796842418827, + "grad_norm": 0.05224609375, + "learning_rate": 0.0003694909393634882, + "loss": 0.8049, + "num_input_tokens_seen": 72995080, + "step": 125725 + }, + { + "epoch": 18.726541554959784, + "grad_norm": 0.056884765625, + "learning_rate": 0.00036906099459326045, + "loss": 0.8166, + "num_input_tokens_seen": 72998088, + "step": 125730 + }, + { + "epoch": 18.727286267500745, + "grad_norm": 0.03857421875, + "learning_rate": 0.00036863129699787667, + "loss": 0.793, + "num_input_tokens_seen": 73000744, + "step": 125735 + }, + { + "epoch": 18.728030980041705, + "grad_norm": 0.0478515625, + "learning_rate": 0.00036820184658459764, + "loss": 0.8029, + "num_input_tokens_seen": 73003368, + "step": 125740 + }, + { + "epoch": 18.728775692582662, + "grad_norm": 0.0888671875, + "learning_rate": 0.0003677726433606759, + "loss": 0.7941, + "num_input_tokens_seen": 73006248, + "step": 125745 + }, + { + "epoch": 18.729520405123623, + "grad_norm": 0.0439453125, + "learning_rate": 0.00036734368733336406, + "loss": 0.774, + "num_input_tokens_seen": 73009032, + "step": 125750 + }, + { + "epoch": 18.730265117664583, + "grad_norm": 0.0498046875, + "learning_rate": 0.0003669149785099096, + "loss": 0.8093, + "num_input_tokens_seen": 73011624, + "step": 125755 + }, + { + "epoch": 18.73100983020554, + "grad_norm": 0.048583984375, + "learning_rate": 0.00036648651689755337, + "loss": 0.8079, + "num_input_tokens_seen": 73014440, + "step": 125760 + }, + { + "epoch": 18.7317545427465, + "grad_norm": 0.0517578125, + "learning_rate": 0.0003660583025035363, + "loss": 0.786, + "num_input_tokens_seen": 73017704, + "step": 125765 + }, + { + "epoch": 18.732499255287458, + "grad_norm": 0.05126953125, + "learning_rate": 0.00036563033533508927, + "loss": 0.7756, + "num_input_tokens_seen": 73020296, + "step": 125770 + }, + { + "epoch": 18.733243967828418, + "grad_norm": 0.0267333984375, + "learning_rate": 0.00036520261539944475, + "loss": 0.7911, + "num_input_tokens_seen": 73023176, + "step": 125775 + }, + { + "epoch": 18.73398868036938, + "grad_norm": 0.0517578125, + "learning_rate": 0.000364775142703827, + "loss": 0.8041, + "num_input_tokens_seen": 73025992, + "step": 125780 + }, + { + "epoch": 18.734733392910336, + "grad_norm": 0.05419921875, + "learning_rate": 0.00036434791725546033, + "loss": 0.8135, + "num_input_tokens_seen": 73028968, + "step": 125785 + }, + { + "epoch": 18.735478105451296, + "grad_norm": 0.0419921875, + "learning_rate": 0.00036392093906155887, + "loss": 0.7849, + "num_input_tokens_seen": 73031784, + "step": 125790 + }, + { + "epoch": 18.736222817992257, + "grad_norm": 0.05810546875, + "learning_rate": 0.00036349420812933854, + "loss": 0.7934, + "num_input_tokens_seen": 73034728, + "step": 125795 + }, + { + "epoch": 18.736967530533214, + "grad_norm": 0.0400390625, + "learning_rate": 0.0003630677244660069, + "loss": 0.8326, + "num_input_tokens_seen": 73037928, + "step": 125800 + }, + { + "epoch": 18.737712243074174, + "grad_norm": 0.0400390625, + "learning_rate": 0.0003626414880787698, + "loss": 0.7819, + "num_input_tokens_seen": 73040744, + "step": 125805 + }, + { + "epoch": 18.73845695561513, + "grad_norm": 0.0361328125, + "learning_rate": 0.0003622154989748283, + "loss": 0.7915, + "num_input_tokens_seen": 73043848, + "step": 125810 + }, + { + "epoch": 18.739201668156092, + "grad_norm": 0.08837890625, + "learning_rate": 0.0003617897571613798, + "loss": 0.8131, + "num_input_tokens_seen": 73046472, + "step": 125815 + }, + { + "epoch": 18.739946380697052, + "grad_norm": 0.035888671875, + "learning_rate": 0.00036136426264561536, + "loss": 0.7931, + "num_input_tokens_seen": 73049512, + "step": 125820 + }, + { + "epoch": 18.74069109323801, + "grad_norm": 0.037109375, + "learning_rate": 0.00036093901543472084, + "loss": 0.8013, + "num_input_tokens_seen": 73052520, + "step": 125825 + }, + { + "epoch": 18.74143580577897, + "grad_norm": 0.0751953125, + "learning_rate": 0.0003605140155358871, + "loss": 0.7876, + "num_input_tokens_seen": 73055240, + "step": 125830 + }, + { + "epoch": 18.742180518319927, + "grad_norm": 0.05419921875, + "learning_rate": 0.0003600892629562885, + "loss": 0.7843, + "num_input_tokens_seen": 73058152, + "step": 125835 + }, + { + "epoch": 18.742925230860887, + "grad_norm": 0.05078125, + "learning_rate": 0.00035966475770310256, + "loss": 0.788, + "num_input_tokens_seen": 73061096, + "step": 125840 + }, + { + "epoch": 18.743669943401848, + "grad_norm": 0.038818359375, + "learning_rate": 0.00035924049978350027, + "loss": 0.8022, + "num_input_tokens_seen": 73063720, + "step": 125845 + }, + { + "epoch": 18.744414655942805, + "grad_norm": 0.0419921875, + "learning_rate": 0.0003588164892046508, + "loss": 0.7942, + "num_input_tokens_seen": 73066728, + "step": 125850 + }, + { + "epoch": 18.745159368483765, + "grad_norm": 0.08447265625, + "learning_rate": 0.0003583927259737152, + "loss": 0.7956, + "num_input_tokens_seen": 73069512, + "step": 125855 + }, + { + "epoch": 18.745904081024726, + "grad_norm": 0.0830078125, + "learning_rate": 0.00035796921009785266, + "loss": 0.7796, + "num_input_tokens_seen": 73072328, + "step": 125860 + }, + { + "epoch": 18.746648793565683, + "grad_norm": 0.0615234375, + "learning_rate": 0.0003575459415842208, + "loss": 0.8236, + "num_input_tokens_seen": 73075176, + "step": 125865 + }, + { + "epoch": 18.747393506106643, + "grad_norm": 0.05419921875, + "learning_rate": 0.0003571229204399656, + "loss": 0.817, + "num_input_tokens_seen": 73077928, + "step": 125870 + }, + { + "epoch": 18.7481382186476, + "grad_norm": 0.0498046875, + "learning_rate": 0.00035670014667223967, + "loss": 0.7864, + "num_input_tokens_seen": 73081032, + "step": 125875 + }, + { + "epoch": 18.74888293118856, + "grad_norm": 0.06005859375, + "learning_rate": 0.000356277620288179, + "loss": 0.7841, + "num_input_tokens_seen": 73084136, + "step": 125880 + }, + { + "epoch": 18.74962764372952, + "grad_norm": 0.0576171875, + "learning_rate": 0.0003558553412949261, + "loss": 0.8112, + "num_input_tokens_seen": 73087112, + "step": 125885 + }, + { + "epoch": 18.75037235627048, + "grad_norm": 0.0322265625, + "learning_rate": 0.0003554333096996137, + "loss": 0.8065, + "num_input_tokens_seen": 73089992, + "step": 125890 + }, + { + "epoch": 18.75111706881144, + "grad_norm": 0.05322265625, + "learning_rate": 0.00035501152550937106, + "loss": 0.8076, + "num_input_tokens_seen": 73092808, + "step": 125895 + }, + { + "epoch": 18.7518617813524, + "grad_norm": 0.041259765625, + "learning_rate": 0.0003545899887313242, + "loss": 0.7927, + "num_input_tokens_seen": 73095784, + "step": 125900 + }, + { + "epoch": 18.752606493893357, + "grad_norm": 0.060302734375, + "learning_rate": 0.0003541686993725956, + "loss": 0.8334, + "num_input_tokens_seen": 73098824, + "step": 125905 + }, + { + "epoch": 18.753351206434317, + "grad_norm": 0.0791015625, + "learning_rate": 0.00035374765744030143, + "loss": 0.7941, + "num_input_tokens_seen": 73101320, + "step": 125910 + }, + { + "epoch": 18.754095918975274, + "grad_norm": 0.0283203125, + "learning_rate": 0.0003533268629415526, + "loss": 0.8087, + "num_input_tokens_seen": 73104104, + "step": 125915 + }, + { + "epoch": 18.754840631516235, + "grad_norm": 0.06103515625, + "learning_rate": 0.00035290631588346164, + "loss": 0.8104, + "num_input_tokens_seen": 73106984, + "step": 125920 + }, + { + "epoch": 18.755585344057195, + "grad_norm": 0.0546875, + "learning_rate": 0.00035248601627313133, + "loss": 0.7902, + "num_input_tokens_seen": 73110696, + "step": 125925 + }, + { + "epoch": 18.756330056598152, + "grad_norm": 0.12353515625, + "learning_rate": 0.0003520659641176643, + "loss": 0.7961, + "num_input_tokens_seen": 73113704, + "step": 125930 + }, + { + "epoch": 18.757074769139113, + "grad_norm": 0.044189453125, + "learning_rate": 0.0003516461594241532, + "loss": 0.7841, + "num_input_tokens_seen": 73116584, + "step": 125935 + }, + { + "epoch": 18.757819481680073, + "grad_norm": 0.08154296875, + "learning_rate": 0.00035122660219969235, + "loss": 0.7685, + "num_input_tokens_seen": 73119240, + "step": 125940 + }, + { + "epoch": 18.75856419422103, + "grad_norm": 0.37109375, + "learning_rate": 0.0003508072924513711, + "loss": 0.8534, + "num_input_tokens_seen": 73122280, + "step": 125945 + }, + { + "epoch": 18.75930890676199, + "grad_norm": 0.044677734375, + "learning_rate": 0.0003503882301862704, + "loss": 0.8402, + "num_input_tokens_seen": 73125128, + "step": 125950 + }, + { + "epoch": 18.760053619302948, + "grad_norm": 0.099609375, + "learning_rate": 0.00034996941541147296, + "loss": 0.7921, + "num_input_tokens_seen": 73128072, + "step": 125955 + }, + { + "epoch": 18.760798331843908, + "grad_norm": 0.051513671875, + "learning_rate": 0.00034955084813405145, + "loss": 0.7886, + "num_input_tokens_seen": 73131048, + "step": 125960 + }, + { + "epoch": 18.76154304438487, + "grad_norm": 0.060302734375, + "learning_rate": 0.0003491325283610802, + "loss": 0.7904, + "num_input_tokens_seen": 73133768, + "step": 125965 + }, + { + "epoch": 18.762287756925826, + "grad_norm": 0.080078125, + "learning_rate": 0.0003487144560996236, + "loss": 0.7921, + "num_input_tokens_seen": 73136904, + "step": 125970 + }, + { + "epoch": 18.763032469466786, + "grad_norm": 0.055908203125, + "learning_rate": 0.00034829663135674423, + "loss": 0.8096, + "num_input_tokens_seen": 73139944, + "step": 125975 + }, + { + "epoch": 18.763777182007743, + "grad_norm": 0.05224609375, + "learning_rate": 0.00034787905413950326, + "loss": 0.7837, + "num_input_tokens_seen": 73142984, + "step": 125980 + }, + { + "epoch": 18.764521894548704, + "grad_norm": 0.0498046875, + "learning_rate": 0.00034746172445495325, + "loss": 0.8101, + "num_input_tokens_seen": 73145864, + "step": 125985 + }, + { + "epoch": 18.765266607089664, + "grad_norm": 0.052978515625, + "learning_rate": 0.0003470446423101453, + "loss": 0.8167, + "num_input_tokens_seen": 73148584, + "step": 125990 + }, + { + "epoch": 18.76601131963062, + "grad_norm": 0.05322265625, + "learning_rate": 0.00034662780771212376, + "loss": 0.7933, + "num_input_tokens_seen": 73151400, + "step": 125995 + }, + { + "epoch": 18.766756032171582, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00034621122066793463, + "loss": 0.8055, + "num_input_tokens_seen": 73154152, + "step": 126000 + }, + { + "epoch": 18.767500744712542, + "grad_norm": 0.060546875, + "learning_rate": 0.00034579488118461233, + "loss": 0.7922, + "num_input_tokens_seen": 73156808, + "step": 126005 + }, + { + "epoch": 18.7682454572535, + "grad_norm": 0.0654296875, + "learning_rate": 0.00034537878926919283, + "loss": 0.7931, + "num_input_tokens_seen": 73159688, + "step": 126010 + }, + { + "epoch": 18.76899016979446, + "grad_norm": 0.04931640625, + "learning_rate": 0.00034496294492870395, + "loss": 0.8102, + "num_input_tokens_seen": 73162376, + "step": 126015 + }, + { + "epoch": 18.769734882335417, + "grad_norm": 0.035400390625, + "learning_rate": 0.00034454734817016995, + "loss": 0.8009, + "num_input_tokens_seen": 73165032, + "step": 126020 + }, + { + "epoch": 18.770479594876377, + "grad_norm": 0.051025390625, + "learning_rate": 0.0003441319990006153, + "loss": 0.804, + "num_input_tokens_seen": 73167816, + "step": 126025 + }, + { + "epoch": 18.771224307417338, + "grad_norm": 0.060546875, + "learning_rate": 0.00034371689742705437, + "loss": 0.7806, + "num_input_tokens_seen": 73170504, + "step": 126030 + }, + { + "epoch": 18.771969019958295, + "grad_norm": 0.03759765625, + "learning_rate": 0.0003433020434564998, + "loss": 0.8051, + "num_input_tokens_seen": 73173128, + "step": 126035 + }, + { + "epoch": 18.772713732499255, + "grad_norm": 0.052978515625, + "learning_rate": 0.00034288743709596115, + "loss": 0.7964, + "num_input_tokens_seen": 73175880, + "step": 126040 + }, + { + "epoch": 18.773458445040216, + "grad_norm": 0.0390625, + "learning_rate": 0.0003424730783524443, + "loss": 0.8139, + "num_input_tokens_seen": 73178760, + "step": 126045 + }, + { + "epoch": 18.774203157581173, + "grad_norm": 0.045654296875, + "learning_rate": 0.0003420589672329455, + "loss": 0.7863, + "num_input_tokens_seen": 73181512, + "step": 126050 + }, + { + "epoch": 18.774947870122134, + "grad_norm": 0.046630859375, + "learning_rate": 0.0003416451037444623, + "loss": 0.7957, + "num_input_tokens_seen": 73184648, + "step": 126055 + }, + { + "epoch": 18.77569258266309, + "grad_norm": 0.05517578125, + "learning_rate": 0.00034123148789398926, + "loss": 0.8144, + "num_input_tokens_seen": 73187784, + "step": 126060 + }, + { + "epoch": 18.77643729520405, + "grad_norm": 0.0458984375, + "learning_rate": 0.00034081811968850905, + "loss": 0.8113, + "num_input_tokens_seen": 73190760, + "step": 126065 + }, + { + "epoch": 18.77718200774501, + "grad_norm": 0.051513671875, + "learning_rate": 0.0003404049991350094, + "loss": 0.8165, + "num_input_tokens_seen": 73193416, + "step": 126070 + }, + { + "epoch": 18.77792672028597, + "grad_norm": 0.025390625, + "learning_rate": 0.0003399921262404665, + "loss": 0.7906, + "num_input_tokens_seen": 73196424, + "step": 126075 + }, + { + "epoch": 18.77867143282693, + "grad_norm": 0.037109375, + "learning_rate": 0.0003395795010118596, + "loss": 0.7838, + "num_input_tokens_seen": 73199240, + "step": 126080 + }, + { + "epoch": 18.77941614536789, + "grad_norm": 0.05078125, + "learning_rate": 0.00033916712345615326, + "loss": 0.7939, + "num_input_tokens_seen": 73202056, + "step": 126085 + }, + { + "epoch": 18.780160857908847, + "grad_norm": 0.03564453125, + "learning_rate": 0.0003387549935803202, + "loss": 0.8165, + "num_input_tokens_seen": 73205096, + "step": 126090 + }, + { + "epoch": 18.780905570449807, + "grad_norm": 0.05078125, + "learning_rate": 0.0003383431113913215, + "loss": 0.8002, + "num_input_tokens_seen": 73207624, + "step": 126095 + }, + { + "epoch": 18.781650282990764, + "grad_norm": 0.041015625, + "learning_rate": 0.00033793147689611334, + "loss": 0.8041, + "num_input_tokens_seen": 73210504, + "step": 126100 + }, + { + "epoch": 18.782394995531725, + "grad_norm": 0.038330078125, + "learning_rate": 0.00033752009010165005, + "loss": 0.8103, + "num_input_tokens_seen": 73213256, + "step": 126105 + }, + { + "epoch": 18.783139708072685, + "grad_norm": 0.05419921875, + "learning_rate": 0.0003371089510148828, + "loss": 0.8063, + "num_input_tokens_seen": 73216680, + "step": 126110 + }, + { + "epoch": 18.783884420613642, + "grad_norm": 0.0458984375, + "learning_rate": 0.00033669805964275766, + "loss": 0.7993, + "num_input_tokens_seen": 73219848, + "step": 126115 + }, + { + "epoch": 18.784629133154603, + "grad_norm": 0.0673828125, + "learning_rate": 0.00033628741599221577, + "loss": 0.7887, + "num_input_tokens_seen": 73222856, + "step": 126120 + }, + { + "epoch": 18.785373845695563, + "grad_norm": 0.0303955078125, + "learning_rate": 0.0003358770200701949, + "loss": 0.7839, + "num_input_tokens_seen": 73225608, + "step": 126125 + }, + { + "epoch": 18.78611855823652, + "grad_norm": 0.05712890625, + "learning_rate": 0.0003354668718836262, + "loss": 0.7878, + "num_input_tokens_seen": 73228712, + "step": 126130 + }, + { + "epoch": 18.78686327077748, + "grad_norm": 0.0517578125, + "learning_rate": 0.0003350569714394408, + "loss": 0.7999, + "num_input_tokens_seen": 73231560, + "step": 126135 + }, + { + "epoch": 18.787607983318438, + "grad_norm": 0.047119140625, + "learning_rate": 0.00033464731874456474, + "loss": 0.8043, + "num_input_tokens_seen": 73234760, + "step": 126140 + }, + { + "epoch": 18.7883526958594, + "grad_norm": 0.031982421875, + "learning_rate": 0.00033423791380591427, + "loss": 0.7968, + "num_input_tokens_seen": 73237544, + "step": 126145 + }, + { + "epoch": 18.78909740840036, + "grad_norm": 0.050048828125, + "learning_rate": 0.00033382875663041044, + "loss": 0.7882, + "num_input_tokens_seen": 73240520, + "step": 126150 + }, + { + "epoch": 18.789842120941316, + "grad_norm": 0.0400390625, + "learning_rate": 0.00033341984722496277, + "loss": 0.771, + "num_input_tokens_seen": 73243560, + "step": 126155 + }, + { + "epoch": 18.790586833482276, + "grad_norm": 0.038330078125, + "learning_rate": 0.00033301118559648233, + "loss": 0.8025, + "num_input_tokens_seen": 73246536, + "step": 126160 + }, + { + "epoch": 18.791331546023237, + "grad_norm": 0.03955078125, + "learning_rate": 0.00033260277175187035, + "loss": 0.7868, + "num_input_tokens_seen": 73249640, + "step": 126165 + }, + { + "epoch": 18.792076258564194, + "grad_norm": 0.06591796875, + "learning_rate": 0.00033219460569802625, + "loss": 0.7942, + "num_input_tokens_seen": 73252712, + "step": 126170 + }, + { + "epoch": 18.792820971105154, + "grad_norm": 0.04052734375, + "learning_rate": 0.0003317866874418479, + "loss": 0.7921, + "num_input_tokens_seen": 73255656, + "step": 126175 + }, + { + "epoch": 18.79356568364611, + "grad_norm": 0.042236328125, + "learning_rate": 0.00033137901699022474, + "loss": 0.8102, + "num_input_tokens_seen": 73258440, + "step": 126180 + }, + { + "epoch": 18.794310396187072, + "grad_norm": 0.1845703125, + "learning_rate": 0.0003309715943500446, + "loss": 0.8412, + "num_input_tokens_seen": 73261448, + "step": 126185 + }, + { + "epoch": 18.795055108728032, + "grad_norm": 0.045654296875, + "learning_rate": 0.0003305644195281887, + "loss": 0.7964, + "num_input_tokens_seen": 73264232, + "step": 126190 + }, + { + "epoch": 18.79579982126899, + "grad_norm": 0.0439453125, + "learning_rate": 0.0003301574925315398, + "loss": 0.7938, + "num_input_tokens_seen": 73266856, + "step": 126195 + }, + { + "epoch": 18.79654453380995, + "grad_norm": 0.052490234375, + "learning_rate": 0.00032975081336696907, + "loss": 0.8376, + "num_input_tokens_seen": 73269544, + "step": 126200 + }, + { + "epoch": 18.797289246350907, + "grad_norm": 0.03955078125, + "learning_rate": 0.0003293443820413494, + "loss": 0.787, + "num_input_tokens_seen": 73272488, + "step": 126205 + }, + { + "epoch": 18.798033958891867, + "grad_norm": 0.09375, + "learning_rate": 0.0003289381985615436, + "loss": 0.7808, + "num_input_tokens_seen": 73275016, + "step": 126210 + }, + { + "epoch": 18.798778671432828, + "grad_norm": 0.0341796875, + "learning_rate": 0.0003285322629344178, + "loss": 0.8068, + "num_input_tokens_seen": 73278120, + "step": 126215 + }, + { + "epoch": 18.799523383973785, + "grad_norm": 0.041015625, + "learning_rate": 0.0003281265751668283, + "loss": 0.7865, + "num_input_tokens_seen": 73281224, + "step": 126220 + }, + { + "epoch": 18.800268096514746, + "grad_norm": 0.045654296875, + "learning_rate": 0.0003277211352656262, + "loss": 0.7835, + "num_input_tokens_seen": 73284232, + "step": 126225 + }, + { + "epoch": 18.801012809055706, + "grad_norm": 0.049560546875, + "learning_rate": 0.000327315943237666, + "loss": 0.7954, + "num_input_tokens_seen": 73286984, + "step": 126230 + }, + { + "epoch": 18.801757521596663, + "grad_norm": 0.056396484375, + "learning_rate": 0.0003269109990897889, + "loss": 0.7927, + "num_input_tokens_seen": 73289768, + "step": 126235 + }, + { + "epoch": 18.802502234137624, + "grad_norm": 0.046142578125, + "learning_rate": 0.0003265063028288395, + "loss": 0.8101, + "num_input_tokens_seen": 73292712, + "step": 126240 + }, + { + "epoch": 18.80324694667858, + "grad_norm": 0.3359375, + "learning_rate": 0.000326101854461649, + "loss": 0.8098, + "num_input_tokens_seen": 73295720, + "step": 126245 + }, + { + "epoch": 18.80399165921954, + "grad_norm": 0.0693359375, + "learning_rate": 0.0003256976539950568, + "loss": 0.782, + "num_input_tokens_seen": 73298568, + "step": 126250 + }, + { + "epoch": 18.8047363717605, + "grad_norm": 0.0439453125, + "learning_rate": 0.0003252937014358875, + "loss": 0.7952, + "num_input_tokens_seen": 73301320, + "step": 126255 + }, + { + "epoch": 18.80548108430146, + "grad_norm": 0.0869140625, + "learning_rate": 0.0003248899967909657, + "loss": 0.812, + "num_input_tokens_seen": 73304104, + "step": 126260 + }, + { + "epoch": 18.80622579684242, + "grad_norm": 0.03857421875, + "learning_rate": 0.00032448654006711264, + "loss": 0.7811, + "num_input_tokens_seen": 73306952, + "step": 126265 + }, + { + "epoch": 18.80697050938338, + "grad_norm": 0.04248046875, + "learning_rate": 0.00032408333127114275, + "loss": 0.7993, + "num_input_tokens_seen": 73309800, + "step": 126270 + }, + { + "epoch": 18.807715221924337, + "grad_norm": 0.0439453125, + "learning_rate": 0.0003236803704098706, + "loss": 0.7913, + "num_input_tokens_seen": 73312808, + "step": 126275 + }, + { + "epoch": 18.808459934465297, + "grad_norm": 0.036865234375, + "learning_rate": 0.00032327765749010247, + "loss": 0.804, + "num_input_tokens_seen": 73315752, + "step": 126280 + }, + { + "epoch": 18.809204647006254, + "grad_norm": 0.04638671875, + "learning_rate": 0.00032287519251864125, + "loss": 0.8141, + "num_input_tokens_seen": 73318760, + "step": 126285 + }, + { + "epoch": 18.809949359547215, + "grad_norm": 0.0537109375, + "learning_rate": 0.0003224729755022848, + "loss": 0.7938, + "num_input_tokens_seen": 73321576, + "step": 126290 + }, + { + "epoch": 18.810694072088175, + "grad_norm": 0.072265625, + "learning_rate": 0.0003220710064478327, + "loss": 0.7945, + "num_input_tokens_seen": 73324744, + "step": 126295 + }, + { + "epoch": 18.811438784629132, + "grad_norm": 0.06201171875, + "learning_rate": 0.00032166928536207117, + "loss": 0.7864, + "num_input_tokens_seen": 73327560, + "step": 126300 + }, + { + "epoch": 18.812183497170093, + "grad_norm": 0.060546875, + "learning_rate": 0.0003212678122517898, + "loss": 0.8083, + "num_input_tokens_seen": 73330600, + "step": 126305 + }, + { + "epoch": 18.812928209711053, + "grad_norm": 0.05029296875, + "learning_rate": 0.0003208665871237698, + "loss": 0.8085, + "num_input_tokens_seen": 73333320, + "step": 126310 + }, + { + "epoch": 18.81367292225201, + "grad_norm": 0.1025390625, + "learning_rate": 0.0003204656099847891, + "loss": 0.8148, + "num_input_tokens_seen": 73336616, + "step": 126315 + }, + { + "epoch": 18.81441763479297, + "grad_norm": 0.0546875, + "learning_rate": 0.0003200648808416223, + "loss": 0.7998, + "num_input_tokens_seen": 73339496, + "step": 126320 + }, + { + "epoch": 18.815162347333928, + "grad_norm": 0.11572265625, + "learning_rate": 0.00031966439970103897, + "loss": 0.8138, + "num_input_tokens_seen": 73342248, + "step": 126325 + }, + { + "epoch": 18.81590705987489, + "grad_norm": 0.041259765625, + "learning_rate": 0.00031926416656980535, + "loss": 0.8023, + "num_input_tokens_seen": 73344872, + "step": 126330 + }, + { + "epoch": 18.81665177241585, + "grad_norm": 0.07421875, + "learning_rate": 0.0003188641814546844, + "loss": 0.7884, + "num_input_tokens_seen": 73347976, + "step": 126335 + }, + { + "epoch": 18.817396484956806, + "grad_norm": 0.033203125, + "learning_rate": 0.00031846444436242903, + "loss": 0.7876, + "num_input_tokens_seen": 73351016, + "step": 126340 + }, + { + "epoch": 18.818141197497766, + "grad_norm": 0.06982421875, + "learning_rate": 0.0003180649552997972, + "loss": 0.8094, + "num_input_tokens_seen": 73354248, + "step": 126345 + }, + { + "epoch": 18.818885910038723, + "grad_norm": 0.044921875, + "learning_rate": 0.0003176657142735351, + "loss": 0.8164, + "num_input_tokens_seen": 73357224, + "step": 126350 + }, + { + "epoch": 18.819630622579684, + "grad_norm": 0.04248046875, + "learning_rate": 0.000317266721290389, + "loss": 0.8048, + "num_input_tokens_seen": 73360104, + "step": 126355 + }, + { + "epoch": 18.820375335120644, + "grad_norm": 0.037353515625, + "learning_rate": 0.0003168679763570986, + "loss": 0.7833, + "num_input_tokens_seen": 73362984, + "step": 126360 + }, + { + "epoch": 18.8211200476616, + "grad_norm": 0.04248046875, + "learning_rate": 0.00031646947948039847, + "loss": 0.808, + "num_input_tokens_seen": 73365992, + "step": 126365 + }, + { + "epoch": 18.821864760202562, + "grad_norm": 0.033935546875, + "learning_rate": 0.0003160712306670249, + "loss": 0.7965, + "num_input_tokens_seen": 73368744, + "step": 126370 + }, + { + "epoch": 18.822609472743522, + "grad_norm": 0.055908203125, + "learning_rate": 0.00031567322992370414, + "loss": 0.7815, + "num_input_tokens_seen": 73371656, + "step": 126375 + }, + { + "epoch": 18.82335418528448, + "grad_norm": 0.03955078125, + "learning_rate": 0.0003152754772571609, + "loss": 0.7996, + "num_input_tokens_seen": 73374792, + "step": 126380 + }, + { + "epoch": 18.82409889782544, + "grad_norm": 0.0712890625, + "learning_rate": 0.00031487797267410963, + "loss": 0.7973, + "num_input_tokens_seen": 73378024, + "step": 126385 + }, + { + "epoch": 18.824843610366397, + "grad_norm": 0.056640625, + "learning_rate": 0.0003144807161812718, + "loss": 0.8133, + "num_input_tokens_seen": 73381192, + "step": 126390 + }, + { + "epoch": 18.825588322907358, + "grad_norm": 0.0517578125, + "learning_rate": 0.00031408370778535696, + "loss": 0.8094, + "num_input_tokens_seen": 73384104, + "step": 126395 + }, + { + "epoch": 18.826333035448318, + "grad_norm": 0.049560546875, + "learning_rate": 0.00031368694749307145, + "loss": 0.7885, + "num_input_tokens_seen": 73387080, + "step": 126400 + }, + { + "epoch": 18.827077747989275, + "grad_norm": 0.046630859375, + "learning_rate": 0.0003132904353111182, + "loss": 0.8125, + "num_input_tokens_seen": 73389928, + "step": 126405 + }, + { + "epoch": 18.827822460530236, + "grad_norm": 0.03662109375, + "learning_rate": 0.0003128941712461969, + "loss": 0.8127, + "num_input_tokens_seen": 73392488, + "step": 126410 + }, + { + "epoch": 18.828567173071196, + "grad_norm": 0.0712890625, + "learning_rate": 0.0003124981553049988, + "loss": 0.8166, + "num_input_tokens_seen": 73395240, + "step": 126415 + }, + { + "epoch": 18.829311885612153, + "grad_norm": 0.04638671875, + "learning_rate": 0.0003121023874942186, + "loss": 0.8015, + "num_input_tokens_seen": 73398216, + "step": 126420 + }, + { + "epoch": 18.830056598153114, + "grad_norm": 0.0537109375, + "learning_rate": 0.0003117068678205409, + "loss": 0.7886, + "num_input_tokens_seen": 73401128, + "step": 126425 + }, + { + "epoch": 18.83080131069407, + "grad_norm": 0.053955078125, + "learning_rate": 0.00031131159629064706, + "loss": 0.7831, + "num_input_tokens_seen": 73404328, + "step": 126430 + }, + { + "epoch": 18.83154602323503, + "grad_norm": 0.038330078125, + "learning_rate": 0.0003109165729112151, + "loss": 0.7947, + "num_input_tokens_seen": 73406952, + "step": 126435 + }, + { + "epoch": 18.83229073577599, + "grad_norm": 0.03955078125, + "learning_rate": 0.000310521797688918, + "loss": 0.7897, + "num_input_tokens_seen": 73410152, + "step": 126440 + }, + { + "epoch": 18.83303544831695, + "grad_norm": 0.037841796875, + "learning_rate": 0.00031012727063042534, + "loss": 0.7973, + "num_input_tokens_seen": 73412744, + "step": 126445 + }, + { + "epoch": 18.83378016085791, + "grad_norm": 0.049560546875, + "learning_rate": 0.0003097329917424002, + "loss": 0.8055, + "num_input_tokens_seen": 73415688, + "step": 126450 + }, + { + "epoch": 18.83452487339887, + "grad_norm": 0.05126953125, + "learning_rate": 0.0003093389610315089, + "loss": 0.7817, + "num_input_tokens_seen": 73418440, + "step": 126455 + }, + { + "epoch": 18.835269585939827, + "grad_norm": 0.09521484375, + "learning_rate": 0.0003089451785044045, + "loss": 0.8131, + "num_input_tokens_seen": 73421288, + "step": 126460 + }, + { + "epoch": 18.836014298480787, + "grad_norm": 0.03955078125, + "learning_rate": 0.00030855164416773826, + "loss": 0.7651, + "num_input_tokens_seen": 73424264, + "step": 126465 + }, + { + "epoch": 18.836759011021744, + "grad_norm": 0.068359375, + "learning_rate": 0.0003081583580281616, + "loss": 0.7972, + "num_input_tokens_seen": 73427272, + "step": 126470 + }, + { + "epoch": 18.837503723562705, + "grad_norm": 0.044677734375, + "learning_rate": 0.0003077653200923158, + "loss": 0.7985, + "num_input_tokens_seen": 73430472, + "step": 126475 + }, + { + "epoch": 18.838248436103665, + "grad_norm": 0.051025390625, + "learning_rate": 0.00030737253036684396, + "loss": 0.784, + "num_input_tokens_seen": 73433672, + "step": 126480 + }, + { + "epoch": 18.838993148644622, + "grad_norm": 0.041748046875, + "learning_rate": 0.00030697998885837905, + "loss": 0.803, + "num_input_tokens_seen": 73436552, + "step": 126485 + }, + { + "epoch": 18.839737861185583, + "grad_norm": 0.057861328125, + "learning_rate": 0.00030658769557355414, + "loss": 0.8116, + "num_input_tokens_seen": 73439272, + "step": 126490 + }, + { + "epoch": 18.84048257372654, + "grad_norm": 0.07861328125, + "learning_rate": 0.0003061956505189972, + "loss": 0.8034, + "num_input_tokens_seen": 73442216, + "step": 126495 + }, + { + "epoch": 18.8412272862675, + "grad_norm": 0.05078125, + "learning_rate": 0.0003058038537013313, + "loss": 0.799, + "num_input_tokens_seen": 73445160, + "step": 126500 + }, + { + "epoch": 18.84197199880846, + "grad_norm": 0.0390625, + "learning_rate": 0.0003054123051271745, + "loss": 0.7963, + "num_input_tokens_seen": 73448072, + "step": 126505 + }, + { + "epoch": 18.842716711349418, + "grad_norm": 0.058837890625, + "learning_rate": 0.00030502100480313987, + "loss": 0.8006, + "num_input_tokens_seen": 73451144, + "step": 126510 + }, + { + "epoch": 18.84346142389038, + "grad_norm": 0.037109375, + "learning_rate": 0.00030462995273584204, + "loss": 0.8123, + "num_input_tokens_seen": 73454056, + "step": 126515 + }, + { + "epoch": 18.84420613643134, + "grad_norm": 0.046630859375, + "learning_rate": 0.00030423914893188406, + "loss": 0.8018, + "num_input_tokens_seen": 73457064, + "step": 126520 + }, + { + "epoch": 18.844950848972296, + "grad_norm": 0.0294189453125, + "learning_rate": 0.0003038485933978707, + "loss": 0.8012, + "num_input_tokens_seen": 73459816, + "step": 126525 + }, + { + "epoch": 18.845695561513256, + "grad_norm": 0.03369140625, + "learning_rate": 0.0003034582861403967, + "loss": 0.7982, + "num_input_tokens_seen": 73462248, + "step": 126530 + }, + { + "epoch": 18.846440274054213, + "grad_norm": 0.04638671875, + "learning_rate": 0.0003030682271660617, + "loss": 0.797, + "num_input_tokens_seen": 73465160, + "step": 126535 + }, + { + "epoch": 18.847184986595174, + "grad_norm": 0.03759765625, + "learning_rate": 0.0003026784164814489, + "loss": 0.8096, + "num_input_tokens_seen": 73468072, + "step": 126540 + }, + { + "epoch": 18.847929699136134, + "grad_norm": 0.039306640625, + "learning_rate": 0.0003022888540931479, + "loss": 0.8215, + "num_input_tokens_seen": 73471176, + "step": 126545 + }, + { + "epoch": 18.84867441167709, + "grad_norm": 0.03564453125, + "learning_rate": 0.00030189954000773845, + "loss": 0.8109, + "num_input_tokens_seen": 73473896, + "step": 126550 + }, + { + "epoch": 18.849419124218052, + "grad_norm": 0.055419921875, + "learning_rate": 0.000301510474231797, + "loss": 0.7916, + "num_input_tokens_seen": 73476968, + "step": 126555 + }, + { + "epoch": 18.850163836759013, + "grad_norm": 0.053466796875, + "learning_rate": 0.00030112165677189827, + "loss": 0.7956, + "num_input_tokens_seen": 73480168, + "step": 126560 + }, + { + "epoch": 18.85090854929997, + "grad_norm": 0.037353515625, + "learning_rate": 0.0003007330876346087, + "loss": 0.8065, + "num_input_tokens_seen": 73483016, + "step": 126565 + }, + { + "epoch": 18.85165326184093, + "grad_norm": 0.0390625, + "learning_rate": 0.0003003447668264947, + "loss": 0.7876, + "num_input_tokens_seen": 73485800, + "step": 126570 + }, + { + "epoch": 18.852397974381887, + "grad_norm": 0.052734375, + "learning_rate": 0.00029995669435411597, + "loss": 0.7868, + "num_input_tokens_seen": 73488616, + "step": 126575 + }, + { + "epoch": 18.853142686922848, + "grad_norm": 0.037841796875, + "learning_rate": 0.000299568870224029, + "loss": 0.7863, + "num_input_tokens_seen": 73491560, + "step": 126580 + }, + { + "epoch": 18.853887399463808, + "grad_norm": 0.1708984375, + "learning_rate": 0.0002991812944427835, + "loss": 0.8256, + "num_input_tokens_seen": 73494504, + "step": 126585 + }, + { + "epoch": 18.854632112004765, + "grad_norm": 0.0458984375, + "learning_rate": 0.00029879396701692926, + "loss": 0.7956, + "num_input_tokens_seen": 73497672, + "step": 126590 + }, + { + "epoch": 18.855376824545726, + "grad_norm": 0.0308837890625, + "learning_rate": 0.0002984068879530094, + "loss": 0.7871, + "num_input_tokens_seen": 73500552, + "step": 126595 + }, + { + "epoch": 18.856121537086686, + "grad_norm": 0.043212890625, + "learning_rate": 0.0002980200572575636, + "loss": 0.7844, + "num_input_tokens_seen": 73503592, + "step": 126600 + }, + { + "epoch": 18.856866249627643, + "grad_norm": 0.040771484375, + "learning_rate": 0.00029763347493712674, + "loss": 0.7885, + "num_input_tokens_seen": 73506664, + "step": 126605 + }, + { + "epoch": 18.857610962168604, + "grad_norm": 0.08447265625, + "learning_rate": 0.0002972471409982269, + "loss": 0.8023, + "num_input_tokens_seen": 73509480, + "step": 126610 + }, + { + "epoch": 18.85835567470956, + "grad_norm": 0.03662109375, + "learning_rate": 0.00029686105544739715, + "loss": 0.7949, + "num_input_tokens_seen": 73512392, + "step": 126615 + }, + { + "epoch": 18.85910038725052, + "grad_norm": 0.0439453125, + "learning_rate": 0.00029647521829115396, + "loss": 0.8022, + "num_input_tokens_seen": 73515208, + "step": 126620 + }, + { + "epoch": 18.85984509979148, + "grad_norm": 0.03662109375, + "learning_rate": 0.0002960896295360188, + "loss": 0.7982, + "num_input_tokens_seen": 73518376, + "step": 126625 + }, + { + "epoch": 18.86058981233244, + "grad_norm": 0.11572265625, + "learning_rate": 0.0002957042891885048, + "loss": 0.7984, + "num_input_tokens_seen": 73521096, + "step": 126630 + }, + { + "epoch": 18.8613345248734, + "grad_norm": 0.09619140625, + "learning_rate": 0.0002953191972551233, + "loss": 0.8115, + "num_input_tokens_seen": 73524008, + "step": 126635 + }, + { + "epoch": 18.86207923741436, + "grad_norm": 0.041015625, + "learning_rate": 0.0002949343537423776, + "loss": 0.8071, + "num_input_tokens_seen": 73526792, + "step": 126640 + }, + { + "epoch": 18.862823949955317, + "grad_norm": 0.052978515625, + "learning_rate": 0.00029454975865676915, + "loss": 0.8048, + "num_input_tokens_seen": 73529352, + "step": 126645 + }, + { + "epoch": 18.863568662496277, + "grad_norm": 0.042236328125, + "learning_rate": 0.0002941654120047976, + "loss": 0.8164, + "num_input_tokens_seen": 73532392, + "step": 126650 + }, + { + "epoch": 18.864313375037234, + "grad_norm": 0.046630859375, + "learning_rate": 0.0002937813137929546, + "loss": 0.8093, + "num_input_tokens_seen": 73535400, + "step": 126655 + }, + { + "epoch": 18.865058087578195, + "grad_norm": 0.052734375, + "learning_rate": 0.0002933974640277315, + "loss": 0.792, + "num_input_tokens_seen": 73538536, + "step": 126660 + }, + { + "epoch": 18.865802800119155, + "grad_norm": 0.026123046875, + "learning_rate": 0.0002930138627156098, + "loss": 0.7997, + "num_input_tokens_seen": 73541288, + "step": 126665 + }, + { + "epoch": 18.866547512660112, + "grad_norm": 0.056884765625, + "learning_rate": 0.00029263050986307103, + "loss": 0.8018, + "num_input_tokens_seen": 73544232, + "step": 126670 + }, + { + "epoch": 18.867292225201073, + "grad_norm": 0.03759765625, + "learning_rate": 0.0002922474054765933, + "loss": 0.7913, + "num_input_tokens_seen": 73547176, + "step": 126675 + }, + { + "epoch": 18.868036937742033, + "grad_norm": 0.0830078125, + "learning_rate": 0.0002918645495626448, + "loss": 0.802, + "num_input_tokens_seen": 73550056, + "step": 126680 + }, + { + "epoch": 18.86878165028299, + "grad_norm": 0.06396484375, + "learning_rate": 0.00029148194212769873, + "loss": 0.7722, + "num_input_tokens_seen": 73553096, + "step": 126685 + }, + { + "epoch": 18.86952636282395, + "grad_norm": 0.037841796875, + "learning_rate": 0.0002910995831782148, + "loss": 0.8082, + "num_input_tokens_seen": 73556200, + "step": 126690 + }, + { + "epoch": 18.870271075364908, + "grad_norm": 0.044921875, + "learning_rate": 0.00029071747272065295, + "loss": 0.7852, + "num_input_tokens_seen": 73558824, + "step": 126695 + }, + { + "epoch": 18.87101578790587, + "grad_norm": 0.03515625, + "learning_rate": 0.0002903356107614713, + "loss": 0.8081, + "num_input_tokens_seen": 73561512, + "step": 126700 + }, + { + "epoch": 18.87176050044683, + "grad_norm": 0.055419921875, + "learning_rate": 0.00028995399730711967, + "loss": 0.8195, + "num_input_tokens_seen": 73564648, + "step": 126705 + }, + { + "epoch": 18.872505212987786, + "grad_norm": 0.0277099609375, + "learning_rate": 0.00028957263236404463, + "loss": 0.7909, + "num_input_tokens_seen": 73567368, + "step": 126710 + }, + { + "epoch": 18.873249925528746, + "grad_norm": 0.037841796875, + "learning_rate": 0.0002891915159386876, + "loss": 0.7882, + "num_input_tokens_seen": 73570216, + "step": 126715 + }, + { + "epoch": 18.873994638069703, + "grad_norm": 0.0294189453125, + "learning_rate": 0.00028881064803748855, + "loss": 0.8037, + "num_input_tokens_seen": 73572968, + "step": 126720 + }, + { + "epoch": 18.874739350610664, + "grad_norm": 0.039794921875, + "learning_rate": 0.0002884300286668823, + "loss": 0.8001, + "num_input_tokens_seen": 73576008, + "step": 126725 + }, + { + "epoch": 18.875484063151625, + "grad_norm": 0.047607421875, + "learning_rate": 0.0002880496578332986, + "loss": 0.7974, + "num_input_tokens_seen": 73578856, + "step": 126730 + }, + { + "epoch": 18.87622877569258, + "grad_norm": 0.037353515625, + "learning_rate": 0.00028766953554316244, + "loss": 0.7833, + "num_input_tokens_seen": 73581704, + "step": 126735 + }, + { + "epoch": 18.876973488233542, + "grad_norm": 0.041259765625, + "learning_rate": 0.0002872896618028969, + "loss": 0.7935, + "num_input_tokens_seen": 73584776, + "step": 126740 + }, + { + "epoch": 18.877718200774503, + "grad_norm": 0.043701171875, + "learning_rate": 0.00028691003661892033, + "loss": 0.8231, + "num_input_tokens_seen": 73588008, + "step": 126745 + }, + { + "epoch": 18.87846291331546, + "grad_norm": 0.046142578125, + "learning_rate": 0.00028653065999764417, + "loss": 0.8121, + "num_input_tokens_seen": 73591048, + "step": 126750 + }, + { + "epoch": 18.87920762585642, + "grad_norm": 0.06591796875, + "learning_rate": 0.00028615153194547823, + "loss": 0.7924, + "num_input_tokens_seen": 73594248, + "step": 126755 + }, + { + "epoch": 18.879952338397377, + "grad_norm": 0.1044921875, + "learning_rate": 0.0002857726524688292, + "loss": 0.7688, + "num_input_tokens_seen": 73597096, + "step": 126760 + }, + { + "epoch": 18.880697050938338, + "grad_norm": 0.0517578125, + "learning_rate": 0.0002853940215740952, + "loss": 0.7901, + "num_input_tokens_seen": 73600168, + "step": 126765 + }, + { + "epoch": 18.881441763479298, + "grad_norm": 0.0439453125, + "learning_rate": 0.0002850156392676728, + "loss": 0.8059, + "num_input_tokens_seen": 73603144, + "step": 126770 + }, + { + "epoch": 18.882186476020255, + "grad_norm": 0.032958984375, + "learning_rate": 0.00028463750555595687, + "loss": 0.8016, + "num_input_tokens_seen": 73606248, + "step": 126775 + }, + { + "epoch": 18.882931188561216, + "grad_norm": 0.058837890625, + "learning_rate": 0.0002842596204453357, + "loss": 0.7935, + "num_input_tokens_seen": 73609256, + "step": 126780 + }, + { + "epoch": 18.883675901102176, + "grad_norm": 0.04833984375, + "learning_rate": 0.0002838819839421891, + "loss": 0.8033, + "num_input_tokens_seen": 73611816, + "step": 126785 + }, + { + "epoch": 18.884420613643133, + "grad_norm": 0.056884765625, + "learning_rate": 0.00028350459605290036, + "loss": 0.7873, + "num_input_tokens_seen": 73614696, + "step": 126790 + }, + { + "epoch": 18.885165326184094, + "grad_norm": 0.054931640625, + "learning_rate": 0.0002831274567838443, + "loss": 0.8102, + "num_input_tokens_seen": 73617736, + "step": 126795 + }, + { + "epoch": 18.88591003872505, + "grad_norm": 0.051513671875, + "learning_rate": 0.00028275056614139257, + "loss": 0.8079, + "num_input_tokens_seen": 73620616, + "step": 126800 + }, + { + "epoch": 18.88665475126601, + "grad_norm": 0.058837890625, + "learning_rate": 0.00028237392413191177, + "loss": 0.8248, + "num_input_tokens_seen": 73624040, + "step": 126805 + }, + { + "epoch": 18.88739946380697, + "grad_norm": 0.042236328125, + "learning_rate": 0.00028199753076176503, + "loss": 0.783, + "num_input_tokens_seen": 73627176, + "step": 126810 + }, + { + "epoch": 18.88814417634793, + "grad_norm": 0.04345703125, + "learning_rate": 0.0002816213860373107, + "loss": 0.7986, + "num_input_tokens_seen": 73630280, + "step": 126815 + }, + { + "epoch": 18.88888888888889, + "grad_norm": 0.0703125, + "learning_rate": 0.0002812454899649069, + "loss": 0.7985, + "num_input_tokens_seen": 73632968, + "step": 126820 + }, + { + "epoch": 18.88963360142985, + "grad_norm": 0.03857421875, + "learning_rate": 0.00028086984255089873, + "loss": 0.8088, + "num_input_tokens_seen": 73636008, + "step": 126825 + }, + { + "epoch": 18.890378313970807, + "grad_norm": 0.03271484375, + "learning_rate": 0.000280494443801636, + "loss": 0.8031, + "num_input_tokens_seen": 73638664, + "step": 126830 + }, + { + "epoch": 18.891123026511767, + "grad_norm": 0.0308837890625, + "learning_rate": 0.00028011929372345865, + "loss": 0.7865, + "num_input_tokens_seen": 73641608, + "step": 126835 + }, + { + "epoch": 18.891867739052724, + "grad_norm": 0.038818359375, + "learning_rate": 0.000279744392322705, + "loss": 0.8057, + "num_input_tokens_seen": 73644392, + "step": 126840 + }, + { + "epoch": 18.892612451593685, + "grad_norm": 0.06982421875, + "learning_rate": 0.0002793697396057082, + "loss": 0.7952, + "num_input_tokens_seen": 73647400, + "step": 126845 + }, + { + "epoch": 18.893357164134645, + "grad_norm": 0.050537109375, + "learning_rate": 0.0002789953355787983, + "loss": 0.7853, + "num_input_tokens_seen": 73650152, + "step": 126850 + }, + { + "epoch": 18.894101876675602, + "grad_norm": 0.037353515625, + "learning_rate": 0.00027862118024830185, + "loss": 0.7913, + "num_input_tokens_seen": 73653032, + "step": 126855 + }, + { + "epoch": 18.894846589216563, + "grad_norm": 0.059326171875, + "learning_rate": 0.0002782472736205388, + "loss": 0.7906, + "num_input_tokens_seen": 73656264, + "step": 126860 + }, + { + "epoch": 18.89559130175752, + "grad_norm": 0.04248046875, + "learning_rate": 0.0002778736157018241, + "loss": 0.7777, + "num_input_tokens_seen": 73658952, + "step": 126865 + }, + { + "epoch": 18.89633601429848, + "grad_norm": 0.054443359375, + "learning_rate": 0.00027750020649847107, + "loss": 0.7912, + "num_input_tokens_seen": 73661896, + "step": 126870 + }, + { + "epoch": 18.89708072683944, + "grad_norm": 0.027099609375, + "learning_rate": 0.0002771270460167896, + "loss": 0.8087, + "num_input_tokens_seen": 73664648, + "step": 126875 + }, + { + "epoch": 18.897825439380398, + "grad_norm": 0.057373046875, + "learning_rate": 0.0002767541342630847, + "loss": 0.7907, + "num_input_tokens_seen": 73667368, + "step": 126880 + }, + { + "epoch": 18.89857015192136, + "grad_norm": 0.040283203125, + "learning_rate": 0.0002763814712436513, + "loss": 0.8094, + "num_input_tokens_seen": 73670056, + "step": 126885 + }, + { + "epoch": 18.89931486446232, + "grad_norm": 0.05078125, + "learning_rate": 0.00027600905696479103, + "loss": 0.8097, + "num_input_tokens_seen": 73672744, + "step": 126890 + }, + { + "epoch": 18.900059577003276, + "grad_norm": 0.0556640625, + "learning_rate": 0.0002756368914327939, + "loss": 0.8134, + "num_input_tokens_seen": 73675496, + "step": 126895 + }, + { + "epoch": 18.900804289544237, + "grad_norm": 0.1025390625, + "learning_rate": 0.00027526497465394484, + "loss": 0.8052, + "num_input_tokens_seen": 73678600, + "step": 126900 + }, + { + "epoch": 18.901549002085194, + "grad_norm": 0.0693359375, + "learning_rate": 0.0002748933066345288, + "loss": 0.8029, + "num_input_tokens_seen": 73681224, + "step": 126905 + }, + { + "epoch": 18.902293714626154, + "grad_norm": 0.030029296875, + "learning_rate": 0.0002745218873808225, + "loss": 0.8018, + "num_input_tokens_seen": 73684328, + "step": 126910 + }, + { + "epoch": 18.903038427167115, + "grad_norm": 0.04345703125, + "learning_rate": 0.00027415071689910586, + "loss": 0.7701, + "num_input_tokens_seen": 73686920, + "step": 126915 + }, + { + "epoch": 18.90378313970807, + "grad_norm": 0.5, + "learning_rate": 0.0002737797951956422, + "loss": 0.8296, + "num_input_tokens_seen": 73689704, + "step": 126920 + }, + { + "epoch": 18.904527852249032, + "grad_norm": 0.0281982421875, + "learning_rate": 0.0002734091222767049, + "loss": 0.7883, + "num_input_tokens_seen": 73692552, + "step": 126925 + }, + { + "epoch": 18.905272564789993, + "grad_norm": 0.053466796875, + "learning_rate": 0.0002730386981485522, + "loss": 0.7862, + "num_input_tokens_seen": 73695624, + "step": 126930 + }, + { + "epoch": 18.90601727733095, + "grad_norm": 0.04638671875, + "learning_rate": 0.00027266852281744244, + "loss": 0.8164, + "num_input_tokens_seen": 73698280, + "step": 126935 + }, + { + "epoch": 18.90676198987191, + "grad_norm": 0.06494140625, + "learning_rate": 0.0002722985962896307, + "loss": 0.782, + "num_input_tokens_seen": 73701064, + "step": 126940 + }, + { + "epoch": 18.907506702412867, + "grad_norm": 0.037353515625, + "learning_rate": 0.00027192891857136524, + "loss": 0.7874, + "num_input_tokens_seen": 73703880, + "step": 126945 + }, + { + "epoch": 18.908251414953828, + "grad_norm": 0.1201171875, + "learning_rate": 0.0002715594896688911, + "loss": 0.7978, + "num_input_tokens_seen": 73706792, + "step": 126950 + }, + { + "epoch": 18.908996127494788, + "grad_norm": 0.03955078125, + "learning_rate": 0.0002711903095884499, + "loss": 0.8044, + "num_input_tokens_seen": 73709672, + "step": 126955 + }, + { + "epoch": 18.909740840035745, + "grad_norm": 0.07275390625, + "learning_rate": 0.00027082137833628003, + "loss": 0.8057, + "num_input_tokens_seen": 73712616, + "step": 126960 + }, + { + "epoch": 18.910485552576706, + "grad_norm": 0.3515625, + "learning_rate": 0.0002704526959186132, + "loss": 0.8275, + "num_input_tokens_seen": 73715496, + "step": 126965 + }, + { + "epoch": 18.911230265117666, + "grad_norm": 0.051513671875, + "learning_rate": 0.0002700842623416777, + "loss": 0.7972, + "num_input_tokens_seen": 73718248, + "step": 126970 + }, + { + "epoch": 18.911974977658623, + "grad_norm": 0.052978515625, + "learning_rate": 0.00026971607761169526, + "loss": 0.7864, + "num_input_tokens_seen": 73721320, + "step": 126975 + }, + { + "epoch": 18.912719690199584, + "grad_norm": 0.048828125, + "learning_rate": 0.00026934814173489084, + "loss": 0.7931, + "num_input_tokens_seen": 73724136, + "step": 126980 + }, + { + "epoch": 18.91346440274054, + "grad_norm": 0.0517578125, + "learning_rate": 0.00026898045471747785, + "loss": 0.7924, + "num_input_tokens_seen": 73726952, + "step": 126985 + }, + { + "epoch": 18.9142091152815, + "grad_norm": 0.04052734375, + "learning_rate": 0.0002686130165656664, + "loss": 0.8341, + "num_input_tokens_seen": 73730184, + "step": 126990 + }, + { + "epoch": 18.914953827822462, + "grad_norm": 0.0703125, + "learning_rate": 0.00026824582728566805, + "loss": 0.8024, + "num_input_tokens_seen": 73733224, + "step": 126995 + }, + { + "epoch": 18.91569854036342, + "grad_norm": 0.051025390625, + "learning_rate": 0.0002678788868836812, + "loss": 0.8108, + "num_input_tokens_seen": 73736040, + "step": 127000 + }, + { + "epoch": 18.91644325290438, + "grad_norm": 0.039306640625, + "learning_rate": 0.0002675121953659093, + "loss": 0.7929, + "num_input_tokens_seen": 73739016, + "step": 127005 + }, + { + "epoch": 18.917187965445336, + "grad_norm": 0.060302734375, + "learning_rate": 0.000267145752738544, + "loss": 0.8181, + "num_input_tokens_seen": 73741672, + "step": 127010 + }, + { + "epoch": 18.917932677986297, + "grad_norm": 0.055908203125, + "learning_rate": 0.00026677955900778036, + "loss": 0.7925, + "num_input_tokens_seen": 73744904, + "step": 127015 + }, + { + "epoch": 18.918677390527257, + "grad_norm": 0.05322265625, + "learning_rate": 0.00026641361417980013, + "loss": 0.7952, + "num_input_tokens_seen": 73747560, + "step": 127020 + }, + { + "epoch": 18.919422103068214, + "grad_norm": 0.0634765625, + "learning_rate": 0.00026604791826078663, + "loss": 0.7722, + "num_input_tokens_seen": 73750472, + "step": 127025 + }, + { + "epoch": 18.920166815609175, + "grad_norm": 0.03857421875, + "learning_rate": 0.00026568247125691836, + "loss": 0.7557, + "num_input_tokens_seen": 73753128, + "step": 127030 + }, + { + "epoch": 18.920911528150135, + "grad_norm": 0.0546875, + "learning_rate": 0.0002653172731743719, + "loss": 0.7947, + "num_input_tokens_seen": 73756296, + "step": 127035 + }, + { + "epoch": 18.921656240691092, + "grad_norm": 0.126953125, + "learning_rate": 0.00026495232401931246, + "loss": 0.8081, + "num_input_tokens_seen": 73760552, + "step": 127040 + }, + { + "epoch": 18.922400953232053, + "grad_norm": 0.09716796875, + "learning_rate": 0.00026458762379790666, + "loss": 0.7978, + "num_input_tokens_seen": 73763528, + "step": 127045 + }, + { + "epoch": 18.92314566577301, + "grad_norm": 0.10498046875, + "learning_rate": 0.00026422317251631797, + "loss": 0.8036, + "num_input_tokens_seen": 73766600, + "step": 127050 + }, + { + "epoch": 18.92389037831397, + "grad_norm": 0.042724609375, + "learning_rate": 0.00026385897018070146, + "loss": 0.783, + "num_input_tokens_seen": 73769416, + "step": 127055 + }, + { + "epoch": 18.92463509085493, + "grad_norm": 0.037841796875, + "learning_rate": 0.0002634950167972105, + "loss": 0.8043, + "num_input_tokens_seen": 73772232, + "step": 127060 + }, + { + "epoch": 18.925379803395888, + "grad_norm": 0.03857421875, + "learning_rate": 0.00026313131237199524, + "loss": 0.7848, + "num_input_tokens_seen": 73775272, + "step": 127065 + }, + { + "epoch": 18.92612451593685, + "grad_norm": 0.053466796875, + "learning_rate": 0.0002627678569111974, + "loss": 0.7959, + "num_input_tokens_seen": 73778376, + "step": 127070 + }, + { + "epoch": 18.92686922847781, + "grad_norm": 0.05419921875, + "learning_rate": 0.00026240465042095873, + "loss": 0.7917, + "num_input_tokens_seen": 73781320, + "step": 127075 + }, + { + "epoch": 18.927613941018766, + "grad_norm": 0.0771484375, + "learning_rate": 0.0002620416929074126, + "loss": 0.7934, + "num_input_tokens_seen": 73784136, + "step": 127080 + }, + { + "epoch": 18.928358653559727, + "grad_norm": 0.04150390625, + "learning_rate": 0.0002616789843766959, + "loss": 0.8048, + "num_input_tokens_seen": 73787272, + "step": 127085 + }, + { + "epoch": 18.929103366100684, + "grad_norm": 0.06103515625, + "learning_rate": 0.00026131652483493196, + "loss": 0.8005, + "num_input_tokens_seen": 73790088, + "step": 127090 + }, + { + "epoch": 18.929848078641644, + "grad_norm": 0.046142578125, + "learning_rate": 0.00026095431428824754, + "loss": 0.7925, + "num_input_tokens_seen": 73792904, + "step": 127095 + }, + { + "epoch": 18.930592791182605, + "grad_norm": 0.049072265625, + "learning_rate": 0.00026059235274275783, + "loss": 0.7958, + "num_input_tokens_seen": 73795976, + "step": 127100 + }, + { + "epoch": 18.93133750372356, + "grad_norm": 0.08984375, + "learning_rate": 0.00026023064020457953, + "loss": 0.7909, + "num_input_tokens_seen": 73799080, + "step": 127105 + }, + { + "epoch": 18.932082216264522, + "grad_norm": 0.036865234375, + "learning_rate": 0.0002598691766798261, + "loss": 0.806, + "num_input_tokens_seen": 73801992, + "step": 127110 + }, + { + "epoch": 18.932826928805483, + "grad_norm": 0.041748046875, + "learning_rate": 0.00025950796217459935, + "loss": 0.79, + "num_input_tokens_seen": 73804872, + "step": 127115 + }, + { + "epoch": 18.93357164134644, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0002591469966950044, + "loss": 0.7831, + "num_input_tokens_seen": 73807720, + "step": 127120 + }, + { + "epoch": 18.9343163538874, + "grad_norm": 0.0751953125, + "learning_rate": 0.0002587862802471397, + "loss": 0.791, + "num_input_tokens_seen": 73810568, + "step": 127125 + }, + { + "epoch": 18.935061066428357, + "grad_norm": 0.0341796875, + "learning_rate": 0.00025842581283709863, + "loss": 0.8003, + "num_input_tokens_seen": 73813352, + "step": 127130 + }, + { + "epoch": 18.935805778969318, + "grad_norm": 0.059326171875, + "learning_rate": 0.00025806559447096977, + "loss": 0.7739, + "num_input_tokens_seen": 73816456, + "step": 127135 + }, + { + "epoch": 18.93655049151028, + "grad_norm": 0.06787109375, + "learning_rate": 0.00025770562515483984, + "loss": 0.7862, + "num_input_tokens_seen": 73819656, + "step": 127140 + }, + { + "epoch": 18.937295204051235, + "grad_norm": 0.047119140625, + "learning_rate": 0.00025734590489479237, + "loss": 0.7925, + "num_input_tokens_seen": 73822440, + "step": 127145 + }, + { + "epoch": 18.938039916592196, + "grad_norm": 0.05224609375, + "learning_rate": 0.0002569864336968991, + "loss": 0.8108, + "num_input_tokens_seen": 73825256, + "step": 127150 + }, + { + "epoch": 18.938784629133156, + "grad_norm": 0.041748046875, + "learning_rate": 0.00025662721156723687, + "loss": 0.7948, + "num_input_tokens_seen": 73828008, + "step": 127155 + }, + { + "epoch": 18.939529341674113, + "grad_norm": 0.05224609375, + "learning_rate": 0.0002562682385118742, + "loss": 0.7829, + "num_input_tokens_seen": 73831080, + "step": 127160 + }, + { + "epoch": 18.940274054215074, + "grad_norm": 0.04833984375, + "learning_rate": 0.0002559095145368745, + "loss": 0.8349, + "num_input_tokens_seen": 73834344, + "step": 127165 + }, + { + "epoch": 18.94101876675603, + "grad_norm": 0.048828125, + "learning_rate": 0.00025555103964829796, + "loss": 0.781, + "num_input_tokens_seen": 73837352, + "step": 127170 + }, + { + "epoch": 18.94176347929699, + "grad_norm": 0.042236328125, + "learning_rate": 0.00025519281385220135, + "loss": 0.8127, + "num_input_tokens_seen": 73840616, + "step": 127175 + }, + { + "epoch": 18.942508191837952, + "grad_norm": 0.0380859375, + "learning_rate": 0.00025483483715463495, + "loss": 0.8058, + "num_input_tokens_seen": 73843240, + "step": 127180 + }, + { + "epoch": 18.94325290437891, + "grad_norm": 0.056396484375, + "learning_rate": 0.0002544771095616488, + "loss": 0.805, + "num_input_tokens_seen": 73846280, + "step": 127185 + }, + { + "epoch": 18.94399761691987, + "grad_norm": 0.12060546875, + "learning_rate": 0.00025411963107928483, + "loss": 0.7898, + "num_input_tokens_seen": 73849320, + "step": 127190 + }, + { + "epoch": 18.94474232946083, + "grad_norm": 0.037353515625, + "learning_rate": 0.00025376240171358144, + "loss": 0.8005, + "num_input_tokens_seen": 73852200, + "step": 127195 + }, + { + "epoch": 18.945487042001787, + "grad_norm": 0.050048828125, + "learning_rate": 0.0002534054214705772, + "loss": 0.8153, + "num_input_tokens_seen": 73855368, + "step": 127200 + }, + { + "epoch": 18.946231754542747, + "grad_norm": 0.03515625, + "learning_rate": 0.00025304869035629895, + "loss": 0.7757, + "num_input_tokens_seen": 73858184, + "step": 127205 + }, + { + "epoch": 18.946976467083704, + "grad_norm": 0.041015625, + "learning_rate": 0.0002526922083767769, + "loss": 0.7919, + "num_input_tokens_seen": 73861000, + "step": 127210 + }, + { + "epoch": 18.947721179624665, + "grad_norm": 0.04541015625, + "learning_rate": 0.00025233597553802944, + "loss": 0.8072, + "num_input_tokens_seen": 73863848, + "step": 127215 + }, + { + "epoch": 18.948465892165625, + "grad_norm": 0.0498046875, + "learning_rate": 0.0002519799918460769, + "loss": 0.7936, + "num_input_tokens_seen": 73866728, + "step": 127220 + }, + { + "epoch": 18.949210604706582, + "grad_norm": 0.087890625, + "learning_rate": 0.0002516242573069344, + "loss": 0.8137, + "num_input_tokens_seen": 73869384, + "step": 127225 + }, + { + "epoch": 18.949955317247543, + "grad_norm": 0.0849609375, + "learning_rate": 0.00025126877192661044, + "loss": 0.8095, + "num_input_tokens_seen": 73872168, + "step": 127230 + }, + { + "epoch": 18.9507000297885, + "grad_norm": 0.02197265625, + "learning_rate": 0.0002509135357111103, + "loss": 0.8048, + "num_input_tokens_seen": 73874856, + "step": 127235 + }, + { + "epoch": 18.95144474232946, + "grad_norm": 0.03955078125, + "learning_rate": 0.00025055854866643575, + "loss": 0.8058, + "num_input_tokens_seen": 73877960, + "step": 127240 + }, + { + "epoch": 18.95218945487042, + "grad_norm": 0.03515625, + "learning_rate": 0.00025020381079858377, + "loss": 0.8076, + "num_input_tokens_seen": 73880808, + "step": 127245 + }, + { + "epoch": 18.952934167411378, + "grad_norm": 0.056884765625, + "learning_rate": 0.00024984932211354614, + "loss": 0.793, + "num_input_tokens_seen": 73883880, + "step": 127250 + }, + { + "epoch": 18.95367887995234, + "grad_norm": 0.052734375, + "learning_rate": 0.00024949508261731476, + "loss": 0.7896, + "num_input_tokens_seen": 73886792, + "step": 127255 + }, + { + "epoch": 18.9544235924933, + "grad_norm": 0.0791015625, + "learning_rate": 0.0002491410923158699, + "loss": 0.7838, + "num_input_tokens_seen": 73889352, + "step": 127260 + }, + { + "epoch": 18.955168305034256, + "grad_norm": 0.078125, + "learning_rate": 0.00024878735121519667, + "loss": 0.7962, + "num_input_tokens_seen": 73892296, + "step": 127265 + }, + { + "epoch": 18.955913017575217, + "grad_norm": 0.052978515625, + "learning_rate": 0.0002484338593212687, + "loss": 0.785, + "num_input_tokens_seen": 73894888, + "step": 127270 + }, + { + "epoch": 18.956657730116174, + "grad_norm": 0.04638671875, + "learning_rate": 0.0002480806166400545, + "loss": 0.786, + "num_input_tokens_seen": 73897704, + "step": 127275 + }, + { + "epoch": 18.957402442657134, + "grad_norm": 0.046875, + "learning_rate": 0.0002477276231775294, + "loss": 0.7774, + "num_input_tokens_seen": 73900680, + "step": 127280 + }, + { + "epoch": 18.958147155198095, + "grad_norm": 0.048095703125, + "learning_rate": 0.00024737487893964846, + "loss": 0.794, + "num_input_tokens_seen": 73903688, + "step": 127285 + }, + { + "epoch": 18.95889186773905, + "grad_norm": 0.0252685546875, + "learning_rate": 0.00024702238393237874, + "loss": 0.8113, + "num_input_tokens_seen": 73906280, + "step": 127290 + }, + { + "epoch": 18.959636580280012, + "grad_norm": 0.06298828125, + "learning_rate": 0.00024667013816167036, + "loss": 0.7932, + "num_input_tokens_seen": 73909288, + "step": 127295 + }, + { + "epoch": 18.960381292820973, + "grad_norm": 0.055908203125, + "learning_rate": 0.00024631814163347533, + "loss": 0.8101, + "num_input_tokens_seen": 73912200, + "step": 127300 + }, + { + "epoch": 18.96112600536193, + "grad_norm": 0.052734375, + "learning_rate": 0.00024596639435374044, + "loss": 0.8089, + "num_input_tokens_seen": 73914760, + "step": 127305 + }, + { + "epoch": 18.96187071790289, + "grad_norm": 0.037109375, + "learning_rate": 0.00024561489632840604, + "loss": 0.7961, + "num_input_tokens_seen": 73917576, + "step": 127310 + }, + { + "epoch": 18.962615430443847, + "grad_norm": 0.037109375, + "learning_rate": 0.000245263647563414, + "loss": 0.7842, + "num_input_tokens_seen": 73920456, + "step": 127315 + }, + { + "epoch": 18.963360142984808, + "grad_norm": 0.045654296875, + "learning_rate": 0.00024491264806469627, + "loss": 0.8098, + "num_input_tokens_seen": 73923720, + "step": 127320 + }, + { + "epoch": 18.96410485552577, + "grad_norm": 0.040771484375, + "learning_rate": 0.00024456189783818305, + "loss": 0.7808, + "num_input_tokens_seen": 73926504, + "step": 127325 + }, + { + "epoch": 18.964849568066725, + "grad_norm": 0.05517578125, + "learning_rate": 0.00024421139688979965, + "loss": 0.8053, + "num_input_tokens_seen": 73929384, + "step": 127330 + }, + { + "epoch": 18.965594280607686, + "grad_norm": 0.0732421875, + "learning_rate": 0.00024386114522546796, + "loss": 0.7669, + "num_input_tokens_seen": 73932328, + "step": 127335 + }, + { + "epoch": 18.966338993148646, + "grad_norm": 0.050048828125, + "learning_rate": 0.00024351114285110496, + "loss": 0.7928, + "num_input_tokens_seen": 73935176, + "step": 127340 + }, + { + "epoch": 18.967083705689603, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00024316138977262258, + "loss": 0.8033, + "num_input_tokens_seen": 73937896, + "step": 127345 + }, + { + "epoch": 18.967828418230564, + "grad_norm": 0.1708984375, + "learning_rate": 0.00024281188599593106, + "loss": 0.7876, + "num_input_tokens_seen": 73941064, + "step": 127350 + }, + { + "epoch": 18.96857313077152, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00024246263152693403, + "loss": 0.7903, + "num_input_tokens_seen": 73943880, + "step": 127355 + }, + { + "epoch": 18.96931784331248, + "grad_norm": 0.029052734375, + "learning_rate": 0.00024211362637153175, + "loss": 0.8012, + "num_input_tokens_seen": 73946728, + "step": 127360 + }, + { + "epoch": 18.970062555853442, + "grad_norm": 0.04150390625, + "learning_rate": 0.00024176487053562123, + "loss": 0.8059, + "num_input_tokens_seen": 73949672, + "step": 127365 + }, + { + "epoch": 18.9708072683944, + "grad_norm": 0.0478515625, + "learning_rate": 0.0002414163640250927, + "loss": 0.7799, + "num_input_tokens_seen": 73952264, + "step": 127370 + }, + { + "epoch": 18.97155198093536, + "grad_norm": 0.053466796875, + "learning_rate": 0.0002410681068458348, + "loss": 0.8156, + "num_input_tokens_seen": 73955240, + "step": 127375 + }, + { + "epoch": 18.972296693476316, + "grad_norm": 0.0439453125, + "learning_rate": 0.00024072009900373114, + "loss": 0.8159, + "num_input_tokens_seen": 73958120, + "step": 127380 + }, + { + "epoch": 18.973041406017277, + "grad_norm": 0.07568359375, + "learning_rate": 0.00024037234050466037, + "loss": 0.8091, + "num_input_tokens_seen": 73961128, + "step": 127385 + }, + { + "epoch": 18.973786118558237, + "grad_norm": 0.042236328125, + "learning_rate": 0.0002400248313544978, + "loss": 0.7899, + "num_input_tokens_seen": 73964040, + "step": 127390 + }, + { + "epoch": 18.974530831099194, + "grad_norm": 0.03173828125, + "learning_rate": 0.0002396775715591154, + "loss": 0.8069, + "num_input_tokens_seen": 73966888, + "step": 127395 + }, + { + "epoch": 18.975275543640155, + "grad_norm": 0.06396484375, + "learning_rate": 0.00023933056112437844, + "loss": 0.8041, + "num_input_tokens_seen": 73969960, + "step": 127400 + }, + { + "epoch": 18.976020256181116, + "grad_norm": 0.0400390625, + "learning_rate": 0.00023898380005614895, + "loss": 0.784, + "num_input_tokens_seen": 73972648, + "step": 127405 + }, + { + "epoch": 18.976764968722073, + "grad_norm": 0.0634765625, + "learning_rate": 0.0002386372883602855, + "loss": 0.7811, + "num_input_tokens_seen": 73975496, + "step": 127410 + }, + { + "epoch": 18.977509681263033, + "grad_norm": 0.0751953125, + "learning_rate": 0.0002382910260426435, + "loss": 0.8081, + "num_input_tokens_seen": 73978440, + "step": 127415 + }, + { + "epoch": 18.97825439380399, + "grad_norm": 0.046142578125, + "learning_rate": 0.0002379450131090699, + "loss": 0.8153, + "num_input_tokens_seen": 73981224, + "step": 127420 + }, + { + "epoch": 18.97899910634495, + "grad_norm": 0.06689453125, + "learning_rate": 0.00023759924956541332, + "loss": 0.7952, + "num_input_tokens_seen": 73984456, + "step": 127425 + }, + { + "epoch": 18.97974381888591, + "grad_norm": 0.0732421875, + "learning_rate": 0.00023725373541751414, + "loss": 0.7761, + "num_input_tokens_seen": 73987304, + "step": 127430 + }, + { + "epoch": 18.980488531426868, + "grad_norm": 0.044921875, + "learning_rate": 0.000236908470671206, + "loss": 0.7894, + "num_input_tokens_seen": 73990056, + "step": 127435 + }, + { + "epoch": 18.98123324396783, + "grad_norm": 0.047607421875, + "learning_rate": 0.00023656345533232758, + "loss": 0.798, + "num_input_tokens_seen": 73993416, + "step": 127440 + }, + { + "epoch": 18.98197795650879, + "grad_norm": 0.046142578125, + "learning_rate": 0.00023621868940670252, + "loss": 0.8127, + "num_input_tokens_seen": 73996104, + "step": 127445 + }, + { + "epoch": 18.982722669049746, + "grad_norm": 0.05810546875, + "learning_rate": 0.00023587417290015953, + "loss": 0.7754, + "num_input_tokens_seen": 73998600, + "step": 127450 + }, + { + "epoch": 18.983467381590707, + "grad_norm": 0.03759765625, + "learning_rate": 0.00023552990581851563, + "loss": 0.7759, + "num_input_tokens_seen": 74001256, + "step": 127455 + }, + { + "epoch": 18.984212094131664, + "grad_norm": 0.044677734375, + "learning_rate": 0.00023518588816758777, + "loss": 0.7966, + "num_input_tokens_seen": 74004104, + "step": 127460 + }, + { + "epoch": 18.984956806672624, + "grad_norm": 0.0390625, + "learning_rate": 0.00023484211995318971, + "loss": 0.7916, + "num_input_tokens_seen": 74007208, + "step": 127465 + }, + { + "epoch": 18.985701519213585, + "grad_norm": 0.0390625, + "learning_rate": 0.00023449860118112507, + "loss": 0.7937, + "num_input_tokens_seen": 74009864, + "step": 127470 + }, + { + "epoch": 18.98644623175454, + "grad_norm": 0.0322265625, + "learning_rate": 0.00023415533185720092, + "loss": 0.7963, + "num_input_tokens_seen": 74013224, + "step": 127475 + }, + { + "epoch": 18.987190944295502, + "grad_norm": 0.041015625, + "learning_rate": 0.0002338123119872143, + "loss": 0.7916, + "num_input_tokens_seen": 74016264, + "step": 127480 + }, + { + "epoch": 18.987935656836463, + "grad_norm": 0.06982421875, + "learning_rate": 0.0002334695415769622, + "loss": 0.7777, + "num_input_tokens_seen": 74019496, + "step": 127485 + }, + { + "epoch": 18.98868036937742, + "grad_norm": 0.0361328125, + "learning_rate": 0.00023312702063223333, + "loss": 0.8087, + "num_input_tokens_seen": 74022120, + "step": 127490 + }, + { + "epoch": 18.98942508191838, + "grad_norm": 0.040283203125, + "learning_rate": 0.00023278474915881475, + "loss": 0.7816, + "num_input_tokens_seen": 74024936, + "step": 127495 + }, + { + "epoch": 18.990169794459337, + "grad_norm": 0.04345703125, + "learning_rate": 0.00023244272716249013, + "loss": 0.7877, + "num_input_tokens_seen": 74028008, + "step": 127500 + }, + { + "epoch": 18.990914507000298, + "grad_norm": 0.0311279296875, + "learning_rate": 0.00023210095464903656, + "loss": 0.7967, + "num_input_tokens_seen": 74030888, + "step": 127505 + }, + { + "epoch": 18.99165921954126, + "grad_norm": 0.04150390625, + "learning_rate": 0.00023175943162422773, + "loss": 0.8136, + "num_input_tokens_seen": 74033480, + "step": 127510 + }, + { + "epoch": 18.992403932082215, + "grad_norm": 0.06005859375, + "learning_rate": 0.00023141815809383235, + "loss": 0.7946, + "num_input_tokens_seen": 74036424, + "step": 127515 + }, + { + "epoch": 18.993148644623176, + "grad_norm": 0.0859375, + "learning_rate": 0.00023107713406361752, + "loss": 0.8115, + "num_input_tokens_seen": 74039464, + "step": 127520 + }, + { + "epoch": 18.993893357164133, + "grad_norm": 0.03759765625, + "learning_rate": 0.00023073635953934355, + "loss": 0.7958, + "num_input_tokens_seen": 74042056, + "step": 127525 + }, + { + "epoch": 18.994638069705093, + "grad_norm": 0.059326171875, + "learning_rate": 0.00023039583452676924, + "loss": 0.8182, + "num_input_tokens_seen": 74045096, + "step": 127530 + }, + { + "epoch": 18.995382782246054, + "grad_norm": 0.04052734375, + "learning_rate": 0.00023005555903164498, + "loss": 0.8084, + "num_input_tokens_seen": 74048168, + "step": 127535 + }, + { + "epoch": 18.99612749478701, + "grad_norm": 0.056884765625, + "learning_rate": 0.0002297155330597228, + "loss": 0.8185, + "num_input_tokens_seen": 74050984, + "step": 127540 + }, + { + "epoch": 18.99687220732797, + "grad_norm": 0.05859375, + "learning_rate": 0.00022937575661674314, + "loss": 0.7743, + "num_input_tokens_seen": 74054344, + "step": 127545 + }, + { + "epoch": 18.997616919868932, + "grad_norm": 0.0260009765625, + "learning_rate": 0.00022903622970844804, + "loss": 0.797, + "num_input_tokens_seen": 74057288, + "step": 127550 + }, + { + "epoch": 18.99836163240989, + "grad_norm": 0.34375, + "learning_rate": 0.00022869695234057296, + "loss": 0.8024, + "num_input_tokens_seen": 74060264, + "step": 127555 + }, + { + "epoch": 18.99910634495085, + "grad_norm": 0.053466796875, + "learning_rate": 0.0002283579245188516, + "loss": 0.8231, + "num_input_tokens_seen": 74063336, + "step": 127560 + }, + { + "epoch": 18.999851057491806, + "grad_norm": 0.0791015625, + "learning_rate": 0.00022801914624900942, + "loss": 0.8225, + "num_input_tokens_seen": 74065864, + "step": 127565 + }, + { + "epoch": 19.0, + "eval_loss": 0.799495279788971, + "eval_runtime": 70.6051, + "eval_samples_per_second": 42.263, + "eval_steps_per_second": 10.566, + "num_input_tokens_seen": 74065984, + "step": 127566 + }, + { + "epoch": 19.000595770032767, + "grad_norm": 0.061279296875, + "learning_rate": 0.00022768061753676848, + "loss": 0.8254, + "num_input_tokens_seen": 74068512, + "step": 127570 + }, + { + "epoch": 19.001340482573728, + "grad_norm": 0.0771484375, + "learning_rate": 0.0002273423383878509, + "loss": 0.8153, + "num_input_tokens_seen": 74071232, + "step": 127575 + }, + { + "epoch": 19.002085195114685, + "grad_norm": 0.041259765625, + "learning_rate": 0.00022700430880796872, + "loss": 0.808, + "num_input_tokens_seen": 74074112, + "step": 127580 + }, + { + "epoch": 19.002829907655645, + "grad_norm": 0.03857421875, + "learning_rate": 0.00022666652880283576, + "loss": 0.7901, + "num_input_tokens_seen": 74076992, + "step": 127585 + }, + { + "epoch": 19.003574620196606, + "grad_norm": 0.043212890625, + "learning_rate": 0.00022632899837815744, + "loss": 0.8185, + "num_input_tokens_seen": 74080160, + "step": 127590 + }, + { + "epoch": 19.004319332737563, + "grad_norm": 0.0361328125, + "learning_rate": 0.0002259917175396342, + "loss": 0.7981, + "num_input_tokens_seen": 74083264, + "step": 127595 + }, + { + "epoch": 19.005064045278523, + "grad_norm": 0.03759765625, + "learning_rate": 0.0002256546862929648, + "loss": 0.7958, + "num_input_tokens_seen": 74086176, + "step": 127600 + }, + { + "epoch": 19.00580875781948, + "grad_norm": 0.042724609375, + "learning_rate": 0.00022531790464384304, + "loss": 0.7837, + "num_input_tokens_seen": 74089184, + "step": 127605 + }, + { + "epoch": 19.00655347036044, + "grad_norm": 0.1630859375, + "learning_rate": 0.00022498137259796102, + "loss": 0.8297, + "num_input_tokens_seen": 74092096, + "step": 127610 + }, + { + "epoch": 19.0072981829014, + "grad_norm": 0.04833984375, + "learning_rate": 0.00022464509016100086, + "loss": 0.8101, + "num_input_tokens_seen": 74094976, + "step": 127615 + }, + { + "epoch": 19.008042895442358, + "grad_norm": 0.04345703125, + "learning_rate": 0.0002243090573386447, + "loss": 0.7928, + "num_input_tokens_seen": 74097856, + "step": 127620 + }, + { + "epoch": 19.00878760798332, + "grad_norm": 0.04638671875, + "learning_rate": 0.0002239732741365713, + "loss": 0.7969, + "num_input_tokens_seen": 74100768, + "step": 127625 + }, + { + "epoch": 19.00953232052428, + "grad_norm": 0.060791015625, + "learning_rate": 0.00022363774056045115, + "loss": 0.8086, + "num_input_tokens_seen": 74103488, + "step": 127630 + }, + { + "epoch": 19.010277033065236, + "grad_norm": 0.03955078125, + "learning_rate": 0.00022330245661595304, + "loss": 0.8003, + "num_input_tokens_seen": 74106400, + "step": 127635 + }, + { + "epoch": 19.011021745606197, + "grad_norm": 0.056640625, + "learning_rate": 0.00022296742230874076, + "loss": 0.784, + "num_input_tokens_seen": 74109184, + "step": 127640 + }, + { + "epoch": 19.011766458147154, + "grad_norm": 0.11962890625, + "learning_rate": 0.00022263263764447648, + "loss": 0.8033, + "num_input_tokens_seen": 74112096, + "step": 127645 + }, + { + "epoch": 19.012511170688114, + "grad_norm": 0.031494140625, + "learning_rate": 0.00022229810262881233, + "loss": 0.7927, + "num_input_tokens_seen": 74115040, + "step": 127650 + }, + { + "epoch": 19.013255883229075, + "grad_norm": 0.10009765625, + "learning_rate": 0.00022196381726740543, + "loss": 0.7871, + "num_input_tokens_seen": 74118176, + "step": 127655 + }, + { + "epoch": 19.01400059577003, + "grad_norm": 0.051025390625, + "learning_rate": 0.00022162978156589795, + "loss": 0.8108, + "num_input_tokens_seen": 74120800, + "step": 127660 + }, + { + "epoch": 19.014745308310992, + "grad_norm": 0.037109375, + "learning_rate": 0.00022129599552993539, + "loss": 0.7993, + "num_input_tokens_seen": 74123680, + "step": 127665 + }, + { + "epoch": 19.015490020851953, + "grad_norm": 0.08642578125, + "learning_rate": 0.0002209624591651582, + "loss": 0.7897, + "num_input_tokens_seen": 74126528, + "step": 127670 + }, + { + "epoch": 19.01623473339291, + "grad_norm": 0.031982421875, + "learning_rate": 0.00022062917247719693, + "loss": 0.7991, + "num_input_tokens_seen": 74129280, + "step": 127675 + }, + { + "epoch": 19.01697944593387, + "grad_norm": 0.0263671875, + "learning_rate": 0.00022029613547168702, + "loss": 0.8117, + "num_input_tokens_seen": 74132384, + "step": 127680 + }, + { + "epoch": 19.017724158474827, + "grad_norm": 0.032958984375, + "learning_rate": 0.00021996334815425065, + "loss": 0.7837, + "num_input_tokens_seen": 74134976, + "step": 127685 + }, + { + "epoch": 19.018468871015788, + "grad_norm": 0.058349609375, + "learning_rate": 0.00021963081053051335, + "loss": 0.7863, + "num_input_tokens_seen": 74138208, + "step": 127690 + }, + { + "epoch": 19.01921358355675, + "grad_norm": 0.041259765625, + "learning_rate": 0.00021929852260609229, + "loss": 0.7841, + "num_input_tokens_seen": 74141088, + "step": 127695 + }, + { + "epoch": 19.019958296097705, + "grad_norm": 0.04638671875, + "learning_rate": 0.0002189664843865996, + "loss": 0.7948, + "num_input_tokens_seen": 74143936, + "step": 127700 + }, + { + "epoch": 19.020703008638666, + "grad_norm": 0.0419921875, + "learning_rate": 0.00021863469587764416, + "loss": 0.7821, + "num_input_tokens_seen": 74146688, + "step": 127705 + }, + { + "epoch": 19.021447721179623, + "grad_norm": 0.053955078125, + "learning_rate": 0.00021830315708483316, + "loss": 0.7954, + "num_input_tokens_seen": 74149472, + "step": 127710 + }, + { + "epoch": 19.022192433720583, + "grad_norm": 0.037109375, + "learning_rate": 0.0002179718680137671, + "loss": 0.8013, + "num_input_tokens_seen": 74152512, + "step": 127715 + }, + { + "epoch": 19.022937146261544, + "grad_norm": 0.0654296875, + "learning_rate": 0.00021764082867004153, + "loss": 0.8049, + "num_input_tokens_seen": 74155488, + "step": 127720 + }, + { + "epoch": 19.0236818588025, + "grad_norm": 0.058837890625, + "learning_rate": 0.00021731003905925026, + "loss": 0.8229, + "num_input_tokens_seen": 74158240, + "step": 127725 + }, + { + "epoch": 19.02442657134346, + "grad_norm": 0.0361328125, + "learning_rate": 0.00021697949918698055, + "loss": 0.7845, + "num_input_tokens_seen": 74160992, + "step": 127730 + }, + { + "epoch": 19.025171283884422, + "grad_norm": 0.03076171875, + "learning_rate": 0.00021664920905881956, + "loss": 0.8069, + "num_input_tokens_seen": 74164192, + "step": 127735 + }, + { + "epoch": 19.02591599642538, + "grad_norm": 0.061767578125, + "learning_rate": 0.00021631916868034283, + "loss": 0.7855, + "num_input_tokens_seen": 74167104, + "step": 127740 + }, + { + "epoch": 19.02666070896634, + "grad_norm": 0.05810546875, + "learning_rate": 0.0002159893780571309, + "loss": 0.7783, + "num_input_tokens_seen": 74169888, + "step": 127745 + }, + { + "epoch": 19.027405421507297, + "grad_norm": 0.11767578125, + "learning_rate": 0.00021565983719475101, + "loss": 0.7977, + "num_input_tokens_seen": 74172960, + "step": 127750 + }, + { + "epoch": 19.028150134048257, + "grad_norm": 0.057373046875, + "learning_rate": 0.00021533054609877366, + "loss": 0.797, + "num_input_tokens_seen": 74175712, + "step": 127755 + }, + { + "epoch": 19.028894846589218, + "grad_norm": 0.05517578125, + "learning_rate": 0.00021500150477475944, + "loss": 0.7994, + "num_input_tokens_seen": 74178592, + "step": 127760 + }, + { + "epoch": 19.029639559130175, + "grad_norm": 0.048583984375, + "learning_rate": 0.00021467271322826552, + "loss": 0.8268, + "num_input_tokens_seen": 74181888, + "step": 127765 + }, + { + "epoch": 19.030384271671135, + "grad_norm": 0.05908203125, + "learning_rate": 0.0002143441714648525, + "loss": 0.8034, + "num_input_tokens_seen": 74184672, + "step": 127770 + }, + { + "epoch": 19.031128984212096, + "grad_norm": 0.058349609375, + "learning_rate": 0.0002140158794900643, + "loss": 0.7946, + "num_input_tokens_seen": 74187456, + "step": 127775 + }, + { + "epoch": 19.031873696753053, + "grad_norm": 0.056884765625, + "learning_rate": 0.0002136878373094514, + "loss": 0.8432, + "num_input_tokens_seen": 74190496, + "step": 127780 + }, + { + "epoch": 19.032618409294013, + "grad_norm": 0.083984375, + "learning_rate": 0.00021336004492855442, + "loss": 0.8042, + "num_input_tokens_seen": 74193408, + "step": 127785 + }, + { + "epoch": 19.03336312183497, + "grad_norm": 0.0269775390625, + "learning_rate": 0.0002130325023529106, + "loss": 0.8109, + "num_input_tokens_seen": 74196096, + "step": 127790 + }, + { + "epoch": 19.03410783437593, + "grad_norm": 0.05419921875, + "learning_rate": 0.00021270520958805548, + "loss": 0.7996, + "num_input_tokens_seen": 74199104, + "step": 127795 + }, + { + "epoch": 19.03485254691689, + "grad_norm": 0.0517578125, + "learning_rate": 0.00021237816663951468, + "loss": 0.8019, + "num_input_tokens_seen": 74201952, + "step": 127800 + }, + { + "epoch": 19.035597259457848, + "grad_norm": 0.037841796875, + "learning_rate": 0.00021205137351281542, + "loss": 0.7941, + "num_input_tokens_seen": 74204640, + "step": 127805 + }, + { + "epoch": 19.03634197199881, + "grad_norm": 0.028076171875, + "learning_rate": 0.0002117248302134783, + "loss": 0.806, + "num_input_tokens_seen": 74207584, + "step": 127810 + }, + { + "epoch": 19.03708668453977, + "grad_norm": 0.0517578125, + "learning_rate": 0.00021139853674702225, + "loss": 0.7772, + "num_input_tokens_seen": 74210304, + "step": 127815 + }, + { + "epoch": 19.037831397080726, + "grad_norm": 0.058837890625, + "learning_rate": 0.00021107249311895615, + "loss": 0.7958, + "num_input_tokens_seen": 74213184, + "step": 127820 + }, + { + "epoch": 19.038576109621687, + "grad_norm": 0.041259765625, + "learning_rate": 0.00021074669933478894, + "loss": 0.8072, + "num_input_tokens_seen": 74216512, + "step": 127825 + }, + { + "epoch": 19.039320822162644, + "grad_norm": 0.03173828125, + "learning_rate": 0.00021042115540002626, + "loss": 0.8072, + "num_input_tokens_seen": 74219328, + "step": 127830 + }, + { + "epoch": 19.040065534703604, + "grad_norm": 0.07177734375, + "learning_rate": 0.00021009586132016532, + "loss": 0.7993, + "num_input_tokens_seen": 74222432, + "step": 127835 + }, + { + "epoch": 19.040810247244565, + "grad_norm": 0.03271484375, + "learning_rate": 0.00020977081710070344, + "loss": 0.8277, + "num_input_tokens_seen": 74225376, + "step": 127840 + }, + { + "epoch": 19.041554959785522, + "grad_norm": 0.048095703125, + "learning_rate": 0.0002094460227471312, + "loss": 0.7847, + "num_input_tokens_seen": 74228384, + "step": 127845 + }, + { + "epoch": 19.042299672326482, + "grad_norm": 0.041748046875, + "learning_rate": 0.00020912147826493755, + "loss": 0.8017, + "num_input_tokens_seen": 74231424, + "step": 127850 + }, + { + "epoch": 19.043044384867443, + "grad_norm": 0.056884765625, + "learning_rate": 0.00020879718365960143, + "loss": 0.8033, + "num_input_tokens_seen": 74234720, + "step": 127855 + }, + { + "epoch": 19.0437890974084, + "grad_norm": 0.034912109375, + "learning_rate": 0.00020847313893660512, + "loss": 0.8078, + "num_input_tokens_seen": 74237472, + "step": 127860 + }, + { + "epoch": 19.04453380994936, + "grad_norm": 0.051513671875, + "learning_rate": 0.00020814934410142094, + "loss": 0.7968, + "num_input_tokens_seen": 74240288, + "step": 127865 + }, + { + "epoch": 19.045278522490317, + "grad_norm": 0.052001953125, + "learning_rate": 0.00020782579915952113, + "loss": 0.7992, + "num_input_tokens_seen": 74242976, + "step": 127870 + }, + { + "epoch": 19.046023235031278, + "grad_norm": 0.040771484375, + "learning_rate": 0.00020750250411636972, + "loss": 0.7901, + "num_input_tokens_seen": 74245664, + "step": 127875 + }, + { + "epoch": 19.04676794757224, + "grad_norm": 0.04931640625, + "learning_rate": 0.00020717945897742894, + "loss": 0.8017, + "num_input_tokens_seen": 74248576, + "step": 127880 + }, + { + "epoch": 19.047512660113195, + "grad_norm": 0.0291748046875, + "learning_rate": 0.00020685666374815613, + "loss": 0.8025, + "num_input_tokens_seen": 74251392, + "step": 127885 + }, + { + "epoch": 19.048257372654156, + "grad_norm": 0.0732421875, + "learning_rate": 0.00020653411843400693, + "loss": 0.7863, + "num_input_tokens_seen": 74254432, + "step": 127890 + }, + { + "epoch": 19.049002085195113, + "grad_norm": 0.0299072265625, + "learning_rate": 0.00020621182304042694, + "loss": 0.7938, + "num_input_tokens_seen": 74257248, + "step": 127895 + }, + { + "epoch": 19.049746797736073, + "grad_norm": 0.037109375, + "learning_rate": 0.00020588977757286185, + "loss": 0.7958, + "num_input_tokens_seen": 74260032, + "step": 127900 + }, + { + "epoch": 19.050491510277034, + "grad_norm": 0.043212890625, + "learning_rate": 0.0002055679820367523, + "loss": 0.7928, + "num_input_tokens_seen": 74263040, + "step": 127905 + }, + { + "epoch": 19.05123622281799, + "grad_norm": 0.0771484375, + "learning_rate": 0.00020524643643753725, + "loss": 0.8015, + "num_input_tokens_seen": 74266176, + "step": 127910 + }, + { + "epoch": 19.05198093535895, + "grad_norm": 0.0625, + "learning_rate": 0.00020492514078064572, + "loss": 0.7951, + "num_input_tokens_seen": 74268992, + "step": 127915 + }, + { + "epoch": 19.052725647899912, + "grad_norm": 0.134765625, + "learning_rate": 0.00020460409507150833, + "loss": 0.816, + "num_input_tokens_seen": 74271872, + "step": 127920 + }, + { + "epoch": 19.05347036044087, + "grad_norm": 0.046875, + "learning_rate": 0.00020428329931554412, + "loss": 0.8078, + "num_input_tokens_seen": 74274688, + "step": 127925 + }, + { + "epoch": 19.05421507298183, + "grad_norm": 0.041259765625, + "learning_rate": 0.0002039627535181787, + "loss": 0.7916, + "num_input_tokens_seen": 74277824, + "step": 127930 + }, + { + "epoch": 19.054959785522787, + "grad_norm": 0.09912109375, + "learning_rate": 0.0002036424576848228, + "loss": 0.7968, + "num_input_tokens_seen": 74280768, + "step": 127935 + }, + { + "epoch": 19.055704498063747, + "grad_norm": 0.0257568359375, + "learning_rate": 0.00020332241182089206, + "loss": 0.7985, + "num_input_tokens_seen": 74283936, + "step": 127940 + }, + { + "epoch": 19.056449210604708, + "grad_norm": 0.056640625, + "learning_rate": 0.00020300261593178713, + "loss": 0.8019, + "num_input_tokens_seen": 74286976, + "step": 127945 + }, + { + "epoch": 19.057193923145665, + "grad_norm": 0.020751953125, + "learning_rate": 0.00020268307002291707, + "loss": 0.7867, + "num_input_tokens_seen": 74289888, + "step": 127950 + }, + { + "epoch": 19.057938635686625, + "grad_norm": 0.062255859375, + "learning_rate": 0.00020236377409967588, + "loss": 0.8003, + "num_input_tokens_seen": 74292704, + "step": 127955 + }, + { + "epoch": 19.058683348227586, + "grad_norm": 0.04931640625, + "learning_rate": 0.0002020447281674592, + "loss": 0.795, + "num_input_tokens_seen": 74295968, + "step": 127960 + }, + { + "epoch": 19.059428060768543, + "grad_norm": 0.06640625, + "learning_rate": 0.00020172593223165614, + "loss": 0.8012, + "num_input_tokens_seen": 74299168, + "step": 127965 + }, + { + "epoch": 19.060172773309503, + "grad_norm": 0.07421875, + "learning_rate": 0.00020140738629765396, + "loss": 0.7869, + "num_input_tokens_seen": 74302112, + "step": 127970 + }, + { + "epoch": 19.06091748585046, + "grad_norm": 0.0537109375, + "learning_rate": 0.00020108909037083343, + "loss": 0.8064, + "num_input_tokens_seen": 74304800, + "step": 127975 + }, + { + "epoch": 19.06166219839142, + "grad_norm": 0.06591796875, + "learning_rate": 0.00020077104445657024, + "loss": 0.7967, + "num_input_tokens_seen": 74307648, + "step": 127980 + }, + { + "epoch": 19.06240691093238, + "grad_norm": 0.055908203125, + "learning_rate": 0.00020045324856024003, + "loss": 0.7803, + "num_input_tokens_seen": 74310496, + "step": 127985 + }, + { + "epoch": 19.06315162347334, + "grad_norm": 0.053466796875, + "learning_rate": 0.0002001357026872119, + "loss": 0.7796, + "num_input_tokens_seen": 74313600, + "step": 127990 + }, + { + "epoch": 19.0638963360143, + "grad_norm": 0.03955078125, + "learning_rate": 0.00019981840684284657, + "loss": 0.8018, + "num_input_tokens_seen": 74316544, + "step": 127995 + }, + { + "epoch": 19.06464104855526, + "grad_norm": 0.042724609375, + "learning_rate": 0.0001995013610325097, + "loss": 0.8006, + "num_input_tokens_seen": 74319392, + "step": 128000 + }, + { + "epoch": 19.065385761096216, + "grad_norm": 0.043701171875, + "learning_rate": 0.00019918456526155204, + "loss": 0.7995, + "num_input_tokens_seen": 74322464, + "step": 128005 + }, + { + "epoch": 19.066130473637177, + "grad_norm": 0.05517578125, + "learning_rate": 0.00019886801953533095, + "loss": 0.7921, + "num_input_tokens_seen": 74325376, + "step": 128010 + }, + { + "epoch": 19.066875186178134, + "grad_norm": 0.048828125, + "learning_rate": 0.00019855172385918883, + "loss": 0.7778, + "num_input_tokens_seen": 74328224, + "step": 128015 + }, + { + "epoch": 19.067619898719094, + "grad_norm": 0.08203125, + "learning_rate": 0.00019823567823847477, + "loss": 0.7902, + "num_input_tokens_seen": 74331424, + "step": 128020 + }, + { + "epoch": 19.068364611260055, + "grad_norm": 0.059814453125, + "learning_rate": 0.0001979198826785228, + "loss": 0.7865, + "num_input_tokens_seen": 74334464, + "step": 128025 + }, + { + "epoch": 19.069109323801012, + "grad_norm": 0.029541015625, + "learning_rate": 0.00019760433718467196, + "loss": 0.8002, + "num_input_tokens_seen": 74337504, + "step": 128030 + }, + { + "epoch": 19.069854036341972, + "grad_norm": 0.050537109375, + "learning_rate": 0.00019728904176225136, + "loss": 0.7925, + "num_input_tokens_seen": 74340352, + "step": 128035 + }, + { + "epoch": 19.070598748882933, + "grad_norm": 0.042724609375, + "learning_rate": 0.00019697399641658506, + "loss": 0.8008, + "num_input_tokens_seen": 74343264, + "step": 128040 + }, + { + "epoch": 19.07134346142389, + "grad_norm": 0.0400390625, + "learning_rate": 0.0001966592011530005, + "loss": 0.81, + "num_input_tokens_seen": 74345952, + "step": 128045 + }, + { + "epoch": 19.07208817396485, + "grad_norm": 0.0888671875, + "learning_rate": 0.0001963446559768117, + "loss": 0.7987, + "num_input_tokens_seen": 74348768, + "step": 128050 + }, + { + "epoch": 19.072832886505807, + "grad_norm": 0.06884765625, + "learning_rate": 0.00019603036089333448, + "loss": 0.8006, + "num_input_tokens_seen": 74351776, + "step": 128055 + }, + { + "epoch": 19.073577599046768, + "grad_norm": 0.07373046875, + "learning_rate": 0.00019571631590787784, + "loss": 0.7961, + "num_input_tokens_seen": 74354592, + "step": 128060 + }, + { + "epoch": 19.07432231158773, + "grad_norm": 0.05322265625, + "learning_rate": 0.00019540252102574928, + "loss": 0.796, + "num_input_tokens_seen": 74357664, + "step": 128065 + }, + { + "epoch": 19.075067024128685, + "grad_norm": 0.034912109375, + "learning_rate": 0.00019508897625224785, + "loss": 0.7843, + "num_input_tokens_seen": 74360480, + "step": 128070 + }, + { + "epoch": 19.075811736669646, + "grad_norm": 0.07763671875, + "learning_rate": 0.000194775681592671, + "loss": 0.8079, + "num_input_tokens_seen": 74363584, + "step": 128075 + }, + { + "epoch": 19.076556449210603, + "grad_norm": 0.038330078125, + "learning_rate": 0.00019446263705231114, + "loss": 0.7991, + "num_input_tokens_seen": 74366432, + "step": 128080 + }, + { + "epoch": 19.077301161751564, + "grad_norm": 0.056396484375, + "learning_rate": 0.0001941498426364574, + "loss": 0.7859, + "num_input_tokens_seen": 74369472, + "step": 128085 + }, + { + "epoch": 19.078045874292524, + "grad_norm": 0.03173828125, + "learning_rate": 0.00019383729835039553, + "loss": 0.807, + "num_input_tokens_seen": 74372288, + "step": 128090 + }, + { + "epoch": 19.07879058683348, + "grad_norm": 0.041259765625, + "learning_rate": 0.00019352500419940133, + "loss": 0.7953, + "num_input_tokens_seen": 74375040, + "step": 128095 + }, + { + "epoch": 19.07953529937444, + "grad_norm": 0.04833984375, + "learning_rate": 0.00019321296018875555, + "loss": 0.819, + "num_input_tokens_seen": 74377792, + "step": 128100 + }, + { + "epoch": 19.080280011915402, + "grad_norm": 0.036376953125, + "learning_rate": 0.00019290116632372732, + "loss": 0.8107, + "num_input_tokens_seen": 74380864, + "step": 128105 + }, + { + "epoch": 19.08102472445636, + "grad_norm": 0.0380859375, + "learning_rate": 0.00019258962260958578, + "loss": 0.8398, + "num_input_tokens_seen": 74384160, + "step": 128110 + }, + { + "epoch": 19.08176943699732, + "grad_norm": 0.043212890625, + "learning_rate": 0.00019227832905159168, + "loss": 0.7876, + "num_input_tokens_seen": 74387072, + "step": 128115 + }, + { + "epoch": 19.082514149538277, + "grad_norm": 0.06396484375, + "learning_rate": 0.00019196728565500586, + "loss": 0.7899, + "num_input_tokens_seen": 74389920, + "step": 128120 + }, + { + "epoch": 19.083258862079237, + "grad_norm": 0.212890625, + "learning_rate": 0.0001916564924250824, + "loss": 0.8274, + "num_input_tokens_seen": 74392896, + "step": 128125 + }, + { + "epoch": 19.084003574620198, + "grad_norm": 0.062255859375, + "learning_rate": 0.0001913459493670705, + "loss": 0.8116, + "num_input_tokens_seen": 74395776, + "step": 128130 + }, + { + "epoch": 19.084748287161155, + "grad_norm": 0.035888671875, + "learning_rate": 0.0001910356564862209, + "loss": 0.795, + "num_input_tokens_seen": 74398624, + "step": 128135 + }, + { + "epoch": 19.085492999702115, + "grad_norm": 0.03955078125, + "learning_rate": 0.0001907256137877694, + "loss": 0.801, + "num_input_tokens_seen": 74401568, + "step": 128140 + }, + { + "epoch": 19.086237712243076, + "grad_norm": 0.0390625, + "learning_rate": 0.00019041582127696022, + "loss": 0.8215, + "num_input_tokens_seen": 74404544, + "step": 128145 + }, + { + "epoch": 19.086982424784033, + "grad_norm": 0.0283203125, + "learning_rate": 0.00019010627895902244, + "loss": 0.7945, + "num_input_tokens_seen": 74407392, + "step": 128150 + }, + { + "epoch": 19.087727137324993, + "grad_norm": 0.041259765625, + "learning_rate": 0.00018979698683918854, + "loss": 0.8114, + "num_input_tokens_seen": 74410112, + "step": 128155 + }, + { + "epoch": 19.08847184986595, + "grad_norm": 0.04345703125, + "learning_rate": 0.00018948794492268105, + "loss": 0.7794, + "num_input_tokens_seen": 74412832, + "step": 128160 + }, + { + "epoch": 19.08921656240691, + "grad_norm": 0.0625, + "learning_rate": 0.0001891791532147208, + "loss": 0.8009, + "num_input_tokens_seen": 74415648, + "step": 128165 + }, + { + "epoch": 19.08996127494787, + "grad_norm": 0.044677734375, + "learning_rate": 0.00018887061172052688, + "loss": 0.8126, + "num_input_tokens_seen": 74418592, + "step": 128170 + }, + { + "epoch": 19.09070598748883, + "grad_norm": 0.03369140625, + "learning_rate": 0.00018856232044531018, + "loss": 0.8162, + "num_input_tokens_seen": 74421632, + "step": 128175 + }, + { + "epoch": 19.09145070002979, + "grad_norm": 0.0439453125, + "learning_rate": 0.00018825427939427985, + "loss": 0.7847, + "num_input_tokens_seen": 74424576, + "step": 128180 + }, + { + "epoch": 19.09219541257075, + "grad_norm": 0.055419921875, + "learning_rate": 0.0001879464885726384, + "loss": 0.7852, + "num_input_tokens_seen": 74427296, + "step": 128185 + }, + { + "epoch": 19.092940125111706, + "grad_norm": 0.036865234375, + "learning_rate": 0.00018763894798558834, + "loss": 0.8124, + "num_input_tokens_seen": 74430048, + "step": 128190 + }, + { + "epoch": 19.093684837652667, + "grad_norm": 0.09375, + "learning_rate": 0.00018733165763832382, + "loss": 0.8524, + "num_input_tokens_seen": 74432992, + "step": 128195 + }, + { + "epoch": 19.094429550193624, + "grad_norm": 0.05859375, + "learning_rate": 0.00018702461753603404, + "loss": 0.8098, + "num_input_tokens_seen": 74436032, + "step": 128200 + }, + { + "epoch": 19.095174262734584, + "grad_norm": 0.038330078125, + "learning_rate": 0.00018671782768390986, + "loss": 0.7878, + "num_input_tokens_seen": 74439008, + "step": 128205 + }, + { + "epoch": 19.095918975275545, + "grad_norm": 0.025634765625, + "learning_rate": 0.00018641128808713047, + "loss": 0.7952, + "num_input_tokens_seen": 74441760, + "step": 128210 + }, + { + "epoch": 19.096663687816502, + "grad_norm": 0.0546875, + "learning_rate": 0.00018610499875087837, + "loss": 0.827, + "num_input_tokens_seen": 74444544, + "step": 128215 + }, + { + "epoch": 19.097408400357462, + "grad_norm": 0.06884765625, + "learning_rate": 0.0001857989596803261, + "loss": 0.7851, + "num_input_tokens_seen": 74447456, + "step": 128220 + }, + { + "epoch": 19.098153112898423, + "grad_norm": 0.03955078125, + "learning_rate": 0.00018549317088064287, + "loss": 0.7959, + "num_input_tokens_seen": 74450304, + "step": 128225 + }, + { + "epoch": 19.09889782543938, + "grad_norm": 0.09716796875, + "learning_rate": 0.0001851876323569962, + "loss": 0.7768, + "num_input_tokens_seen": 74453184, + "step": 128230 + }, + { + "epoch": 19.09964253798034, + "grad_norm": 0.04541015625, + "learning_rate": 0.00018488234411454695, + "loss": 0.7997, + "num_input_tokens_seen": 74456096, + "step": 128235 + }, + { + "epoch": 19.100387250521297, + "grad_norm": 0.042724609375, + "learning_rate": 0.00018457730615845436, + "loss": 0.7843, + "num_input_tokens_seen": 74458816, + "step": 128240 + }, + { + "epoch": 19.101131963062258, + "grad_norm": 0.0693359375, + "learning_rate": 0.00018427251849386926, + "loss": 0.7902, + "num_input_tokens_seen": 74461888, + "step": 128245 + }, + { + "epoch": 19.10187667560322, + "grad_norm": 0.03271484375, + "learning_rate": 0.00018396798112594258, + "loss": 0.8022, + "num_input_tokens_seen": 74464960, + "step": 128250 + }, + { + "epoch": 19.102621388144176, + "grad_norm": 0.027587890625, + "learning_rate": 0.00018366369405981687, + "loss": 0.8085, + "num_input_tokens_seen": 74468000, + "step": 128255 + }, + { + "epoch": 19.103366100685136, + "grad_norm": 0.04931640625, + "learning_rate": 0.0001833596573006363, + "loss": 0.8058, + "num_input_tokens_seen": 74470976, + "step": 128260 + }, + { + "epoch": 19.104110813226093, + "grad_norm": 0.038330078125, + "learning_rate": 0.00018305587085353515, + "loss": 0.7899, + "num_input_tokens_seen": 74473920, + "step": 128265 + }, + { + "epoch": 19.104855525767054, + "grad_norm": 0.052001953125, + "learning_rate": 0.00018275233472364594, + "loss": 0.8055, + "num_input_tokens_seen": 74476544, + "step": 128270 + }, + { + "epoch": 19.105600238308014, + "grad_norm": 0.054931640625, + "learning_rate": 0.00018244904891609625, + "loss": 0.7782, + "num_input_tokens_seen": 74479424, + "step": 128275 + }, + { + "epoch": 19.10634495084897, + "grad_norm": 0.053466796875, + "learning_rate": 0.00018214601343600865, + "loss": 0.7882, + "num_input_tokens_seen": 74482656, + "step": 128280 + }, + { + "epoch": 19.10708966338993, + "grad_norm": 0.057373046875, + "learning_rate": 0.00018184322828850575, + "loss": 0.7863, + "num_input_tokens_seen": 74485408, + "step": 128285 + }, + { + "epoch": 19.107834375930892, + "grad_norm": 0.06396484375, + "learning_rate": 0.00018154069347870006, + "loss": 0.7954, + "num_input_tokens_seen": 74488800, + "step": 128290 + }, + { + "epoch": 19.10857908847185, + "grad_norm": 0.06591796875, + "learning_rate": 0.0001812384090117042, + "loss": 0.8325, + "num_input_tokens_seen": 74491904, + "step": 128295 + }, + { + "epoch": 19.10932380101281, + "grad_norm": 0.0289306640625, + "learning_rate": 0.00018093637489262403, + "loss": 0.7968, + "num_input_tokens_seen": 74494496, + "step": 128300 + }, + { + "epoch": 19.110068513553767, + "grad_norm": 0.052490234375, + "learning_rate": 0.00018063459112656222, + "loss": 0.798, + "num_input_tokens_seen": 74497472, + "step": 128305 + }, + { + "epoch": 19.110813226094727, + "grad_norm": 0.0400390625, + "learning_rate": 0.0001803330577186163, + "loss": 0.7914, + "num_input_tokens_seen": 74500288, + "step": 128310 + }, + { + "epoch": 19.111557938635688, + "grad_norm": 0.0556640625, + "learning_rate": 0.00018003177467388386, + "loss": 0.7889, + "num_input_tokens_seen": 74504576, + "step": 128315 + }, + { + "epoch": 19.112302651176645, + "grad_norm": 0.038818359375, + "learning_rate": 0.00017973074199745086, + "loss": 0.8044, + "num_input_tokens_seen": 74507360, + "step": 128320 + }, + { + "epoch": 19.113047363717605, + "grad_norm": 0.05908203125, + "learning_rate": 0.00017942995969440323, + "loss": 0.7932, + "num_input_tokens_seen": 74510112, + "step": 128325 + }, + { + "epoch": 19.113792076258566, + "grad_norm": 0.111328125, + "learning_rate": 0.00017912942776982522, + "loss": 0.8119, + "num_input_tokens_seen": 74512960, + "step": 128330 + }, + { + "epoch": 19.114536788799523, + "grad_norm": 0.04833984375, + "learning_rate": 0.00017882914622878942, + "loss": 0.8058, + "num_input_tokens_seen": 74515936, + "step": 128335 + }, + { + "epoch": 19.115281501340483, + "grad_norm": 0.05517578125, + "learning_rate": 0.00017852911507637513, + "loss": 0.8082, + "num_input_tokens_seen": 74518816, + "step": 128340 + }, + { + "epoch": 19.11602621388144, + "grad_norm": 0.037109375, + "learning_rate": 0.00017822933431764498, + "loss": 0.8361, + "num_input_tokens_seen": 74521472, + "step": 128345 + }, + { + "epoch": 19.1167709264224, + "grad_norm": 0.05712890625, + "learning_rate": 0.00017792980395766822, + "loss": 0.7928, + "num_input_tokens_seen": 74524352, + "step": 128350 + }, + { + "epoch": 19.11751563896336, + "grad_norm": 0.036376953125, + "learning_rate": 0.00017763052400150247, + "loss": 0.7933, + "num_input_tokens_seen": 74527264, + "step": 128355 + }, + { + "epoch": 19.11826035150432, + "grad_norm": 0.0400390625, + "learning_rate": 0.0001773314944542037, + "loss": 0.8001, + "num_input_tokens_seen": 74529952, + "step": 128360 + }, + { + "epoch": 19.11900506404528, + "grad_norm": 0.080078125, + "learning_rate": 0.00017703271532082453, + "loss": 0.7926, + "num_input_tokens_seen": 74532896, + "step": 128365 + }, + { + "epoch": 19.11974977658624, + "grad_norm": 0.04931640625, + "learning_rate": 0.00017673418660641094, + "loss": 0.8039, + "num_input_tokens_seen": 74535872, + "step": 128370 + }, + { + "epoch": 19.120494489127196, + "grad_norm": 0.037109375, + "learning_rate": 0.00017643590831600885, + "loss": 0.7992, + "num_input_tokens_seen": 74538912, + "step": 128375 + }, + { + "epoch": 19.121239201668157, + "grad_norm": 0.04248046875, + "learning_rate": 0.00017613788045465594, + "loss": 0.7981, + "num_input_tokens_seen": 74541760, + "step": 128380 + }, + { + "epoch": 19.121983914209114, + "grad_norm": 0.03662109375, + "learning_rate": 0.0001758401030273865, + "loss": 0.8015, + "num_input_tokens_seen": 74544896, + "step": 128385 + }, + { + "epoch": 19.122728626750074, + "grad_norm": 0.07568359375, + "learning_rate": 0.0001755425760392315, + "loss": 0.7834, + "num_input_tokens_seen": 74547616, + "step": 128390 + }, + { + "epoch": 19.123473339291035, + "grad_norm": 0.041748046875, + "learning_rate": 0.00017524529949522027, + "loss": 0.7976, + "num_input_tokens_seen": 74550432, + "step": 128395 + }, + { + "epoch": 19.124218051831992, + "grad_norm": 0.04833984375, + "learning_rate": 0.00017494827340037045, + "loss": 0.795, + "num_input_tokens_seen": 74553408, + "step": 128400 + }, + { + "epoch": 19.124962764372953, + "grad_norm": 0.05126953125, + "learning_rate": 0.000174651497759703, + "loss": 0.7911, + "num_input_tokens_seen": 74556480, + "step": 128405 + }, + { + "epoch": 19.12570747691391, + "grad_norm": 0.0478515625, + "learning_rate": 0.00017435497257823062, + "loss": 0.8006, + "num_input_tokens_seen": 74559296, + "step": 128410 + }, + { + "epoch": 19.12645218945487, + "grad_norm": 0.038330078125, + "learning_rate": 0.00017405869786096261, + "loss": 0.8229, + "num_input_tokens_seen": 74562400, + "step": 128415 + }, + { + "epoch": 19.12719690199583, + "grad_norm": 0.06396484375, + "learning_rate": 0.00017376267361290498, + "loss": 0.7762, + "num_input_tokens_seen": 74565344, + "step": 128420 + }, + { + "epoch": 19.127941614536788, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00017346689983905705, + "loss": 0.8204, + "num_input_tokens_seen": 74568320, + "step": 128425 + }, + { + "epoch": 19.128686327077748, + "grad_norm": 0.05126953125, + "learning_rate": 0.00017317137654441816, + "loss": 0.809, + "num_input_tokens_seen": 74571296, + "step": 128430 + }, + { + "epoch": 19.12943103961871, + "grad_norm": 0.053466796875, + "learning_rate": 0.0001728761037339793, + "loss": 0.7712, + "num_input_tokens_seen": 74574240, + "step": 128435 + }, + { + "epoch": 19.130175752159666, + "grad_norm": 0.04248046875, + "learning_rate": 0.0001725810814127282, + "loss": 0.8091, + "num_input_tokens_seen": 74576992, + "step": 128440 + }, + { + "epoch": 19.130920464700626, + "grad_norm": 0.0732421875, + "learning_rate": 0.00017228630958564917, + "loss": 0.8158, + "num_input_tokens_seen": 74579808, + "step": 128445 + }, + { + "epoch": 19.131665177241583, + "grad_norm": 0.05322265625, + "learning_rate": 0.0001719917882577232, + "loss": 0.7944, + "num_input_tokens_seen": 74582784, + "step": 128450 + }, + { + "epoch": 19.132409889782544, + "grad_norm": 0.033447265625, + "learning_rate": 0.00017169751743392636, + "loss": 0.7933, + "num_input_tokens_seen": 74585632, + "step": 128455 + }, + { + "epoch": 19.133154602323504, + "grad_norm": 0.03515625, + "learning_rate": 0.00017140349711922964, + "loss": 0.8119, + "num_input_tokens_seen": 74588288, + "step": 128460 + }, + { + "epoch": 19.13389931486446, + "grad_norm": 0.03759765625, + "learning_rate": 0.00017110972731859908, + "loss": 0.7956, + "num_input_tokens_seen": 74591424, + "step": 128465 + }, + { + "epoch": 19.13464402740542, + "grad_norm": 0.037841796875, + "learning_rate": 0.00017081620803699736, + "loss": 0.7986, + "num_input_tokens_seen": 74594368, + "step": 128470 + }, + { + "epoch": 19.135388739946382, + "grad_norm": 0.07080078125, + "learning_rate": 0.00017052293927938554, + "loss": 0.7878, + "num_input_tokens_seen": 74597056, + "step": 128475 + }, + { + "epoch": 19.13613345248734, + "grad_norm": 0.040283203125, + "learning_rate": 0.0001702299210507163, + "loss": 0.8242, + "num_input_tokens_seen": 74600000, + "step": 128480 + }, + { + "epoch": 19.1368781650283, + "grad_norm": 0.0556640625, + "learning_rate": 0.00016993715335593905, + "loss": 0.8113, + "num_input_tokens_seen": 74602944, + "step": 128485 + }, + { + "epoch": 19.137622877569257, + "grad_norm": 0.041259765625, + "learning_rate": 0.00016964463620000312, + "loss": 0.7895, + "num_input_tokens_seen": 74605760, + "step": 128490 + }, + { + "epoch": 19.138367590110217, + "grad_norm": 0.055908203125, + "learning_rate": 0.00016935236958784628, + "loss": 0.7923, + "num_input_tokens_seen": 74608800, + "step": 128495 + }, + { + "epoch": 19.139112302651178, + "grad_norm": 0.09765625, + "learning_rate": 0.00016906035352440784, + "loss": 0.7943, + "num_input_tokens_seen": 74611648, + "step": 128500 + }, + { + "epoch": 19.139857015192135, + "grad_norm": 0.05615234375, + "learning_rate": 0.00016876858801462223, + "loss": 0.807, + "num_input_tokens_seen": 74614720, + "step": 128505 + }, + { + "epoch": 19.140601727733095, + "grad_norm": 0.06640625, + "learning_rate": 0.00016847707306341718, + "loss": 0.8162, + "num_input_tokens_seen": 74617696, + "step": 128510 + }, + { + "epoch": 19.141346440274056, + "grad_norm": 0.041015625, + "learning_rate": 0.00016818580867571873, + "loss": 0.8045, + "num_input_tokens_seen": 74620512, + "step": 128515 + }, + { + "epoch": 19.142091152815013, + "grad_norm": 0.042724609375, + "learning_rate": 0.00016789479485644464, + "loss": 0.7806, + "num_input_tokens_seen": 74623424, + "step": 128520 + }, + { + "epoch": 19.142835865355973, + "grad_norm": 0.05615234375, + "learning_rate": 0.00016760403161051595, + "loss": 0.8044, + "num_input_tokens_seen": 74626272, + "step": 128525 + }, + { + "epoch": 19.14358057789693, + "grad_norm": 0.287109375, + "learning_rate": 0.00016731351894283875, + "loss": 0.814, + "num_input_tokens_seen": 74628960, + "step": 128530 + }, + { + "epoch": 19.14432529043789, + "grad_norm": 0.039794921875, + "learning_rate": 0.00016702325685832742, + "loss": 0.8052, + "num_input_tokens_seen": 74631936, + "step": 128535 + }, + { + "epoch": 19.14507000297885, + "grad_norm": 0.0869140625, + "learning_rate": 0.00016673324536188138, + "loss": 0.7944, + "num_input_tokens_seen": 74634976, + "step": 128540 + }, + { + "epoch": 19.14581471551981, + "grad_norm": 0.054443359375, + "learning_rate": 0.00016644348445840173, + "loss": 0.7974, + "num_input_tokens_seen": 74637888, + "step": 128545 + }, + { + "epoch": 19.14655942806077, + "grad_norm": 0.05126953125, + "learning_rate": 0.00016615397415278287, + "loss": 0.7972, + "num_input_tokens_seen": 74640768, + "step": 128550 + }, + { + "epoch": 19.14730414060173, + "grad_norm": 0.04443359375, + "learning_rate": 0.00016586471444991757, + "loss": 0.798, + "num_input_tokens_seen": 74643648, + "step": 128555 + }, + { + "epoch": 19.148048853142686, + "grad_norm": 0.09716796875, + "learning_rate": 0.00016557570535469024, + "loss": 0.797, + "num_input_tokens_seen": 74646720, + "step": 128560 + }, + { + "epoch": 19.148793565683647, + "grad_norm": 0.03662109375, + "learning_rate": 0.00016528694687198363, + "loss": 0.7935, + "num_input_tokens_seen": 74649312, + "step": 128565 + }, + { + "epoch": 19.149538278224604, + "grad_norm": 0.04541015625, + "learning_rate": 0.0001649984390066772, + "loss": 0.8174, + "num_input_tokens_seen": 74652256, + "step": 128570 + }, + { + "epoch": 19.150282990765565, + "grad_norm": 0.06591796875, + "learning_rate": 0.0001647101817636437, + "loss": 0.7764, + "num_input_tokens_seen": 74654976, + "step": 128575 + }, + { + "epoch": 19.151027703306525, + "grad_norm": 0.058349609375, + "learning_rate": 0.00016442217514775424, + "loss": 0.8051, + "num_input_tokens_seen": 74657856, + "step": 128580 + }, + { + "epoch": 19.151772415847482, + "grad_norm": 0.05419921875, + "learning_rate": 0.00016413441916387494, + "loss": 0.8122, + "num_input_tokens_seen": 74660832, + "step": 128585 + }, + { + "epoch": 19.152517128388443, + "grad_norm": 0.052001953125, + "learning_rate": 0.0001638469138168652, + "loss": 0.7799, + "num_input_tokens_seen": 74663488, + "step": 128590 + }, + { + "epoch": 19.1532618409294, + "grad_norm": 0.038818359375, + "learning_rate": 0.00016355965911158288, + "loss": 0.8063, + "num_input_tokens_seen": 74666432, + "step": 128595 + }, + { + "epoch": 19.15400655347036, + "grad_norm": 0.07861328125, + "learning_rate": 0.00016327265505288068, + "loss": 0.7942, + "num_input_tokens_seen": 74669184, + "step": 128600 + }, + { + "epoch": 19.15475126601132, + "grad_norm": 0.052734375, + "learning_rate": 0.00016298590164560978, + "loss": 0.7974, + "num_input_tokens_seen": 74671968, + "step": 128605 + }, + { + "epoch": 19.155495978552278, + "grad_norm": 0.043212890625, + "learning_rate": 0.0001626993988946096, + "loss": 0.8111, + "num_input_tokens_seen": 74674816, + "step": 128610 + }, + { + "epoch": 19.156240691093238, + "grad_norm": 0.03759765625, + "learning_rate": 0.00016241314680472463, + "loss": 0.7904, + "num_input_tokens_seen": 74677728, + "step": 128615 + }, + { + "epoch": 19.1569854036342, + "grad_norm": 0.0576171875, + "learning_rate": 0.0001621271453807893, + "loss": 0.7901, + "num_input_tokens_seen": 74680672, + "step": 128620 + }, + { + "epoch": 19.157730116175156, + "grad_norm": 0.0576171875, + "learning_rate": 0.00016184139462763647, + "loss": 0.8083, + "num_input_tokens_seen": 74683584, + "step": 128625 + }, + { + "epoch": 19.158474828716116, + "grad_norm": 0.048828125, + "learning_rate": 0.00016155589455009054, + "loss": 0.7842, + "num_input_tokens_seen": 74686368, + "step": 128630 + }, + { + "epoch": 19.159219541257073, + "grad_norm": 0.023681640625, + "learning_rate": 0.00016127064515297773, + "loss": 0.8035, + "num_input_tokens_seen": 74688992, + "step": 128635 + }, + { + "epoch": 19.159964253798034, + "grad_norm": 0.05029296875, + "learning_rate": 0.0001609856464411158, + "loss": 0.8193, + "num_input_tokens_seen": 74691904, + "step": 128640 + }, + { + "epoch": 19.160708966338994, + "grad_norm": 0.0556640625, + "learning_rate": 0.00016070089841931757, + "loss": 0.7972, + "num_input_tokens_seen": 74694848, + "step": 128645 + }, + { + "epoch": 19.16145367887995, + "grad_norm": 0.03662109375, + "learning_rate": 0.00016041640109239918, + "loss": 0.8342, + "num_input_tokens_seen": 74697952, + "step": 128650 + }, + { + "epoch": 19.16219839142091, + "grad_norm": 0.045654296875, + "learning_rate": 0.00016013215446516015, + "loss": 0.8265, + "num_input_tokens_seen": 74700736, + "step": 128655 + }, + { + "epoch": 19.162943103961872, + "grad_norm": 0.03759765625, + "learning_rate": 0.0001598481585424083, + "loss": 0.8066, + "num_input_tokens_seen": 74703520, + "step": 128660 + }, + { + "epoch": 19.16368781650283, + "grad_norm": 0.057861328125, + "learning_rate": 0.00015956441332893643, + "loss": 0.7938, + "num_input_tokens_seen": 74706336, + "step": 128665 + }, + { + "epoch": 19.16443252904379, + "grad_norm": 0.044921875, + "learning_rate": 0.00015928091882954243, + "loss": 0.7916, + "num_input_tokens_seen": 74709088, + "step": 128670 + }, + { + "epoch": 19.165177241584747, + "grad_norm": 0.033203125, + "learning_rate": 0.00015899767504901408, + "loss": 0.7966, + "num_input_tokens_seen": 74712224, + "step": 128675 + }, + { + "epoch": 19.165921954125707, + "grad_norm": 0.046630859375, + "learning_rate": 0.0001587146819921359, + "loss": 0.8019, + "num_input_tokens_seen": 74715200, + "step": 128680 + }, + { + "epoch": 19.166666666666668, + "grad_norm": 0.0634765625, + "learning_rate": 0.00015843193966368906, + "loss": 0.7838, + "num_input_tokens_seen": 74718176, + "step": 128685 + }, + { + "epoch": 19.167411379207625, + "grad_norm": 0.036376953125, + "learning_rate": 0.00015814944806845143, + "loss": 0.8092, + "num_input_tokens_seen": 74720928, + "step": 128690 + }, + { + "epoch": 19.168156091748585, + "grad_norm": 0.0478515625, + "learning_rate": 0.00015786720721119252, + "loss": 0.8068, + "num_input_tokens_seen": 74723936, + "step": 128695 + }, + { + "epoch": 19.168900804289546, + "grad_norm": 0.055908203125, + "learning_rate": 0.0001575852170966835, + "loss": 0.7948, + "num_input_tokens_seen": 74726784, + "step": 128700 + }, + { + "epoch": 19.169645516830503, + "grad_norm": 0.03857421875, + "learning_rate": 0.0001573034777296872, + "loss": 0.8148, + "num_input_tokens_seen": 74729408, + "step": 128705 + }, + { + "epoch": 19.170390229371463, + "grad_norm": 0.189453125, + "learning_rate": 0.00015702198911496157, + "loss": 0.8254, + "num_input_tokens_seen": 74732640, + "step": 128710 + }, + { + "epoch": 19.17113494191242, + "grad_norm": 0.051025390625, + "learning_rate": 0.00015674075125726437, + "loss": 0.7913, + "num_input_tokens_seen": 74735296, + "step": 128715 + }, + { + "epoch": 19.17187965445338, + "grad_norm": 0.03271484375, + "learning_rate": 0.00015645976416134687, + "loss": 0.8031, + "num_input_tokens_seen": 74738112, + "step": 128720 + }, + { + "epoch": 19.17262436699434, + "grad_norm": 0.025390625, + "learning_rate": 0.00015617902783195524, + "loss": 0.8035, + "num_input_tokens_seen": 74740864, + "step": 128725 + }, + { + "epoch": 19.1733690795353, + "grad_norm": 0.06787109375, + "learning_rate": 0.00015589854227383238, + "loss": 0.8023, + "num_input_tokens_seen": 74743520, + "step": 128730 + }, + { + "epoch": 19.17411379207626, + "grad_norm": 0.0439453125, + "learning_rate": 0.00015561830749171612, + "loss": 0.7863, + "num_input_tokens_seen": 74746400, + "step": 128735 + }, + { + "epoch": 19.17485850461722, + "grad_norm": 0.091796875, + "learning_rate": 0.00015533832349034105, + "loss": 0.7936, + "num_input_tokens_seen": 74749280, + "step": 128740 + }, + { + "epoch": 19.175603217158177, + "grad_norm": 0.03662109375, + "learning_rate": 0.00015505859027443834, + "loss": 0.7776, + "num_input_tokens_seen": 74752192, + "step": 128745 + }, + { + "epoch": 19.176347929699137, + "grad_norm": 0.06591796875, + "learning_rate": 0.00015477910784873427, + "loss": 0.8084, + "num_input_tokens_seen": 74755360, + "step": 128750 + }, + { + "epoch": 19.177092642240094, + "grad_norm": 0.0625, + "learning_rate": 0.0001544998762179467, + "loss": 0.8093, + "num_input_tokens_seen": 74758304, + "step": 128755 + }, + { + "epoch": 19.177837354781055, + "grad_norm": 0.03857421875, + "learning_rate": 0.00015422089538679685, + "loss": 0.8136, + "num_input_tokens_seen": 74761856, + "step": 128760 + }, + { + "epoch": 19.178582067322015, + "grad_norm": 0.050537109375, + "learning_rate": 0.00015394216535999594, + "loss": 0.7974, + "num_input_tokens_seen": 74764864, + "step": 128765 + }, + { + "epoch": 19.179326779862972, + "grad_norm": 0.0498046875, + "learning_rate": 0.00015366368614225356, + "loss": 0.7901, + "num_input_tokens_seen": 74767488, + "step": 128770 + }, + { + "epoch": 19.180071492403933, + "grad_norm": 0.03662109375, + "learning_rate": 0.00015338545773827428, + "loss": 0.8021, + "num_input_tokens_seen": 74770368, + "step": 128775 + }, + { + "epoch": 19.18081620494489, + "grad_norm": 0.037109375, + "learning_rate": 0.00015310748015275765, + "loss": 0.8021, + "num_input_tokens_seen": 74773120, + "step": 128780 + }, + { + "epoch": 19.18156091748585, + "grad_norm": 0.06005859375, + "learning_rate": 0.0001528297533904016, + "loss": 0.788, + "num_input_tokens_seen": 74776000, + "step": 128785 + }, + { + "epoch": 19.18230563002681, + "grad_norm": 0.0888671875, + "learning_rate": 0.00015255227745589406, + "loss": 0.8082, + "num_input_tokens_seen": 74778720, + "step": 128790 + }, + { + "epoch": 19.183050342567768, + "grad_norm": 0.05712890625, + "learning_rate": 0.00015227505235392792, + "loss": 0.7885, + "num_input_tokens_seen": 74781824, + "step": 128795 + }, + { + "epoch": 19.183795055108728, + "grad_norm": 0.034423828125, + "learning_rate": 0.0001519980780891844, + "loss": 0.796, + "num_input_tokens_seen": 74784800, + "step": 128800 + }, + { + "epoch": 19.18453976764969, + "grad_norm": 0.080078125, + "learning_rate": 0.0001517213546663415, + "loss": 0.7813, + "num_input_tokens_seen": 74787872, + "step": 128805 + }, + { + "epoch": 19.185284480190646, + "grad_norm": 0.05859375, + "learning_rate": 0.00015144488209007544, + "loss": 0.7794, + "num_input_tokens_seen": 74790496, + "step": 128810 + }, + { + "epoch": 19.186029192731606, + "grad_norm": 0.048095703125, + "learning_rate": 0.00015116866036505748, + "loss": 0.8014, + "num_input_tokens_seen": 74793120, + "step": 128815 + }, + { + "epoch": 19.186773905272563, + "grad_norm": 0.044189453125, + "learning_rate": 0.00015089268949595392, + "loss": 0.794, + "num_input_tokens_seen": 74795808, + "step": 128820 + }, + { + "epoch": 19.187518617813524, + "grad_norm": 0.04248046875, + "learning_rate": 0.0001506169694874243, + "loss": 0.8031, + "num_input_tokens_seen": 74798688, + "step": 128825 + }, + { + "epoch": 19.188263330354484, + "grad_norm": 0.056396484375, + "learning_rate": 0.00015034150034412997, + "loss": 0.8012, + "num_input_tokens_seen": 74801248, + "step": 128830 + }, + { + "epoch": 19.18900804289544, + "grad_norm": 0.033203125, + "learning_rate": 0.00015006628207072214, + "loss": 0.783, + "num_input_tokens_seen": 74803936, + "step": 128835 + }, + { + "epoch": 19.189752755436402, + "grad_norm": 0.03857421875, + "learning_rate": 0.0001497913146718538, + "loss": 0.8133, + "num_input_tokens_seen": 74807008, + "step": 128840 + }, + { + "epoch": 19.190497467977362, + "grad_norm": 0.02587890625, + "learning_rate": 0.0001495165981521662, + "loss": 0.7861, + "num_input_tokens_seen": 74809952, + "step": 128845 + }, + { + "epoch": 19.19124218051832, + "grad_norm": 0.03857421875, + "learning_rate": 0.00014924213251630235, + "loss": 0.7878, + "num_input_tokens_seen": 74813024, + "step": 128850 + }, + { + "epoch": 19.19198689305928, + "grad_norm": 0.06982421875, + "learning_rate": 0.00014896791776890017, + "loss": 0.8017, + "num_input_tokens_seen": 74816032, + "step": 128855 + }, + { + "epoch": 19.192731605600237, + "grad_norm": 0.04931640625, + "learning_rate": 0.00014869395391459093, + "loss": 0.7955, + "num_input_tokens_seen": 74819200, + "step": 128860 + }, + { + "epoch": 19.193476318141197, + "grad_norm": 0.0269775390625, + "learning_rate": 0.00014842024095800265, + "loss": 0.7809, + "num_input_tokens_seen": 74821888, + "step": 128865 + }, + { + "epoch": 19.194221030682158, + "grad_norm": 0.03759765625, + "learning_rate": 0.00014814677890375992, + "loss": 0.7868, + "num_input_tokens_seen": 74825952, + "step": 128870 + }, + { + "epoch": 19.194965743223115, + "grad_norm": 0.0419921875, + "learning_rate": 0.0001478735677564824, + "loss": 0.7959, + "num_input_tokens_seen": 74828896, + "step": 128875 + }, + { + "epoch": 19.195710455764075, + "grad_norm": 0.051025390625, + "learning_rate": 0.00014760060752078808, + "loss": 0.806, + "num_input_tokens_seen": 74831648, + "step": 128880 + }, + { + "epoch": 19.196455168305036, + "grad_norm": 0.04052734375, + "learning_rate": 0.00014732789820128488, + "loss": 0.8071, + "num_input_tokens_seen": 74834592, + "step": 128885 + }, + { + "epoch": 19.197199880845993, + "grad_norm": 0.05126953125, + "learning_rate": 0.00014705543980258085, + "loss": 0.7814, + "num_input_tokens_seen": 74837504, + "step": 128890 + }, + { + "epoch": 19.197944593386953, + "grad_norm": 0.0927734375, + "learning_rate": 0.0001467832323292789, + "loss": 0.7638, + "num_input_tokens_seen": 74840352, + "step": 128895 + }, + { + "epoch": 19.19868930592791, + "grad_norm": 0.05322265625, + "learning_rate": 0.00014651127578597876, + "loss": 0.7925, + "num_input_tokens_seen": 74843456, + "step": 128900 + }, + { + "epoch": 19.19943401846887, + "grad_norm": 0.142578125, + "learning_rate": 0.0001462395701772734, + "loss": 0.8455, + "num_input_tokens_seen": 74846240, + "step": 128905 + }, + { + "epoch": 19.20017873100983, + "grad_norm": 0.083984375, + "learning_rate": 0.00014596811550775411, + "loss": 0.7873, + "num_input_tokens_seen": 74849184, + "step": 128910 + }, + { + "epoch": 19.20092344355079, + "grad_norm": 0.0478515625, + "learning_rate": 0.00014569691178200727, + "loss": 0.8201, + "num_input_tokens_seen": 74852160, + "step": 128915 + }, + { + "epoch": 19.20166815609175, + "grad_norm": 0.158203125, + "learning_rate": 0.0001454259590046142, + "loss": 0.8406, + "num_input_tokens_seen": 74855136, + "step": 128920 + }, + { + "epoch": 19.202412868632706, + "grad_norm": 0.039794921875, + "learning_rate": 0.0001451552571801512, + "loss": 0.7838, + "num_input_tokens_seen": 74858272, + "step": 128925 + }, + { + "epoch": 19.203157581173667, + "grad_norm": 0.05029296875, + "learning_rate": 0.00014488480631319134, + "loss": 0.7904, + "num_input_tokens_seen": 74861440, + "step": 128930 + }, + { + "epoch": 19.203902293714627, + "grad_norm": 0.0634765625, + "learning_rate": 0.00014461460640830592, + "loss": 0.8275, + "num_input_tokens_seen": 74864160, + "step": 128935 + }, + { + "epoch": 19.204647006255584, + "grad_norm": 0.055419921875, + "learning_rate": 0.00014434465747005797, + "loss": 0.7898, + "num_input_tokens_seen": 74866976, + "step": 128940 + }, + { + "epoch": 19.205391718796545, + "grad_norm": 0.037841796875, + "learning_rate": 0.00014407495950300884, + "loss": 0.8023, + "num_input_tokens_seen": 74869664, + "step": 128945 + }, + { + "epoch": 19.206136431337505, + "grad_norm": 0.036376953125, + "learning_rate": 0.00014380551251171492, + "loss": 0.7927, + "num_input_tokens_seen": 74872480, + "step": 128950 + }, + { + "epoch": 19.206881143878462, + "grad_norm": 0.0498046875, + "learning_rate": 0.0001435363165007275, + "loss": 0.8095, + "num_input_tokens_seen": 74875232, + "step": 128955 + }, + { + "epoch": 19.207625856419423, + "grad_norm": 0.04736328125, + "learning_rate": 0.00014326737147459466, + "loss": 0.7875, + "num_input_tokens_seen": 74878080, + "step": 128960 + }, + { + "epoch": 19.20837056896038, + "grad_norm": 0.052001953125, + "learning_rate": 0.00014299867743785777, + "loss": 0.7895, + "num_input_tokens_seen": 74880992, + "step": 128965 + }, + { + "epoch": 19.20911528150134, + "grad_norm": 0.045166015625, + "learning_rate": 0.00014273023439505982, + "loss": 0.8062, + "num_input_tokens_seen": 74884288, + "step": 128970 + }, + { + "epoch": 19.2098599940423, + "grad_norm": 0.06103515625, + "learning_rate": 0.0001424620423507339, + "loss": 0.7921, + "num_input_tokens_seen": 74887008, + "step": 128975 + }, + { + "epoch": 19.210604706583258, + "grad_norm": 0.052001953125, + "learning_rate": 0.00014219410130941135, + "loss": 0.7849, + "num_input_tokens_seen": 74889856, + "step": 128980 + }, + { + "epoch": 19.21134941912422, + "grad_norm": 0.049072265625, + "learning_rate": 0.00014192641127561855, + "loss": 0.7778, + "num_input_tokens_seen": 74892544, + "step": 128985 + }, + { + "epoch": 19.21209413166518, + "grad_norm": 0.0537109375, + "learning_rate": 0.00014165897225387858, + "loss": 0.7846, + "num_input_tokens_seen": 74895360, + "step": 128990 + }, + { + "epoch": 19.212838844206136, + "grad_norm": 0.049072265625, + "learning_rate": 0.0001413917842487078, + "loss": 0.7889, + "num_input_tokens_seen": 74898240, + "step": 128995 + }, + { + "epoch": 19.213583556747096, + "grad_norm": 0.055908203125, + "learning_rate": 0.0001411248472646226, + "loss": 0.79, + "num_input_tokens_seen": 74901152, + "step": 129000 + }, + { + "epoch": 19.214328269288053, + "grad_norm": 0.06591796875, + "learning_rate": 0.00014085816130613105, + "loss": 0.7905, + "num_input_tokens_seen": 74903968, + "step": 129005 + }, + { + "epoch": 19.215072981829014, + "grad_norm": 0.0301513671875, + "learning_rate": 0.00014059172637773788, + "loss": 0.7998, + "num_input_tokens_seen": 74906816, + "step": 129010 + }, + { + "epoch": 19.215817694369974, + "grad_norm": 0.0517578125, + "learning_rate": 0.00014032554248394612, + "loss": 0.7988, + "num_input_tokens_seen": 74909632, + "step": 129015 + }, + { + "epoch": 19.21656240691093, + "grad_norm": 0.031982421875, + "learning_rate": 0.00014005960962925056, + "loss": 0.7908, + "num_input_tokens_seen": 74912224, + "step": 129020 + }, + { + "epoch": 19.217307119451892, + "grad_norm": 0.050537109375, + "learning_rate": 0.00013979392781814592, + "loss": 0.7896, + "num_input_tokens_seen": 74915200, + "step": 129025 + }, + { + "epoch": 19.218051831992852, + "grad_norm": 0.053466796875, + "learning_rate": 0.0001395284970551186, + "loss": 0.7943, + "num_input_tokens_seen": 74917984, + "step": 129030 + }, + { + "epoch": 19.21879654453381, + "grad_norm": 0.031982421875, + "learning_rate": 0.00013926331734465501, + "loss": 0.7946, + "num_input_tokens_seen": 74920928, + "step": 129035 + }, + { + "epoch": 19.21954125707477, + "grad_norm": 0.060791015625, + "learning_rate": 0.00013899838869123327, + "loss": 0.8115, + "num_input_tokens_seen": 74923520, + "step": 129040 + }, + { + "epoch": 19.220285969615727, + "grad_norm": 0.1103515625, + "learning_rate": 0.00013873371109932974, + "loss": 0.782, + "num_input_tokens_seen": 74926528, + "step": 129045 + }, + { + "epoch": 19.221030682156687, + "grad_norm": 0.0791015625, + "learning_rate": 0.0001384692845734159, + "loss": 0.8154, + "num_input_tokens_seen": 74929152, + "step": 129050 + }, + { + "epoch": 19.221775394697648, + "grad_norm": 0.0634765625, + "learning_rate": 0.00013820510911795813, + "loss": 0.7981, + "num_input_tokens_seen": 74932160, + "step": 129055 + }, + { + "epoch": 19.222520107238605, + "grad_norm": 0.041259765625, + "learning_rate": 0.00013794118473742123, + "loss": 0.806, + "num_input_tokens_seen": 74934688, + "step": 129060 + }, + { + "epoch": 19.223264819779565, + "grad_norm": 0.0517578125, + "learning_rate": 0.00013767751143626162, + "loss": 0.8143, + "num_input_tokens_seen": 74937568, + "step": 129065 + }, + { + "epoch": 19.224009532320526, + "grad_norm": 0.06689453125, + "learning_rate": 0.00013741408921893572, + "loss": 0.8036, + "num_input_tokens_seen": 74940864, + "step": 129070 + }, + { + "epoch": 19.224754244861483, + "grad_norm": 0.037353515625, + "learning_rate": 0.0001371509180898933, + "loss": 0.8106, + "num_input_tokens_seen": 74943776, + "step": 129075 + }, + { + "epoch": 19.225498957402444, + "grad_norm": 0.046875, + "learning_rate": 0.00013688799805358087, + "loss": 0.8095, + "num_input_tokens_seen": 74946592, + "step": 129080 + }, + { + "epoch": 19.2262436699434, + "grad_norm": 0.049072265625, + "learning_rate": 0.00013662532911443815, + "loss": 0.7978, + "num_input_tokens_seen": 74949696, + "step": 129085 + }, + { + "epoch": 19.22698838248436, + "grad_norm": 0.06103515625, + "learning_rate": 0.0001363629112769049, + "loss": 0.7697, + "num_input_tokens_seen": 74952512, + "step": 129090 + }, + { + "epoch": 19.22773309502532, + "grad_norm": 0.03271484375, + "learning_rate": 0.0001361007445454143, + "loss": 0.7871, + "num_input_tokens_seen": 74955296, + "step": 129095 + }, + { + "epoch": 19.22847780756628, + "grad_norm": 0.04931640625, + "learning_rate": 0.0001358388289243928, + "loss": 0.8085, + "num_input_tokens_seen": 74958208, + "step": 129100 + }, + { + "epoch": 19.22922252010724, + "grad_norm": 0.0498046875, + "learning_rate": 0.0001355771644182685, + "loss": 0.8117, + "num_input_tokens_seen": 74961056, + "step": 129105 + }, + { + "epoch": 19.229967232648196, + "grad_norm": 0.05322265625, + "learning_rate": 0.00013531575103145954, + "loss": 0.7992, + "num_input_tokens_seen": 74964064, + "step": 129110 + }, + { + "epoch": 19.230711945189157, + "grad_norm": 0.058837890625, + "learning_rate": 0.00013505458876838405, + "loss": 0.8018, + "num_input_tokens_seen": 74966848, + "step": 129115 + }, + { + "epoch": 19.231456657730117, + "grad_norm": 0.039794921875, + "learning_rate": 0.0001347936776334535, + "loss": 0.7985, + "num_input_tokens_seen": 74969664, + "step": 129120 + }, + { + "epoch": 19.232201370271074, + "grad_norm": 0.047607421875, + "learning_rate": 0.00013453301763107273, + "loss": 0.7951, + "num_input_tokens_seen": 74972384, + "step": 129125 + }, + { + "epoch": 19.232946082812035, + "grad_norm": 0.04736328125, + "learning_rate": 0.0001342726087656515, + "loss": 0.7919, + "num_input_tokens_seen": 74975360, + "step": 129130 + }, + { + "epoch": 19.233690795352995, + "grad_norm": 0.05126953125, + "learning_rate": 0.000134012451041583, + "loss": 0.8068, + "num_input_tokens_seen": 74978272, + "step": 129135 + }, + { + "epoch": 19.234435507893952, + "grad_norm": 0.0279541015625, + "learning_rate": 0.000133752544463267, + "loss": 0.807, + "num_input_tokens_seen": 74981088, + "step": 129140 + }, + { + "epoch": 19.235180220434913, + "grad_norm": 0.0810546875, + "learning_rate": 0.0001334928890350917, + "loss": 0.814, + "num_input_tokens_seen": 74983840, + "step": 129145 + }, + { + "epoch": 19.23592493297587, + "grad_norm": 0.03955078125, + "learning_rate": 0.0001332334847614436, + "loss": 0.801, + "num_input_tokens_seen": 74986784, + "step": 129150 + }, + { + "epoch": 19.23666964551683, + "grad_norm": 0.058837890625, + "learning_rate": 0.00013297433164670747, + "loss": 0.7979, + "num_input_tokens_seen": 74989472, + "step": 129155 + }, + { + "epoch": 19.23741435805779, + "grad_norm": 0.05078125, + "learning_rate": 0.00013271542969525818, + "loss": 0.7987, + "num_input_tokens_seen": 74992416, + "step": 129160 + }, + { + "epoch": 19.238159070598748, + "grad_norm": 0.0400390625, + "learning_rate": 0.00013245677891147222, + "loss": 0.7949, + "num_input_tokens_seen": 74995392, + "step": 129165 + }, + { + "epoch": 19.23890378313971, + "grad_norm": 0.041015625, + "learning_rate": 0.00013219837929971778, + "loss": 0.7922, + "num_input_tokens_seen": 74998112, + "step": 129170 + }, + { + "epoch": 19.23964849568067, + "grad_norm": 0.041259765625, + "learning_rate": 0.00013194023086436133, + "loss": 0.7865, + "num_input_tokens_seen": 75001024, + "step": 129175 + }, + { + "epoch": 19.240393208221626, + "grad_norm": 0.0537109375, + "learning_rate": 0.00013168233360976277, + "loss": 0.7955, + "num_input_tokens_seen": 75004256, + "step": 129180 + }, + { + "epoch": 19.241137920762586, + "grad_norm": 0.0361328125, + "learning_rate": 0.00013142468754028024, + "loss": 0.7797, + "num_input_tokens_seen": 75007040, + "step": 129185 + }, + { + "epoch": 19.241882633303543, + "grad_norm": 0.035888671875, + "learning_rate": 0.00013116729266026528, + "loss": 0.7867, + "num_input_tokens_seen": 75009888, + "step": 129190 + }, + { + "epoch": 19.242627345844504, + "grad_norm": 0.042724609375, + "learning_rate": 0.0001309101489740677, + "loss": 0.8038, + "num_input_tokens_seen": 75012448, + "step": 129195 + }, + { + "epoch": 19.243372058385464, + "grad_norm": 0.040771484375, + "learning_rate": 0.00013065325648603076, + "loss": 0.8028, + "num_input_tokens_seen": 75015360, + "step": 129200 + }, + { + "epoch": 19.24411677092642, + "grad_norm": 0.036376953125, + "learning_rate": 0.00013039661520049427, + "loss": 0.8018, + "num_input_tokens_seen": 75018464, + "step": 129205 + }, + { + "epoch": 19.244861483467382, + "grad_norm": 0.04296875, + "learning_rate": 0.0001301402251217948, + "loss": 0.7961, + "num_input_tokens_seen": 75021760, + "step": 129210 + }, + { + "epoch": 19.245606196008342, + "grad_norm": 0.0498046875, + "learning_rate": 0.00012988408625426217, + "loss": 0.8046, + "num_input_tokens_seen": 75024864, + "step": 129215 + }, + { + "epoch": 19.2463509085493, + "grad_norm": 0.051513671875, + "learning_rate": 0.00012962819860222463, + "loss": 0.7812, + "num_input_tokens_seen": 75027296, + "step": 129220 + }, + { + "epoch": 19.24709562109026, + "grad_norm": 0.06201171875, + "learning_rate": 0.00012937256217000537, + "loss": 0.7727, + "num_input_tokens_seen": 75030144, + "step": 129225 + }, + { + "epoch": 19.247840333631217, + "grad_norm": 0.032470703125, + "learning_rate": 0.00012911717696192427, + "loss": 0.8139, + "num_input_tokens_seen": 75033088, + "step": 129230 + }, + { + "epoch": 19.248585046172177, + "grad_norm": 0.0537109375, + "learning_rate": 0.0001288620429822912, + "loss": 0.7939, + "num_input_tokens_seen": 75035968, + "step": 129235 + }, + { + "epoch": 19.249329758713138, + "grad_norm": 0.07470703125, + "learning_rate": 0.00012860716023542273, + "loss": 0.7785, + "num_input_tokens_seen": 75038944, + "step": 129240 + }, + { + "epoch": 19.250074471254095, + "grad_norm": 0.058349609375, + "learning_rate": 0.00012835252872562042, + "loss": 0.7943, + "num_input_tokens_seen": 75041984, + "step": 129245 + }, + { + "epoch": 19.250819183795056, + "grad_norm": 0.046630859375, + "learning_rate": 0.0001280981484571858, + "loss": 0.8077, + "num_input_tokens_seen": 75044736, + "step": 129250 + }, + { + "epoch": 19.251563896336016, + "grad_norm": 0.0625, + "learning_rate": 0.00012784401943442047, + "loss": 0.8036, + "num_input_tokens_seen": 75047808, + "step": 129255 + }, + { + "epoch": 19.252308608876973, + "grad_norm": 0.045654296875, + "learning_rate": 0.00012759014166161263, + "loss": 0.8073, + "num_input_tokens_seen": 75050688, + "step": 129260 + }, + { + "epoch": 19.253053321417934, + "grad_norm": 0.07861328125, + "learning_rate": 0.00012733651514305555, + "loss": 0.8086, + "num_input_tokens_seen": 75053760, + "step": 129265 + }, + { + "epoch": 19.25379803395889, + "grad_norm": 0.064453125, + "learning_rate": 0.00012708313988303078, + "loss": 0.7939, + "num_input_tokens_seen": 75056544, + "step": 129270 + }, + { + "epoch": 19.25454274649985, + "grad_norm": 0.060791015625, + "learning_rate": 0.00012683001588582155, + "loss": 0.7955, + "num_input_tokens_seen": 75059520, + "step": 129275 + }, + { + "epoch": 19.25528745904081, + "grad_norm": 0.0458984375, + "learning_rate": 0.0001265771431557028, + "loss": 0.7926, + "num_input_tokens_seen": 75062208, + "step": 129280 + }, + { + "epoch": 19.25603217158177, + "grad_norm": 0.06982421875, + "learning_rate": 0.00012632452169694607, + "loss": 0.7783, + "num_input_tokens_seen": 75065184, + "step": 129285 + }, + { + "epoch": 19.25677688412273, + "grad_norm": 0.053955078125, + "learning_rate": 0.00012607215151381966, + "loss": 0.7982, + "num_input_tokens_seen": 75067904, + "step": 129290 + }, + { + "epoch": 19.257521596663686, + "grad_norm": 0.10888671875, + "learning_rate": 0.00012582003261058682, + "loss": 0.8093, + "num_input_tokens_seen": 75070848, + "step": 129295 + }, + { + "epoch": 19.258266309204647, + "grad_norm": 0.04736328125, + "learning_rate": 0.00012556816499150746, + "loss": 0.7808, + "num_input_tokens_seen": 75073984, + "step": 129300 + }, + { + "epoch": 19.259011021745607, + "grad_norm": 0.06494140625, + "learning_rate": 0.00012531654866083485, + "loss": 0.8046, + "num_input_tokens_seen": 75076768, + "step": 129305 + }, + { + "epoch": 19.259755734286564, + "grad_norm": 0.0390625, + "learning_rate": 0.00012506518362282227, + "loss": 0.8015, + "num_input_tokens_seen": 75079968, + "step": 129310 + }, + { + "epoch": 19.260500446827525, + "grad_norm": 0.037109375, + "learning_rate": 0.00012481406988171462, + "loss": 0.7875, + "num_input_tokens_seen": 75082592, + "step": 129315 + }, + { + "epoch": 19.261245159368485, + "grad_norm": 0.048095703125, + "learning_rate": 0.00012456320744175687, + "loss": 0.8142, + "num_input_tokens_seen": 75085696, + "step": 129320 + }, + { + "epoch": 19.261989871909442, + "grad_norm": 0.050048828125, + "learning_rate": 0.00012431259630718227, + "loss": 0.7838, + "num_input_tokens_seen": 75089184, + "step": 129325 + }, + { + "epoch": 19.262734584450403, + "grad_norm": 0.04638671875, + "learning_rate": 0.00012406223648222912, + "loss": 0.8058, + "num_input_tokens_seen": 75091872, + "step": 129330 + }, + { + "epoch": 19.26347929699136, + "grad_norm": 0.03173828125, + "learning_rate": 0.00012381212797112572, + "loss": 0.8059, + "num_input_tokens_seen": 75094944, + "step": 129335 + }, + { + "epoch": 19.26422400953232, + "grad_norm": 0.037841796875, + "learning_rate": 0.00012356227077809533, + "loss": 0.7953, + "num_input_tokens_seen": 75097568, + "step": 129340 + }, + { + "epoch": 19.26496872207328, + "grad_norm": 0.04833984375, + "learning_rate": 0.00012331266490736125, + "loss": 0.7809, + "num_input_tokens_seen": 75100160, + "step": 129345 + }, + { + "epoch": 19.265713434614238, + "grad_norm": 0.03955078125, + "learning_rate": 0.0001230633103631401, + "loss": 0.7956, + "num_input_tokens_seen": 75103232, + "step": 129350 + }, + { + "epoch": 19.2664581471552, + "grad_norm": 0.045166015625, + "learning_rate": 0.00012281420714964353, + "loss": 0.7971, + "num_input_tokens_seen": 75106656, + "step": 129355 + }, + { + "epoch": 19.26720285969616, + "grad_norm": 0.025634765625, + "learning_rate": 0.00012256535527108148, + "loss": 0.8313, + "num_input_tokens_seen": 75109504, + "step": 129360 + }, + { + "epoch": 19.267947572237116, + "grad_norm": 0.064453125, + "learning_rate": 0.0001223167547316556, + "loss": 0.7762, + "num_input_tokens_seen": 75112384, + "step": 129365 + }, + { + "epoch": 19.268692284778076, + "grad_norm": 0.07861328125, + "learning_rate": 0.00012206840553556753, + "loss": 0.8116, + "num_input_tokens_seen": 75115648, + "step": 129370 + }, + { + "epoch": 19.269436997319033, + "grad_norm": 0.033935546875, + "learning_rate": 0.00012182030768701223, + "loss": 0.7923, + "num_input_tokens_seen": 75118496, + "step": 129375 + }, + { + "epoch": 19.270181709859994, + "grad_norm": 0.032958984375, + "learning_rate": 0.00012157246119018139, + "loss": 0.799, + "num_input_tokens_seen": 75121088, + "step": 129380 + }, + { + "epoch": 19.270926422400954, + "grad_norm": 0.07421875, + "learning_rate": 0.00012132486604926329, + "loss": 0.7971, + "num_input_tokens_seen": 75123776, + "step": 129385 + }, + { + "epoch": 19.27167113494191, + "grad_norm": 0.053466796875, + "learning_rate": 0.00012107752226843792, + "loss": 0.7996, + "num_input_tokens_seen": 75126912, + "step": 129390 + }, + { + "epoch": 19.272415847482872, + "grad_norm": 0.027099609375, + "learning_rate": 0.00012083042985188696, + "loss": 0.8045, + "num_input_tokens_seen": 75129696, + "step": 129395 + }, + { + "epoch": 19.273160560023832, + "grad_norm": 0.026123046875, + "learning_rate": 0.00012058358880378205, + "loss": 0.8028, + "num_input_tokens_seen": 75132512, + "step": 129400 + }, + { + "epoch": 19.27390527256479, + "grad_norm": 0.0361328125, + "learning_rate": 0.00012033699912829653, + "loss": 0.7883, + "num_input_tokens_seen": 75135360, + "step": 129405 + }, + { + "epoch": 19.27464998510575, + "grad_norm": 0.05859375, + "learning_rate": 0.00012009066082959373, + "loss": 0.8094, + "num_input_tokens_seen": 75138240, + "step": 129410 + }, + { + "epoch": 19.275394697646707, + "grad_norm": 0.033447265625, + "learning_rate": 0.00011984457391183367, + "loss": 0.7738, + "num_input_tokens_seen": 75141152, + "step": 129415 + }, + { + "epoch": 19.276139410187668, + "grad_norm": 0.0458984375, + "learning_rate": 0.000119598738379178, + "loss": 0.7824, + "num_input_tokens_seen": 75143840, + "step": 129420 + }, + { + "epoch": 19.276884122728628, + "grad_norm": 0.057861328125, + "learning_rate": 0.00011935315423577841, + "loss": 0.7731, + "num_input_tokens_seen": 75146592, + "step": 129425 + }, + { + "epoch": 19.277628835269585, + "grad_norm": 0.0634765625, + "learning_rate": 0.00011910782148578158, + "loss": 0.7863, + "num_input_tokens_seen": 75149216, + "step": 129430 + }, + { + "epoch": 19.278373547810546, + "grad_norm": 0.03857421875, + "learning_rate": 0.00011886274013333586, + "loss": 0.786, + "num_input_tokens_seen": 75151744, + "step": 129435 + }, + { + "epoch": 19.279118260351503, + "grad_norm": 0.0361328125, + "learning_rate": 0.00011861791018257794, + "loss": 0.7894, + "num_input_tokens_seen": 75154688, + "step": 129440 + }, + { + "epoch": 19.279862972892463, + "grad_norm": 0.05615234375, + "learning_rate": 0.00011837333163764618, + "loss": 0.7889, + "num_input_tokens_seen": 75157408, + "step": 129445 + }, + { + "epoch": 19.280607685433424, + "grad_norm": 0.044921875, + "learning_rate": 0.00011812900450267228, + "loss": 0.7952, + "num_input_tokens_seen": 75160352, + "step": 129450 + }, + { + "epoch": 19.28135239797438, + "grad_norm": 0.0556640625, + "learning_rate": 0.00011788492878178291, + "loss": 0.79, + "num_input_tokens_seen": 75163328, + "step": 129455 + }, + { + "epoch": 19.28209711051534, + "grad_norm": 0.052734375, + "learning_rate": 0.00011764110447910314, + "loss": 0.7929, + "num_input_tokens_seen": 75166176, + "step": 129460 + }, + { + "epoch": 19.2828418230563, + "grad_norm": 0.025390625, + "learning_rate": 0.00011739753159874965, + "loss": 0.792, + "num_input_tokens_seen": 75168896, + "step": 129465 + }, + { + "epoch": 19.28358653559726, + "grad_norm": 0.046875, + "learning_rate": 0.00011715421014484084, + "loss": 0.7908, + "num_input_tokens_seen": 75172160, + "step": 129470 + }, + { + "epoch": 19.28433124813822, + "grad_norm": 0.369140625, + "learning_rate": 0.00011691114012148506, + "loss": 0.8161, + "num_input_tokens_seen": 75175072, + "step": 129475 + }, + { + "epoch": 19.285075960679176, + "grad_norm": 0.038330078125, + "learning_rate": 0.00011666832153278906, + "loss": 0.8095, + "num_input_tokens_seen": 75178144, + "step": 129480 + }, + { + "epoch": 19.285820673220137, + "grad_norm": 0.04052734375, + "learning_rate": 0.00011642575438285451, + "loss": 0.7796, + "num_input_tokens_seen": 75180800, + "step": 129485 + }, + { + "epoch": 19.286565385761097, + "grad_norm": 0.0419921875, + "learning_rate": 0.00011618343867577818, + "loss": 0.7947, + "num_input_tokens_seen": 75183360, + "step": 129490 + }, + { + "epoch": 19.287310098302054, + "grad_norm": 0.040283203125, + "learning_rate": 0.00011594137441565844, + "loss": 0.8096, + "num_input_tokens_seen": 75186240, + "step": 129495 + }, + { + "epoch": 19.288054810843015, + "grad_norm": 0.053955078125, + "learning_rate": 0.00011569956160657868, + "loss": 0.7949, + "num_input_tokens_seen": 75189088, + "step": 129500 + }, + { + "epoch": 19.288799523383975, + "grad_norm": 0.04296875, + "learning_rate": 0.00011545800025262898, + "loss": 0.8041, + "num_input_tokens_seen": 75192064, + "step": 129505 + }, + { + "epoch": 19.289544235924932, + "grad_norm": 0.0341796875, + "learning_rate": 0.00011521669035788772, + "loss": 0.8078, + "num_input_tokens_seen": 75195136, + "step": 129510 + }, + { + "epoch": 19.290288948465893, + "grad_norm": 0.053955078125, + "learning_rate": 0.00011497563192643334, + "loss": 0.8017, + "num_input_tokens_seen": 75198080, + "step": 129515 + }, + { + "epoch": 19.29103366100685, + "grad_norm": 0.03564453125, + "learning_rate": 0.00011473482496233589, + "loss": 0.801, + "num_input_tokens_seen": 75201184, + "step": 129520 + }, + { + "epoch": 19.29177837354781, + "grad_norm": 0.031494140625, + "learning_rate": 0.00011449426946966545, + "loss": 0.8088, + "num_input_tokens_seen": 75204064, + "step": 129525 + }, + { + "epoch": 19.29252308608877, + "grad_norm": 0.057861328125, + "learning_rate": 0.00011425396545248545, + "loss": 0.7945, + "num_input_tokens_seen": 75207328, + "step": 129530 + }, + { + "epoch": 19.293267798629728, + "grad_norm": 0.10595703125, + "learning_rate": 0.00011401391291485596, + "loss": 0.8012, + "num_input_tokens_seen": 75210400, + "step": 129535 + }, + { + "epoch": 19.29401251117069, + "grad_norm": 0.07177734375, + "learning_rate": 0.00011377411186083208, + "loss": 0.8014, + "num_input_tokens_seen": 75213248, + "step": 129540 + }, + { + "epoch": 19.29475722371165, + "grad_norm": 0.03515625, + "learning_rate": 0.0001135345622944639, + "loss": 0.7998, + "num_input_tokens_seen": 75215936, + "step": 129545 + }, + { + "epoch": 19.295501936252606, + "grad_norm": 0.058837890625, + "learning_rate": 0.00011329526421979985, + "loss": 0.7949, + "num_input_tokens_seen": 75218464, + "step": 129550 + }, + { + "epoch": 19.296246648793566, + "grad_norm": 0.06591796875, + "learning_rate": 0.00011305621764088169, + "loss": 0.7924, + "num_input_tokens_seen": 75221024, + "step": 129555 + }, + { + "epoch": 19.296991361334523, + "grad_norm": 0.05615234375, + "learning_rate": 0.00011281742256174954, + "loss": 0.7895, + "num_input_tokens_seen": 75223808, + "step": 129560 + }, + { + "epoch": 19.297736073875484, + "grad_norm": 0.053955078125, + "learning_rate": 0.00011257887898643514, + "loss": 0.782, + "num_input_tokens_seen": 75226784, + "step": 129565 + }, + { + "epoch": 19.298480786416444, + "grad_norm": 0.036865234375, + "learning_rate": 0.00011234058691896864, + "loss": 0.798, + "num_input_tokens_seen": 75229632, + "step": 129570 + }, + { + "epoch": 19.2992254989574, + "grad_norm": 0.04052734375, + "learning_rate": 0.00011210254636337845, + "loss": 0.8014, + "num_input_tokens_seen": 75232608, + "step": 129575 + }, + { + "epoch": 19.299970211498362, + "grad_norm": 0.036865234375, + "learning_rate": 0.00011186475732368472, + "loss": 0.8041, + "num_input_tokens_seen": 75235264, + "step": 129580 + }, + { + "epoch": 19.300714924039323, + "grad_norm": 0.043212890625, + "learning_rate": 0.00011162721980390421, + "loss": 0.815, + "num_input_tokens_seen": 75238080, + "step": 129585 + }, + { + "epoch": 19.30145963658028, + "grad_norm": 0.039306640625, + "learning_rate": 0.00011138993380804874, + "loss": 0.8007, + "num_input_tokens_seen": 75240832, + "step": 129590 + }, + { + "epoch": 19.30220434912124, + "grad_norm": 0.0556640625, + "learning_rate": 0.00011115289934012839, + "loss": 0.7985, + "num_input_tokens_seen": 75243744, + "step": 129595 + }, + { + "epoch": 19.302949061662197, + "grad_norm": 0.029541015625, + "learning_rate": 0.00011091611640414833, + "loss": 0.7933, + "num_input_tokens_seen": 75246688, + "step": 129600 + }, + { + "epoch": 19.303693774203158, + "grad_norm": 0.034423828125, + "learning_rate": 0.00011067958500410701, + "loss": 0.8066, + "num_input_tokens_seen": 75249856, + "step": 129605 + }, + { + "epoch": 19.304438486744118, + "grad_norm": 0.044189453125, + "learning_rate": 0.0001104433051440029, + "loss": 0.8037, + "num_input_tokens_seen": 75252640, + "step": 129610 + }, + { + "epoch": 19.305183199285075, + "grad_norm": 0.05615234375, + "learning_rate": 0.0001102072768278245, + "loss": 0.8167, + "num_input_tokens_seen": 75255648, + "step": 129615 + }, + { + "epoch": 19.305927911826036, + "grad_norm": 0.07568359375, + "learning_rate": 0.00010997150005956191, + "loss": 0.7907, + "num_input_tokens_seen": 75258592, + "step": 129620 + }, + { + "epoch": 19.306672624366993, + "grad_norm": 0.0546875, + "learning_rate": 0.0001097359748431953, + "loss": 0.8336, + "num_input_tokens_seen": 75261472, + "step": 129625 + }, + { + "epoch": 19.307417336907953, + "grad_norm": 0.07470703125, + "learning_rate": 0.00010950070118270816, + "loss": 0.8076, + "num_input_tokens_seen": 75264288, + "step": 129630 + }, + { + "epoch": 19.308162049448914, + "grad_norm": 0.041015625, + "learning_rate": 0.00010926567908207062, + "loss": 0.7928, + "num_input_tokens_seen": 75267328, + "step": 129635 + }, + { + "epoch": 19.30890676198987, + "grad_norm": 0.0830078125, + "learning_rate": 0.0001090309085452562, + "loss": 0.801, + "num_input_tokens_seen": 75270272, + "step": 129640 + }, + { + "epoch": 19.30965147453083, + "grad_norm": 0.053955078125, + "learning_rate": 0.00010879638957623005, + "loss": 0.7891, + "num_input_tokens_seen": 75273120, + "step": 129645 + }, + { + "epoch": 19.31039618707179, + "grad_norm": 0.072265625, + "learning_rate": 0.000108562122178954, + "loss": 0.7973, + "num_input_tokens_seen": 75276352, + "step": 129650 + }, + { + "epoch": 19.31114089961275, + "grad_norm": 0.1240234375, + "learning_rate": 0.00010832810635738654, + "loss": 0.786, + "num_input_tokens_seen": 75279232, + "step": 129655 + }, + { + "epoch": 19.31188561215371, + "grad_norm": 0.06982421875, + "learning_rate": 0.00010809434211547953, + "loss": 0.7878, + "num_input_tokens_seen": 75281984, + "step": 129660 + }, + { + "epoch": 19.312630324694666, + "grad_norm": 0.0517578125, + "learning_rate": 0.00010786082945718311, + "loss": 0.8081, + "num_input_tokens_seen": 75284832, + "step": 129665 + }, + { + "epoch": 19.313375037235627, + "grad_norm": 0.054443359375, + "learning_rate": 0.00010762756838644249, + "loss": 0.785, + "num_input_tokens_seen": 75287680, + "step": 129670 + }, + { + "epoch": 19.314119749776587, + "grad_norm": 0.04296875, + "learning_rate": 0.00010739455890719785, + "loss": 0.8012, + "num_input_tokens_seen": 75290400, + "step": 129675 + }, + { + "epoch": 19.314864462317544, + "grad_norm": 0.050048828125, + "learning_rate": 0.00010716180102338767, + "loss": 0.796, + "num_input_tokens_seen": 75293472, + "step": 129680 + }, + { + "epoch": 19.315609174858505, + "grad_norm": 0.03564453125, + "learning_rate": 0.00010692929473894052, + "loss": 0.7985, + "num_input_tokens_seen": 75296352, + "step": 129685 + }, + { + "epoch": 19.316353887399465, + "grad_norm": 0.03466796875, + "learning_rate": 0.00010669704005778823, + "loss": 0.8024, + "num_input_tokens_seen": 75299456, + "step": 129690 + }, + { + "epoch": 19.317098599940422, + "grad_norm": 0.054443359375, + "learning_rate": 0.000106465036983851, + "loss": 0.7936, + "num_input_tokens_seen": 75302272, + "step": 129695 + }, + { + "epoch": 19.317843312481383, + "grad_norm": 0.046142578125, + "learning_rate": 0.00010623328552105072, + "loss": 0.7961, + "num_input_tokens_seen": 75305184, + "step": 129700 + }, + { + "epoch": 19.31858802502234, + "grad_norm": 0.052734375, + "learning_rate": 0.00010600178567329921, + "loss": 0.7932, + "num_input_tokens_seen": 75307936, + "step": 129705 + }, + { + "epoch": 19.3193327375633, + "grad_norm": 0.06494140625, + "learning_rate": 0.00010577053744451337, + "loss": 0.8193, + "num_input_tokens_seen": 75310848, + "step": 129710 + }, + { + "epoch": 19.32007745010426, + "grad_norm": 0.05810546875, + "learning_rate": 0.0001055395408385934, + "loss": 0.802, + "num_input_tokens_seen": 75313664, + "step": 129715 + }, + { + "epoch": 19.320822162645218, + "grad_norm": 0.038818359375, + "learning_rate": 0.00010530879585944786, + "loss": 0.7769, + "num_input_tokens_seen": 75316640, + "step": 129720 + }, + { + "epoch": 19.32156687518618, + "grad_norm": 0.049072265625, + "learning_rate": 0.00010507830251097028, + "loss": 0.8007, + "num_input_tokens_seen": 75319680, + "step": 129725 + }, + { + "epoch": 19.32231158772714, + "grad_norm": 0.0810546875, + "learning_rate": 0.00010484806079705588, + "loss": 0.8097, + "num_input_tokens_seen": 75322912, + "step": 129730 + }, + { + "epoch": 19.323056300268096, + "grad_norm": 0.06103515625, + "learning_rate": 0.00010461807072159657, + "loss": 0.8025, + "num_input_tokens_seen": 75326272, + "step": 129735 + }, + { + "epoch": 19.323801012809056, + "grad_norm": 0.0703125, + "learning_rate": 0.00010438833228847422, + "loss": 0.8062, + "num_input_tokens_seen": 75329248, + "step": 129740 + }, + { + "epoch": 19.324545725350013, + "grad_norm": 0.04541015625, + "learning_rate": 0.00010415884550157239, + "loss": 0.8146, + "num_input_tokens_seen": 75332128, + "step": 129745 + }, + { + "epoch": 19.325290437890974, + "grad_norm": 0.060302734375, + "learning_rate": 0.00010392961036476799, + "loss": 0.7818, + "num_input_tokens_seen": 75334784, + "step": 129750 + }, + { + "epoch": 19.326035150431935, + "grad_norm": 0.052978515625, + "learning_rate": 0.00010370062688193293, + "loss": 0.8022, + "num_input_tokens_seen": 75337888, + "step": 129755 + }, + { + "epoch": 19.32677986297289, + "grad_norm": 0.0247802734375, + "learning_rate": 0.0001034718950569341, + "loss": 0.7956, + "num_input_tokens_seen": 75340672, + "step": 129760 + }, + { + "epoch": 19.327524575513852, + "grad_norm": 0.025390625, + "learning_rate": 0.00010324341489364007, + "loss": 0.8147, + "num_input_tokens_seen": 75343520, + "step": 129765 + }, + { + "epoch": 19.328269288054813, + "grad_norm": 0.048828125, + "learning_rate": 0.00010301518639590612, + "loss": 0.8186, + "num_input_tokens_seen": 75346304, + "step": 129770 + }, + { + "epoch": 19.32901400059577, + "grad_norm": 0.039794921875, + "learning_rate": 0.0001027872095675908, + "loss": 0.8002, + "num_input_tokens_seen": 75349024, + "step": 129775 + }, + { + "epoch": 19.32975871313673, + "grad_norm": 0.044189453125, + "learning_rate": 0.00010255948441254436, + "loss": 0.7937, + "num_input_tokens_seen": 75352064, + "step": 129780 + }, + { + "epoch": 19.330503425677687, + "grad_norm": 0.033203125, + "learning_rate": 0.0001023320109346154, + "loss": 0.7948, + "num_input_tokens_seen": 75354880, + "step": 129785 + }, + { + "epoch": 19.331248138218648, + "grad_norm": 0.03662109375, + "learning_rate": 0.0001021047891376442, + "loss": 0.8143, + "num_input_tokens_seen": 75357600, + "step": 129790 + }, + { + "epoch": 19.331992850759608, + "grad_norm": 0.044189453125, + "learning_rate": 0.00010187781902547266, + "loss": 0.8049, + "num_input_tokens_seen": 75360384, + "step": 129795 + }, + { + "epoch": 19.332737563300565, + "grad_norm": 0.041259765625, + "learning_rate": 0.00010165110060193272, + "loss": 0.7978, + "num_input_tokens_seen": 75363296, + "step": 129800 + }, + { + "epoch": 19.333482275841526, + "grad_norm": 0.103515625, + "learning_rate": 0.00010142463387085465, + "loss": 0.7877, + "num_input_tokens_seen": 75366080, + "step": 129805 + }, + { + "epoch": 19.334226988382483, + "grad_norm": 0.042724609375, + "learning_rate": 0.00010119841883606705, + "loss": 0.7727, + "num_input_tokens_seen": 75369216, + "step": 129810 + }, + { + "epoch": 19.334971700923443, + "grad_norm": 0.054443359375, + "learning_rate": 0.00010097245550138856, + "loss": 0.7878, + "num_input_tokens_seen": 75371968, + "step": 129815 + }, + { + "epoch": 19.335716413464404, + "grad_norm": 0.09033203125, + "learning_rate": 0.00010074674387063775, + "loss": 0.7886, + "num_input_tokens_seen": 75374784, + "step": 129820 + }, + { + "epoch": 19.33646112600536, + "grad_norm": 0.07861328125, + "learning_rate": 0.00010052128394762826, + "loss": 0.795, + "num_input_tokens_seen": 75377888, + "step": 129825 + }, + { + "epoch": 19.33720583854632, + "grad_norm": 0.0478515625, + "learning_rate": 0.00010029607573616706, + "loss": 0.8055, + "num_input_tokens_seen": 75380672, + "step": 129830 + }, + { + "epoch": 19.337950551087282, + "grad_norm": 0.039306640625, + "learning_rate": 0.00010007111924006273, + "loss": 0.7971, + "num_input_tokens_seen": 75383552, + "step": 129835 + }, + { + "epoch": 19.33869526362824, + "grad_norm": 0.04345703125, + "learning_rate": 9.98464144631106e-05, + "loss": 0.8049, + "num_input_tokens_seen": 75386688, + "step": 129840 + }, + { + "epoch": 19.3394399761692, + "grad_norm": 0.042236328125, + "learning_rate": 9.962196140911095e-05, + "loss": 0.8039, + "num_input_tokens_seen": 75389248, + "step": 129845 + }, + { + "epoch": 19.340184688710156, + "grad_norm": 0.0537109375, + "learning_rate": 9.939776008185408e-05, + "loss": 0.7969, + "num_input_tokens_seen": 75392192, + "step": 129850 + }, + { + "epoch": 19.340929401251117, + "grad_norm": 0.0810546875, + "learning_rate": 9.917381048512863e-05, + "loss": 0.805, + "num_input_tokens_seen": 75395072, + "step": 129855 + }, + { + "epoch": 19.341674113792077, + "grad_norm": 0.04931640625, + "learning_rate": 9.895011262271491e-05, + "loss": 0.8144, + "num_input_tokens_seen": 75398112, + "step": 129860 + }, + { + "epoch": 19.342418826333034, + "grad_norm": 0.07958984375, + "learning_rate": 9.872666649839656e-05, + "loss": 0.8187, + "num_input_tokens_seen": 75400960, + "step": 129865 + }, + { + "epoch": 19.343163538873995, + "grad_norm": 0.095703125, + "learning_rate": 9.85034721159439e-05, + "loss": 0.7836, + "num_input_tokens_seen": 75404032, + "step": 129870 + }, + { + "epoch": 19.343908251414955, + "grad_norm": 0.046875, + "learning_rate": 9.828052947913224e-05, + "loss": 0.7963, + "num_input_tokens_seen": 75406816, + "step": 129875 + }, + { + "epoch": 19.344652963955912, + "grad_norm": 0.025634765625, + "learning_rate": 9.805783859172524e-05, + "loss": 0.7992, + "num_input_tokens_seen": 75409952, + "step": 129880 + }, + { + "epoch": 19.345397676496873, + "grad_norm": 0.0361328125, + "learning_rate": 9.783539945748487e-05, + "loss": 0.8014, + "num_input_tokens_seen": 75412640, + "step": 129885 + }, + { + "epoch": 19.34614238903783, + "grad_norm": 0.049560546875, + "learning_rate": 9.761321208016815e-05, + "loss": 0.7809, + "num_input_tokens_seen": 75415488, + "step": 129890 + }, + { + "epoch": 19.34688710157879, + "grad_norm": 0.037353515625, + "learning_rate": 9.739127646353207e-05, + "loss": 0.7995, + "num_input_tokens_seen": 75418240, + "step": 129895 + }, + { + "epoch": 19.34763181411975, + "grad_norm": 0.038330078125, + "learning_rate": 9.716959261132196e-05, + "loss": 0.7983, + "num_input_tokens_seen": 75421440, + "step": 129900 + }, + { + "epoch": 19.348376526660708, + "grad_norm": 0.031982421875, + "learning_rate": 9.69481605272865e-05, + "loss": 0.8035, + "num_input_tokens_seen": 75424256, + "step": 129905 + }, + { + "epoch": 19.34912123920167, + "grad_norm": 0.050048828125, + "learning_rate": 9.672698021516602e-05, + "loss": 0.8055, + "num_input_tokens_seen": 75427072, + "step": 129910 + }, + { + "epoch": 19.34986595174263, + "grad_norm": 0.045654296875, + "learning_rate": 9.650605167869419e-05, + "loss": 0.7957, + "num_input_tokens_seen": 75429568, + "step": 129915 + }, + { + "epoch": 19.350610664283586, + "grad_norm": 0.11474609375, + "learning_rate": 9.628537492160638e-05, + "loss": 0.8016, + "num_input_tokens_seen": 75432416, + "step": 129920 + }, + { + "epoch": 19.351355376824547, + "grad_norm": 0.041015625, + "learning_rate": 9.60649499476296e-05, + "loss": 0.7964, + "num_input_tokens_seen": 75435424, + "step": 129925 + }, + { + "epoch": 19.352100089365504, + "grad_norm": 0.050537109375, + "learning_rate": 9.584477676048753e-05, + "loss": 0.7927, + "num_input_tokens_seen": 75438336, + "step": 129930 + }, + { + "epoch": 19.352844801906464, + "grad_norm": 0.0272216796875, + "learning_rate": 9.562485536390052e-05, + "loss": 0.8039, + "num_input_tokens_seen": 75441024, + "step": 129935 + }, + { + "epoch": 19.353589514447425, + "grad_norm": 0.0419921875, + "learning_rate": 9.540518576158396e-05, + "loss": 0.7834, + "num_input_tokens_seen": 75444000, + "step": 129940 + }, + { + "epoch": 19.35433422698838, + "grad_norm": 0.060302734375, + "learning_rate": 9.51857679572482e-05, + "loss": 0.8099, + "num_input_tokens_seen": 75446944, + "step": 129945 + }, + { + "epoch": 19.355078939529342, + "grad_norm": 0.05615234375, + "learning_rate": 9.496660195460027e-05, + "loss": 0.8014, + "num_input_tokens_seen": 75449888, + "step": 129950 + }, + { + "epoch": 19.3558236520703, + "grad_norm": 0.050537109375, + "learning_rate": 9.474768775734055e-05, + "loss": 0.8048, + "num_input_tokens_seen": 75452768, + "step": 129955 + }, + { + "epoch": 19.35656836461126, + "grad_norm": 0.034423828125, + "learning_rate": 9.452902536917273e-05, + "loss": 0.7901, + "num_input_tokens_seen": 75455584, + "step": 129960 + }, + { + "epoch": 19.35731307715222, + "grad_norm": 0.0341796875, + "learning_rate": 9.431061479378722e-05, + "loss": 0.8077, + "num_input_tokens_seen": 75458176, + "step": 129965 + }, + { + "epoch": 19.358057789693177, + "grad_norm": 0.035400390625, + "learning_rate": 9.40924560348727e-05, + "loss": 0.7897, + "num_input_tokens_seen": 75461056, + "step": 129970 + }, + { + "epoch": 19.358802502234138, + "grad_norm": 0.052001953125, + "learning_rate": 9.387454909611792e-05, + "loss": 0.8092, + "num_input_tokens_seen": 75463808, + "step": 129975 + }, + { + "epoch": 19.359547214775098, + "grad_norm": 0.0654296875, + "learning_rate": 9.365689398120158e-05, + "loss": 0.8025, + "num_input_tokens_seen": 75467040, + "step": 129980 + }, + { + "epoch": 19.360291927316055, + "grad_norm": 0.1162109375, + "learning_rate": 9.343949069380242e-05, + "loss": 0.7914, + "num_input_tokens_seen": 75469824, + "step": 129985 + }, + { + "epoch": 19.361036639857016, + "grad_norm": 0.036865234375, + "learning_rate": 9.322233923759248e-05, + "loss": 0.7945, + "num_input_tokens_seen": 75472800, + "step": 129990 + }, + { + "epoch": 19.361781352397973, + "grad_norm": 0.05078125, + "learning_rate": 9.300543961624053e-05, + "loss": 0.7896, + "num_input_tokens_seen": 75475680, + "step": 129995 + }, + { + "epoch": 19.362526064938933, + "grad_norm": 0.11279296875, + "learning_rate": 9.278879183341193e-05, + "loss": 0.8081, + "num_input_tokens_seen": 75478368, + "step": 130000 + }, + { + "epoch": 19.363270777479894, + "grad_norm": 0.03955078125, + "learning_rate": 9.257239589276545e-05, + "loss": 0.7963, + "num_input_tokens_seen": 75481024, + "step": 130005 + }, + { + "epoch": 19.36401549002085, + "grad_norm": 0.09326171875, + "learning_rate": 9.235625179795648e-05, + "loss": 0.8168, + "num_input_tokens_seen": 75483936, + "step": 130010 + }, + { + "epoch": 19.36476020256181, + "grad_norm": 0.03759765625, + "learning_rate": 9.21403595526371e-05, + "loss": 0.7959, + "num_input_tokens_seen": 75487008, + "step": 130015 + }, + { + "epoch": 19.365504915102772, + "grad_norm": 0.04833984375, + "learning_rate": 9.192471916045608e-05, + "loss": 0.7671, + "num_input_tokens_seen": 75489824, + "step": 130020 + }, + { + "epoch": 19.36624962764373, + "grad_norm": 0.04296875, + "learning_rate": 9.170933062505382e-05, + "loss": 0.7967, + "num_input_tokens_seen": 75492640, + "step": 130025 + }, + { + "epoch": 19.36699434018469, + "grad_norm": 0.040771484375, + "learning_rate": 9.149419395006907e-05, + "loss": 0.7923, + "num_input_tokens_seen": 75495520, + "step": 130030 + }, + { + "epoch": 19.367739052725646, + "grad_norm": 0.050048828125, + "learning_rate": 9.127930913913895e-05, + "loss": 0.7899, + "num_input_tokens_seen": 75498784, + "step": 130035 + }, + { + "epoch": 19.368483765266607, + "grad_norm": 0.045654296875, + "learning_rate": 9.10646761958922e-05, + "loss": 0.8038, + "num_input_tokens_seen": 75501600, + "step": 130040 + }, + { + "epoch": 19.369228477807567, + "grad_norm": 0.07568359375, + "learning_rate": 9.085029512395593e-05, + "loss": 0.7885, + "num_input_tokens_seen": 75504736, + "step": 130045 + }, + { + "epoch": 19.369973190348524, + "grad_norm": 0.04833984375, + "learning_rate": 9.063616592695056e-05, + "loss": 0.7986, + "num_input_tokens_seen": 75507520, + "step": 130050 + }, + { + "epoch": 19.370717902889485, + "grad_norm": 0.06884765625, + "learning_rate": 9.042228860849322e-05, + "loss": 0.7954, + "num_input_tokens_seen": 75510400, + "step": 130055 + }, + { + "epoch": 19.371462615430445, + "grad_norm": 0.048583984375, + "learning_rate": 9.020866317219766e-05, + "loss": 0.8362, + "num_input_tokens_seen": 75513408, + "step": 130060 + }, + { + "epoch": 19.372207327971402, + "grad_norm": 0.0458984375, + "learning_rate": 8.999528962167435e-05, + "loss": 0.7962, + "num_input_tokens_seen": 75516256, + "step": 130065 + }, + { + "epoch": 19.372952040512363, + "grad_norm": 0.0654296875, + "learning_rate": 8.97821679605254e-05, + "loss": 0.8181, + "num_input_tokens_seen": 75519328, + "step": 130070 + }, + { + "epoch": 19.37369675305332, + "grad_norm": 0.0380859375, + "learning_rate": 8.956929819235293e-05, + "loss": 0.8134, + "num_input_tokens_seen": 75522528, + "step": 130075 + }, + { + "epoch": 19.37444146559428, + "grad_norm": 0.0439453125, + "learning_rate": 8.935668032075405e-05, + "loss": 0.7993, + "num_input_tokens_seen": 75525440, + "step": 130080 + }, + { + "epoch": 19.37518617813524, + "grad_norm": 0.06982421875, + "learning_rate": 8.914431434931924e-05, + "loss": 0.7964, + "num_input_tokens_seen": 75528288, + "step": 130085 + }, + { + "epoch": 19.375930890676198, + "grad_norm": 0.0439453125, + "learning_rate": 8.893220028163562e-05, + "loss": 0.8048, + "num_input_tokens_seen": 75531168, + "step": 130090 + }, + { + "epoch": 19.37667560321716, + "grad_norm": 0.044921875, + "learning_rate": 8.872033812128698e-05, + "loss": 0.7864, + "num_input_tokens_seen": 75533920, + "step": 130095 + }, + { + "epoch": 19.37742031575812, + "grad_norm": 0.04541015625, + "learning_rate": 8.850872787185548e-05, + "loss": 0.8123, + "num_input_tokens_seen": 75536736, + "step": 130100 + }, + { + "epoch": 19.378165028299076, + "grad_norm": 0.0263671875, + "learning_rate": 8.829736953691158e-05, + "loss": 0.8043, + "num_input_tokens_seen": 75539488, + "step": 130105 + }, + { + "epoch": 19.378909740840037, + "grad_norm": 0.0380859375, + "learning_rate": 8.80862631200291e-05, + "loss": 0.7939, + "num_input_tokens_seen": 75542656, + "step": 130110 + }, + { + "epoch": 19.379654453380994, + "grad_norm": 0.033935546875, + "learning_rate": 8.787540862477183e-05, + "loss": 0.7747, + "num_input_tokens_seen": 75545248, + "step": 130115 + }, + { + "epoch": 19.380399165921954, + "grad_norm": 0.029296875, + "learning_rate": 8.766480605470527e-05, + "loss": 0.8041, + "num_input_tokens_seen": 75547872, + "step": 130120 + }, + { + "epoch": 19.381143878462915, + "grad_norm": 0.0634765625, + "learning_rate": 8.745445541338492e-05, + "loss": 0.798, + "num_input_tokens_seen": 75550976, + "step": 130125 + }, + { + "epoch": 19.38188859100387, + "grad_norm": 0.04296875, + "learning_rate": 8.724435670436458e-05, + "loss": 0.793, + "num_input_tokens_seen": 75553920, + "step": 130130 + }, + { + "epoch": 19.382633303544832, + "grad_norm": 0.039794921875, + "learning_rate": 8.703450993119644e-05, + "loss": 0.8073, + "num_input_tokens_seen": 75557216, + "step": 130135 + }, + { + "epoch": 19.38337801608579, + "grad_norm": 0.038818359375, + "learning_rate": 8.682491509742096e-05, + "loss": 0.7957, + "num_input_tokens_seen": 75559904, + "step": 130140 + }, + { + "epoch": 19.38412272862675, + "grad_norm": 0.049560546875, + "learning_rate": 8.661557220658367e-05, + "loss": 0.7918, + "num_input_tokens_seen": 75562816, + "step": 130145 + }, + { + "epoch": 19.38486744116771, + "grad_norm": 0.061279296875, + "learning_rate": 8.640648126221672e-05, + "loss": 0.7783, + "num_input_tokens_seen": 75565600, + "step": 130150 + }, + { + "epoch": 19.385612153708667, + "grad_norm": 0.0361328125, + "learning_rate": 8.619764226785564e-05, + "loss": 0.7878, + "num_input_tokens_seen": 75568640, + "step": 130155 + }, + { + "epoch": 19.386356866249628, + "grad_norm": 0.033935546875, + "learning_rate": 8.59890552270276e-05, + "loss": 0.8185, + "num_input_tokens_seen": 75571712, + "step": 130160 + }, + { + "epoch": 19.38710157879059, + "grad_norm": 0.0281982421875, + "learning_rate": 8.578072014325644e-05, + "loss": 0.8013, + "num_input_tokens_seen": 75574592, + "step": 130165 + }, + { + "epoch": 19.387846291331545, + "grad_norm": 0.06640625, + "learning_rate": 8.557263702006268e-05, + "loss": 0.7943, + "num_input_tokens_seen": 75577376, + "step": 130170 + }, + { + "epoch": 19.388591003872506, + "grad_norm": 0.05712890625, + "learning_rate": 8.536480586096017e-05, + "loss": 0.801, + "num_input_tokens_seen": 75580064, + "step": 130175 + }, + { + "epoch": 19.389335716413463, + "grad_norm": 0.05517578125, + "learning_rate": 8.515722666946113e-05, + "loss": 0.7911, + "num_input_tokens_seen": 75582720, + "step": 130180 + }, + { + "epoch": 19.390080428954423, + "grad_norm": 0.0556640625, + "learning_rate": 8.494989944907105e-05, + "loss": 0.8094, + "num_input_tokens_seen": 75585760, + "step": 130185 + }, + { + "epoch": 19.390825141495384, + "grad_norm": 0.0478515625, + "learning_rate": 8.474282420329381e-05, + "loss": 0.7964, + "num_input_tokens_seen": 75588544, + "step": 130190 + }, + { + "epoch": 19.39156985403634, + "grad_norm": 0.03076171875, + "learning_rate": 8.453600093562829e-05, + "loss": 0.8145, + "num_input_tokens_seen": 75591360, + "step": 130195 + }, + { + "epoch": 19.3923145665773, + "grad_norm": 0.047119140625, + "learning_rate": 8.432942964956668e-05, + "loss": 0.7861, + "num_input_tokens_seen": 75594112, + "step": 130200 + }, + { + "epoch": 19.393059279118262, + "grad_norm": 0.03662109375, + "learning_rate": 8.412311034859954e-05, + "loss": 0.7966, + "num_input_tokens_seen": 75596896, + "step": 130205 + }, + { + "epoch": 19.39380399165922, + "grad_norm": 0.056884765625, + "learning_rate": 8.391704303621405e-05, + "loss": 0.8048, + "num_input_tokens_seen": 75599648, + "step": 130210 + }, + { + "epoch": 19.39454870420018, + "grad_norm": 0.0625, + "learning_rate": 8.371122771588913e-05, + "loss": 0.7753, + "num_input_tokens_seen": 75602400, + "step": 130215 + }, + { + "epoch": 19.395293416741136, + "grad_norm": 0.054931640625, + "learning_rate": 8.350566439110196e-05, + "loss": 0.8062, + "num_input_tokens_seen": 75605184, + "step": 130220 + }, + { + "epoch": 19.396038129282097, + "grad_norm": 0.060791015625, + "learning_rate": 8.330035306532812e-05, + "loss": 0.7944, + "num_input_tokens_seen": 75608224, + "step": 130225 + }, + { + "epoch": 19.396782841823057, + "grad_norm": 0.0303955078125, + "learning_rate": 8.309529374203317e-05, + "loss": 0.7892, + "num_input_tokens_seen": 75611136, + "step": 130230 + }, + { + "epoch": 19.397527554364014, + "grad_norm": 0.055419921875, + "learning_rate": 8.2890486424681e-05, + "loss": 0.8272, + "num_input_tokens_seen": 75614112, + "step": 130235 + }, + { + "epoch": 19.398272266904975, + "grad_norm": 0.0289306640625, + "learning_rate": 8.268593111673383e-05, + "loss": 0.7894, + "num_input_tokens_seen": 75616736, + "step": 130240 + }, + { + "epoch": 19.399016979445936, + "grad_norm": 0.0498046875, + "learning_rate": 8.248162782164725e-05, + "loss": 0.781, + "num_input_tokens_seen": 75619712, + "step": 130245 + }, + { + "epoch": 19.399761691986892, + "grad_norm": 0.031982421875, + "learning_rate": 8.227757654287349e-05, + "loss": 0.8142, + "num_input_tokens_seen": 75622560, + "step": 130250 + }, + { + "epoch": 19.400506404527853, + "grad_norm": 0.046142578125, + "learning_rate": 8.207377728385478e-05, + "loss": 0.8066, + "num_input_tokens_seen": 75625504, + "step": 130255 + }, + { + "epoch": 19.40125111706881, + "grad_norm": 0.07666015625, + "learning_rate": 8.187023004804005e-05, + "loss": 0.7961, + "num_input_tokens_seen": 75628096, + "step": 130260 + }, + { + "epoch": 19.40199582960977, + "grad_norm": 0.049072265625, + "learning_rate": 8.166693483886489e-05, + "loss": 0.7933, + "num_input_tokens_seen": 75630848, + "step": 130265 + }, + { + "epoch": 19.40274054215073, + "grad_norm": 0.048095703125, + "learning_rate": 8.146389165976487e-05, + "loss": 0.8066, + "num_input_tokens_seen": 75633664, + "step": 130270 + }, + { + "epoch": 19.403485254691688, + "grad_norm": 0.03662109375, + "learning_rate": 8.126110051417056e-05, + "loss": 0.7948, + "num_input_tokens_seen": 75636288, + "step": 130275 + }, + { + "epoch": 19.40422996723265, + "grad_norm": 0.0693359375, + "learning_rate": 8.10585614055076e-05, + "loss": 0.7985, + "num_input_tokens_seen": 75639328, + "step": 130280 + }, + { + "epoch": 19.40497467977361, + "grad_norm": 0.06005859375, + "learning_rate": 8.085627433719489e-05, + "loss": 0.7826, + "num_input_tokens_seen": 75642048, + "step": 130285 + }, + { + "epoch": 19.405719392314566, + "grad_norm": 0.045166015625, + "learning_rate": 8.06542393126547e-05, + "loss": 0.7999, + "num_input_tokens_seen": 75644608, + "step": 130290 + }, + { + "epoch": 19.406464104855527, + "grad_norm": 0.029541015625, + "learning_rate": 8.045245633529596e-05, + "loss": 0.8025, + "num_input_tokens_seen": 75647488, + "step": 130295 + }, + { + "epoch": 19.407208817396484, + "grad_norm": 0.07470703125, + "learning_rate": 8.025092540853096e-05, + "loss": 0.7861, + "num_input_tokens_seen": 75650560, + "step": 130300 + }, + { + "epoch": 19.407953529937444, + "grad_norm": 0.054931640625, + "learning_rate": 8.004964653576196e-05, + "loss": 0.8037, + "num_input_tokens_seen": 75653824, + "step": 130305 + }, + { + "epoch": 19.408698242478405, + "grad_norm": 0.040283203125, + "learning_rate": 7.984861972038959e-05, + "loss": 0.7967, + "num_input_tokens_seen": 75656864, + "step": 130310 + }, + { + "epoch": 19.40944295501936, + "grad_norm": 0.111328125, + "learning_rate": 7.964784496581112e-05, + "loss": 0.7875, + "num_input_tokens_seen": 75659616, + "step": 130315 + }, + { + "epoch": 19.410187667560322, + "grad_norm": 0.06494140625, + "learning_rate": 7.944732227541884e-05, + "loss": 0.803, + "num_input_tokens_seen": 75662240, + "step": 130320 + }, + { + "epoch": 19.41093238010128, + "grad_norm": 0.049072265625, + "learning_rate": 7.924705165259837e-05, + "loss": 0.7994, + "num_input_tokens_seen": 75665280, + "step": 130325 + }, + { + "epoch": 19.41167709264224, + "grad_norm": 0.07666015625, + "learning_rate": 7.904703310073535e-05, + "loss": 0.7877, + "num_input_tokens_seen": 75668000, + "step": 130330 + }, + { + "epoch": 19.4124218051832, + "grad_norm": 0.045166015625, + "learning_rate": 7.884726662320706e-05, + "loss": 0.7866, + "num_input_tokens_seen": 75670784, + "step": 130335 + }, + { + "epoch": 19.413166517724157, + "grad_norm": 0.08544921875, + "learning_rate": 7.864775222338915e-05, + "loss": 0.8006, + "num_input_tokens_seen": 75673792, + "step": 130340 + }, + { + "epoch": 19.413911230265118, + "grad_norm": 0.04443359375, + "learning_rate": 7.844848990465225e-05, + "loss": 0.7851, + "num_input_tokens_seen": 75676448, + "step": 130345 + }, + { + "epoch": 19.41465594280608, + "grad_norm": 0.0361328125, + "learning_rate": 7.8249479670362e-05, + "loss": 0.7935, + "num_input_tokens_seen": 75679424, + "step": 130350 + }, + { + "epoch": 19.415400655347035, + "grad_norm": 0.07421875, + "learning_rate": 7.805072152388071e-05, + "loss": 0.8068, + "num_input_tokens_seen": 75682208, + "step": 130355 + }, + { + "epoch": 19.416145367887996, + "grad_norm": 0.03857421875, + "learning_rate": 7.785221546856734e-05, + "loss": 0.8127, + "num_input_tokens_seen": 75685216, + "step": 130360 + }, + { + "epoch": 19.416890080428953, + "grad_norm": 0.03857421875, + "learning_rate": 7.765396150777426e-05, + "loss": 0.8256, + "num_input_tokens_seen": 75688000, + "step": 130365 + }, + { + "epoch": 19.417634792969913, + "grad_norm": 0.0537109375, + "learning_rate": 7.745595964485041e-05, + "loss": 0.8012, + "num_input_tokens_seen": 75690752, + "step": 130370 + }, + { + "epoch": 19.418379505510874, + "grad_norm": 0.05126953125, + "learning_rate": 7.725820988314313e-05, + "loss": 0.8008, + "num_input_tokens_seen": 75693376, + "step": 130375 + }, + { + "epoch": 19.41912421805183, + "grad_norm": 0.054443359375, + "learning_rate": 7.706071222598976e-05, + "loss": 0.7954, + "num_input_tokens_seen": 75696384, + "step": 130380 + }, + { + "epoch": 19.41986893059279, + "grad_norm": 0.0576171875, + "learning_rate": 7.686346667672927e-05, + "loss": 0.7771, + "num_input_tokens_seen": 75699072, + "step": 130385 + }, + { + "epoch": 19.420613643133752, + "grad_norm": 0.0439453125, + "learning_rate": 7.666647323869401e-05, + "loss": 0.7791, + "num_input_tokens_seen": 75701792, + "step": 130390 + }, + { + "epoch": 19.42135835567471, + "grad_norm": 0.05419921875, + "learning_rate": 7.646973191521133e-05, + "loss": 0.8005, + "num_input_tokens_seen": 75704416, + "step": 130395 + }, + { + "epoch": 19.42210306821567, + "grad_norm": 0.0419921875, + "learning_rate": 7.627324270960356e-05, + "loss": 0.794, + "num_input_tokens_seen": 75707200, + "step": 130400 + }, + { + "epoch": 19.422847780756626, + "grad_norm": 0.07763671875, + "learning_rate": 7.607700562519303e-05, + "loss": 0.7993, + "num_input_tokens_seen": 75710208, + "step": 130405 + }, + { + "epoch": 19.423592493297587, + "grad_norm": 0.09521484375, + "learning_rate": 7.588102066529212e-05, + "loss": 0.7877, + "num_input_tokens_seen": 75713024, + "step": 130410 + }, + { + "epoch": 19.424337205838548, + "grad_norm": 0.042724609375, + "learning_rate": 7.568528783321481e-05, + "loss": 0.7987, + "num_input_tokens_seen": 75715776, + "step": 130415 + }, + { + "epoch": 19.425081918379504, + "grad_norm": 0.03955078125, + "learning_rate": 7.548980713226517e-05, + "loss": 0.7782, + "num_input_tokens_seen": 75718720, + "step": 130420 + }, + { + "epoch": 19.425826630920465, + "grad_norm": 0.05078125, + "learning_rate": 7.529457856574717e-05, + "loss": 0.7983, + "num_input_tokens_seen": 75721568, + "step": 130425 + }, + { + "epoch": 19.426571343461426, + "grad_norm": 0.040771484375, + "learning_rate": 7.509960213695821e-05, + "loss": 0.7787, + "num_input_tokens_seen": 75724448, + "step": 130430 + }, + { + "epoch": 19.427316056002383, + "grad_norm": 0.0771484375, + "learning_rate": 7.49048778491923e-05, + "loss": 0.7833, + "num_input_tokens_seen": 75727424, + "step": 130435 + }, + { + "epoch": 19.428060768543343, + "grad_norm": 0.052001953125, + "learning_rate": 7.47104057057385e-05, + "loss": 0.7814, + "num_input_tokens_seen": 75730528, + "step": 130440 + }, + { + "epoch": 19.4288054810843, + "grad_norm": 0.036865234375, + "learning_rate": 7.451618570988416e-05, + "loss": 0.8045, + "num_input_tokens_seen": 75733472, + "step": 130445 + }, + { + "epoch": 19.42955019362526, + "grad_norm": 0.051513671875, + "learning_rate": 7.432221786491e-05, + "loss": 0.8056, + "num_input_tokens_seen": 75736800, + "step": 130450 + }, + { + "epoch": 19.43029490616622, + "grad_norm": 0.056884765625, + "learning_rate": 7.412850217409006e-05, + "loss": 0.7885, + "num_input_tokens_seen": 75740640, + "step": 130455 + }, + { + "epoch": 19.431039618707178, + "grad_norm": 0.0673828125, + "learning_rate": 7.393503864070006e-05, + "loss": 0.779, + "num_input_tokens_seen": 75743648, + "step": 130460 + }, + { + "epoch": 19.43178433124814, + "grad_norm": 0.08642578125, + "learning_rate": 7.374182726800904e-05, + "loss": 0.7993, + "num_input_tokens_seen": 75746368, + "step": 130465 + }, + { + "epoch": 19.432529043789096, + "grad_norm": 0.04833984375, + "learning_rate": 7.354886805927773e-05, + "loss": 0.7929, + "num_input_tokens_seen": 75749152, + "step": 130470 + }, + { + "epoch": 19.433273756330056, + "grad_norm": 0.0419921875, + "learning_rate": 7.335616101776854e-05, + "loss": 0.7873, + "num_input_tokens_seen": 75752352, + "step": 130475 + }, + { + "epoch": 19.434018468871017, + "grad_norm": 0.045654296875, + "learning_rate": 7.31637061467355e-05, + "loss": 0.8013, + "num_input_tokens_seen": 75755328, + "step": 130480 + }, + { + "epoch": 19.434763181411974, + "grad_norm": 0.052978515625, + "learning_rate": 7.297150344943103e-05, + "loss": 0.8024, + "num_input_tokens_seen": 75758368, + "step": 130485 + }, + { + "epoch": 19.435507893952934, + "grad_norm": 0.0244140625, + "learning_rate": 7.277955292910088e-05, + "loss": 0.8058, + "num_input_tokens_seen": 75761280, + "step": 130490 + }, + { + "epoch": 19.436252606493895, + "grad_norm": 0.03759765625, + "learning_rate": 7.258785458898908e-05, + "loss": 0.8141, + "num_input_tokens_seen": 75764192, + "step": 130495 + }, + { + "epoch": 19.43699731903485, + "grad_norm": 0.06591796875, + "learning_rate": 7.239640843233475e-05, + "loss": 0.8109, + "num_input_tokens_seen": 75766944, + "step": 130500 + }, + { + "epoch": 19.437742031575812, + "grad_norm": 0.047119140625, + "learning_rate": 7.220521446237194e-05, + "loss": 0.7903, + "num_input_tokens_seen": 75769888, + "step": 130505 + }, + { + "epoch": 19.43848674411677, + "grad_norm": 0.06591796875, + "learning_rate": 7.201427268232974e-05, + "loss": 0.7957, + "num_input_tokens_seen": 75772640, + "step": 130510 + }, + { + "epoch": 19.43923145665773, + "grad_norm": 0.058349609375, + "learning_rate": 7.182358309543224e-05, + "loss": 0.7872, + "num_input_tokens_seen": 75775456, + "step": 130515 + }, + { + "epoch": 19.43997616919869, + "grad_norm": 0.06689453125, + "learning_rate": 7.16331457049052e-05, + "loss": 0.7733, + "num_input_tokens_seen": 75778112, + "step": 130520 + }, + { + "epoch": 19.440720881739647, + "grad_norm": 0.046875, + "learning_rate": 7.14429605139627e-05, + "loss": 0.7928, + "num_input_tokens_seen": 75781056, + "step": 130525 + }, + { + "epoch": 19.441465594280608, + "grad_norm": 0.0269775390625, + "learning_rate": 7.125302752581885e-05, + "loss": 0.8163, + "num_input_tokens_seen": 75784128, + "step": 130530 + }, + { + "epoch": 19.44221030682157, + "grad_norm": 0.05419921875, + "learning_rate": 7.106334674368275e-05, + "loss": 0.8094, + "num_input_tokens_seen": 75786848, + "step": 130535 + }, + { + "epoch": 19.442955019362525, + "grad_norm": 0.046875, + "learning_rate": 7.08739181707585e-05, + "loss": 0.7964, + "num_input_tokens_seen": 75789984, + "step": 130540 + }, + { + "epoch": 19.443699731903486, + "grad_norm": 0.043212890625, + "learning_rate": 7.068474181024519e-05, + "loss": 0.8107, + "num_input_tokens_seen": 75792736, + "step": 130545 + }, + { + "epoch": 19.444444444444443, + "grad_norm": 0.044921875, + "learning_rate": 7.04958176653403e-05, + "loss": 0.8024, + "num_input_tokens_seen": 75795936, + "step": 130550 + }, + { + "epoch": 19.445189156985403, + "grad_norm": 0.04541015625, + "learning_rate": 7.030714573923458e-05, + "loss": 0.7945, + "num_input_tokens_seen": 75798848, + "step": 130555 + }, + { + "epoch": 19.445933869526364, + "grad_norm": 0.056884765625, + "learning_rate": 7.011872603511549e-05, + "loss": 0.8038, + "num_input_tokens_seen": 75801664, + "step": 130560 + }, + { + "epoch": 19.44667858206732, + "grad_norm": 0.08642578125, + "learning_rate": 6.993055855616714e-05, + "loss": 0.8189, + "num_input_tokens_seen": 75804352, + "step": 130565 + }, + { + "epoch": 19.44742329460828, + "grad_norm": 0.030029296875, + "learning_rate": 6.9742643305567e-05, + "loss": 0.8018, + "num_input_tokens_seen": 75807328, + "step": 130570 + }, + { + "epoch": 19.448168007149242, + "grad_norm": 0.036376953125, + "learning_rate": 6.955498028649088e-05, + "loss": 0.8078, + "num_input_tokens_seen": 75810368, + "step": 130575 + }, + { + "epoch": 19.4489127196902, + "grad_norm": 0.0556640625, + "learning_rate": 6.936756950210954e-05, + "loss": 0.8038, + "num_input_tokens_seen": 75813120, + "step": 130580 + }, + { + "epoch": 19.44965743223116, + "grad_norm": 0.046630859375, + "learning_rate": 6.918041095558713e-05, + "loss": 0.8102, + "num_input_tokens_seen": 75816096, + "step": 130585 + }, + { + "epoch": 19.450402144772116, + "grad_norm": 0.041015625, + "learning_rate": 6.899350465008613e-05, + "loss": 0.8123, + "num_input_tokens_seen": 75819136, + "step": 130590 + }, + { + "epoch": 19.451146857313077, + "grad_norm": 0.09765625, + "learning_rate": 6.880685058876567e-05, + "loss": 0.8016, + "num_input_tokens_seen": 75821824, + "step": 130595 + }, + { + "epoch": 19.451891569854038, + "grad_norm": 0.058349609375, + "learning_rate": 6.862044877477823e-05, + "loss": 0.7976, + "num_input_tokens_seen": 75824576, + "step": 130600 + }, + { + "epoch": 19.452636282394995, + "grad_norm": 0.11279296875, + "learning_rate": 6.843429921127298e-05, + "loss": 0.8095, + "num_input_tokens_seen": 75827680, + "step": 130605 + }, + { + "epoch": 19.453380994935955, + "grad_norm": 0.06640625, + "learning_rate": 6.824840190139403e-05, + "loss": 0.7837, + "num_input_tokens_seen": 75831040, + "step": 130610 + }, + { + "epoch": 19.454125707476916, + "grad_norm": 0.041748046875, + "learning_rate": 6.806275684828222e-05, + "loss": 0.794, + "num_input_tokens_seen": 75833888, + "step": 130615 + }, + { + "epoch": 19.454870420017873, + "grad_norm": 0.0260009765625, + "learning_rate": 6.787736405507505e-05, + "loss": 0.8072, + "num_input_tokens_seen": 75836864, + "step": 130620 + }, + { + "epoch": 19.455615132558833, + "grad_norm": 0.035400390625, + "learning_rate": 6.769222352490167e-05, + "loss": 0.7784, + "num_input_tokens_seen": 75839712, + "step": 130625 + }, + { + "epoch": 19.45635984509979, + "grad_norm": 0.051513671875, + "learning_rate": 6.750733526089292e-05, + "loss": 0.7981, + "num_input_tokens_seen": 75842752, + "step": 130630 + }, + { + "epoch": 19.45710455764075, + "grad_norm": 0.0380859375, + "learning_rate": 6.732269926616963e-05, + "loss": 0.8102, + "num_input_tokens_seen": 75845952, + "step": 130635 + }, + { + "epoch": 19.45784927018171, + "grad_norm": 0.06982421875, + "learning_rate": 6.713831554385429e-05, + "loss": 0.7992, + "num_input_tokens_seen": 75849088, + "step": 130640 + }, + { + "epoch": 19.458593982722668, + "grad_norm": 0.036865234375, + "learning_rate": 6.695418409705944e-05, + "loss": 0.8351, + "num_input_tokens_seen": 75852096, + "step": 130645 + }, + { + "epoch": 19.45933869526363, + "grad_norm": 0.040283203125, + "learning_rate": 6.677030492889591e-05, + "loss": 0.795, + "num_input_tokens_seen": 75854752, + "step": 130650 + }, + { + "epoch": 19.46008340780459, + "grad_norm": 0.04443359375, + "learning_rate": 6.65866780424712e-05, + "loss": 0.8003, + "num_input_tokens_seen": 75857888, + "step": 130655 + }, + { + "epoch": 19.460828120345546, + "grad_norm": 0.1328125, + "learning_rate": 6.640330344088618e-05, + "loss": 0.8403, + "num_input_tokens_seen": 75860608, + "step": 130660 + }, + { + "epoch": 19.461572832886507, + "grad_norm": 0.039794921875, + "learning_rate": 6.622018112724004e-05, + "loss": 0.8192, + "num_input_tokens_seen": 75863872, + "step": 130665 + }, + { + "epoch": 19.462317545427464, + "grad_norm": 0.03857421875, + "learning_rate": 6.603731110462529e-05, + "loss": 0.7937, + "num_input_tokens_seen": 75866848, + "step": 130670 + }, + { + "epoch": 19.463062257968424, + "grad_norm": 0.032958984375, + "learning_rate": 6.58546933761328e-05, + "loss": 0.8072, + "num_input_tokens_seen": 75869856, + "step": 130675 + }, + { + "epoch": 19.463806970509385, + "grad_norm": 0.03662109375, + "learning_rate": 6.567232794484678e-05, + "loss": 0.8098, + "num_input_tokens_seen": 75872480, + "step": 130680 + }, + { + "epoch": 19.464551683050342, + "grad_norm": 0.04052734375, + "learning_rate": 6.549021481384809e-05, + "loss": 0.8056, + "num_input_tokens_seen": 75875680, + "step": 130685 + }, + { + "epoch": 19.465296395591302, + "grad_norm": 0.03662109375, + "learning_rate": 6.53083539862126e-05, + "loss": 0.7664, + "num_input_tokens_seen": 75878560, + "step": 130690 + }, + { + "epoch": 19.46604110813226, + "grad_norm": 0.056396484375, + "learning_rate": 6.51267454650145e-05, + "loss": 0.7905, + "num_input_tokens_seen": 75881728, + "step": 130695 + }, + { + "epoch": 19.46678582067322, + "grad_norm": 0.03955078125, + "learning_rate": 6.494538925332138e-05, + "loss": 0.7843, + "num_input_tokens_seen": 75884896, + "step": 130700 + }, + { + "epoch": 19.46753053321418, + "grad_norm": 0.058837890625, + "learning_rate": 6.476428535419576e-05, + "loss": 0.7977, + "num_input_tokens_seen": 75887904, + "step": 130705 + }, + { + "epoch": 19.468275245755137, + "grad_norm": 0.0400390625, + "learning_rate": 6.458343377069687e-05, + "loss": 0.8199, + "num_input_tokens_seen": 75890560, + "step": 130710 + }, + { + "epoch": 19.469019958296098, + "grad_norm": 0.03173828125, + "learning_rate": 6.440283450588391e-05, + "loss": 0.7867, + "num_input_tokens_seen": 75893280, + "step": 130715 + }, + { + "epoch": 19.46976467083706, + "grad_norm": 0.0252685546875, + "learning_rate": 6.42224875628028e-05, + "loss": 0.8111, + "num_input_tokens_seen": 75896128, + "step": 130720 + }, + { + "epoch": 19.470509383378015, + "grad_norm": 0.08447265625, + "learning_rate": 6.404239294450442e-05, + "loss": 0.7974, + "num_input_tokens_seen": 75899168, + "step": 130725 + }, + { + "epoch": 19.471254095918976, + "grad_norm": 0.04248046875, + "learning_rate": 6.386255065402801e-05, + "loss": 0.8096, + "num_input_tokens_seen": 75902272, + "step": 130730 + }, + { + "epoch": 19.471998808459933, + "grad_norm": 0.047607421875, + "learning_rate": 6.368296069441447e-05, + "loss": 0.7988, + "num_input_tokens_seen": 75905216, + "step": 130735 + }, + { + "epoch": 19.472743521000893, + "grad_norm": 0.0849609375, + "learning_rate": 6.350362306869639e-05, + "loss": 0.7925, + "num_input_tokens_seen": 75908224, + "step": 130740 + }, + { + "epoch": 19.473488233541854, + "grad_norm": 0.064453125, + "learning_rate": 6.332453777990465e-05, + "loss": 0.7883, + "num_input_tokens_seen": 75910880, + "step": 130745 + }, + { + "epoch": 19.47423294608281, + "grad_norm": 0.035400390625, + "learning_rate": 6.314570483106352e-05, + "loss": 0.7838, + "num_input_tokens_seen": 75913888, + "step": 130750 + }, + { + "epoch": 19.47497765862377, + "grad_norm": 0.041259765625, + "learning_rate": 6.29671242251939e-05, + "loss": 0.8097, + "num_input_tokens_seen": 75917216, + "step": 130755 + }, + { + "epoch": 19.475722371164732, + "grad_norm": 0.055419921875, + "learning_rate": 6.278879596531339e-05, + "loss": 0.796, + "num_input_tokens_seen": 75920128, + "step": 130760 + }, + { + "epoch": 19.47646708370569, + "grad_norm": 0.06298828125, + "learning_rate": 6.261072005443457e-05, + "loss": 0.8068, + "num_input_tokens_seen": 75923296, + "step": 130765 + }, + { + "epoch": 19.47721179624665, + "grad_norm": 0.060546875, + "learning_rate": 6.243289649556671e-05, + "loss": 0.7978, + "num_input_tokens_seen": 75926208, + "step": 130770 + }, + { + "epoch": 19.477956508787607, + "grad_norm": 0.035888671875, + "learning_rate": 6.22553252917124e-05, + "loss": 0.8065, + "num_input_tokens_seen": 75929152, + "step": 130775 + }, + { + "epoch": 19.478701221328567, + "grad_norm": 0.056640625, + "learning_rate": 6.207800644587257e-05, + "loss": 0.7891, + "num_input_tokens_seen": 75931968, + "step": 130780 + }, + { + "epoch": 19.479445933869528, + "grad_norm": 0.039306640625, + "learning_rate": 6.190093996104318e-05, + "loss": 0.7926, + "num_input_tokens_seen": 75935072, + "step": 130785 + }, + { + "epoch": 19.480190646410485, + "grad_norm": 0.050048828125, + "learning_rate": 6.172412584021514e-05, + "loss": 0.7696, + "num_input_tokens_seen": 75937856, + "step": 130790 + }, + { + "epoch": 19.480935358951445, + "grad_norm": 0.05078125, + "learning_rate": 6.154756408637441e-05, + "loss": 0.8122, + "num_input_tokens_seen": 75940864, + "step": 130795 + }, + { + "epoch": 19.481680071492406, + "grad_norm": 0.045166015625, + "learning_rate": 6.137125470250525e-05, + "loss": 0.7993, + "num_input_tokens_seen": 75943488, + "step": 130800 + }, + { + "epoch": 19.482424784033363, + "grad_norm": 0.05517578125, + "learning_rate": 6.119519769158699e-05, + "loss": 0.8019, + "num_input_tokens_seen": 75946272, + "step": 130805 + }, + { + "epoch": 19.483169496574323, + "grad_norm": 0.061767578125, + "learning_rate": 6.1019393056590544e-05, + "loss": 0.7971, + "num_input_tokens_seen": 75949152, + "step": 130810 + }, + { + "epoch": 19.48391420911528, + "grad_norm": 0.0400390625, + "learning_rate": 6.084384080049021e-05, + "loss": 0.7886, + "num_input_tokens_seen": 75952064, + "step": 130815 + }, + { + "epoch": 19.48465892165624, + "grad_norm": 0.043212890625, + "learning_rate": 6.066854092624863e-05, + "loss": 0.803, + "num_input_tokens_seen": 75954944, + "step": 130820 + }, + { + "epoch": 19.4854036341972, + "grad_norm": 0.03564453125, + "learning_rate": 6.0493493436828417e-05, + "loss": 0.8, + "num_input_tokens_seen": 75957536, + "step": 130825 + }, + { + "epoch": 19.486148346738158, + "grad_norm": 0.05419921875, + "learning_rate": 6.031869833518555e-05, + "loss": 0.7803, + "num_input_tokens_seen": 75960640, + "step": 130830 + }, + { + "epoch": 19.48689305927912, + "grad_norm": 0.041748046875, + "learning_rate": 6.0144155624276e-05, + "loss": 0.7958, + "num_input_tokens_seen": 75963680, + "step": 130835 + }, + { + "epoch": 19.487637771820076, + "grad_norm": 0.04736328125, + "learning_rate": 5.996986530704573e-05, + "loss": 0.7862, + "num_input_tokens_seen": 75966496, + "step": 130840 + }, + { + "epoch": 19.488382484361036, + "grad_norm": 0.040771484375, + "learning_rate": 5.979582738644073e-05, + "loss": 0.7824, + "num_input_tokens_seen": 75969184, + "step": 130845 + }, + { + "epoch": 19.489127196901997, + "grad_norm": 0.038330078125, + "learning_rate": 5.9622041865401987e-05, + "loss": 0.7917, + "num_input_tokens_seen": 75972288, + "step": 130850 + }, + { + "epoch": 19.489871909442954, + "grad_norm": 0.03857421875, + "learning_rate": 5.944850874686213e-05, + "loss": 0.7973, + "num_input_tokens_seen": 75974976, + "step": 130855 + }, + { + "epoch": 19.490616621983914, + "grad_norm": 0.038818359375, + "learning_rate": 5.927522803375551e-05, + "loss": 0.7929, + "num_input_tokens_seen": 75977920, + "step": 130860 + }, + { + "epoch": 19.491361334524875, + "grad_norm": 0.057373046875, + "learning_rate": 5.9102199729009764e-05, + "loss": 0.7828, + "num_input_tokens_seen": 75980704, + "step": 130865 + }, + { + "epoch": 19.492106047065832, + "grad_norm": 0.05615234375, + "learning_rate": 5.892942383554589e-05, + "loss": 0.8, + "num_input_tokens_seen": 75983552, + "step": 130870 + }, + { + "epoch": 19.492850759606792, + "grad_norm": 0.053955078125, + "learning_rate": 5.875690035628489e-05, + "loss": 0.7986, + "num_input_tokens_seen": 75986880, + "step": 130875 + }, + { + "epoch": 19.49359547214775, + "grad_norm": 0.03955078125, + "learning_rate": 5.858462929413943e-05, + "loss": 0.8182, + "num_input_tokens_seen": 75989760, + "step": 130880 + }, + { + "epoch": 19.49434018468871, + "grad_norm": 0.0277099609375, + "learning_rate": 5.841261065202052e-05, + "loss": 0.8087, + "num_input_tokens_seen": 75992864, + "step": 130885 + }, + { + "epoch": 19.49508489722967, + "grad_norm": 0.056396484375, + "learning_rate": 5.8240844432835835e-05, + "loss": 0.7886, + "num_input_tokens_seen": 75995744, + "step": 130890 + }, + { + "epoch": 19.495829609770627, + "grad_norm": 0.0294189453125, + "learning_rate": 5.806933063948638e-05, + "loss": 0.8134, + "num_input_tokens_seen": 75998656, + "step": 130895 + }, + { + "epoch": 19.496574322311588, + "grad_norm": 0.0390625, + "learning_rate": 5.789806927486818e-05, + "loss": 0.8107, + "num_input_tokens_seen": 76001696, + "step": 130900 + }, + { + "epoch": 19.49731903485255, + "grad_norm": 0.041748046875, + "learning_rate": 5.772706034187558e-05, + "loss": 0.792, + "num_input_tokens_seen": 76004288, + "step": 130905 + }, + { + "epoch": 19.498063747393505, + "grad_norm": 0.057861328125, + "learning_rate": 5.7556303843397934e-05, + "loss": 0.7956, + "num_input_tokens_seen": 76007104, + "step": 130910 + }, + { + "epoch": 19.498808459934466, + "grad_norm": 0.052490234375, + "learning_rate": 5.738579978231961e-05, + "loss": 0.7916, + "num_input_tokens_seen": 76009760, + "step": 130915 + }, + { + "epoch": 19.499553172475423, + "grad_norm": 0.050537109375, + "learning_rate": 5.7215548161521634e-05, + "loss": 0.7873, + "num_input_tokens_seen": 76012384, + "step": 130920 + }, + { + "epoch": 19.500297885016384, + "grad_norm": 0.047119140625, + "learning_rate": 5.704554898387837e-05, + "loss": 0.818, + "num_input_tokens_seen": 76015392, + "step": 130925 + }, + { + "epoch": 19.501042597557344, + "grad_norm": 0.037353515625, + "learning_rate": 5.687580225226418e-05, + "loss": 0.7819, + "num_input_tokens_seen": 76018240, + "step": 130930 + }, + { + "epoch": 19.5017873100983, + "grad_norm": 0.032470703125, + "learning_rate": 5.6706307969546785e-05, + "loss": 0.8048, + "num_input_tokens_seen": 76021088, + "step": 130935 + }, + { + "epoch": 19.50253202263926, + "grad_norm": 0.0260009765625, + "learning_rate": 5.6537066138588886e-05, + "loss": 0.7852, + "num_input_tokens_seen": 76024000, + "step": 130940 + }, + { + "epoch": 19.503276735180222, + "grad_norm": 0.123046875, + "learning_rate": 5.6368076762248196e-05, + "loss": 0.801, + "num_input_tokens_seen": 76027136, + "step": 130945 + }, + { + "epoch": 19.50402144772118, + "grad_norm": 0.05712890625, + "learning_rate": 5.619933984338077e-05, + "loss": 0.7963, + "num_input_tokens_seen": 76030016, + "step": 130950 + }, + { + "epoch": 19.50476616026214, + "grad_norm": 0.0927734375, + "learning_rate": 5.603085538483765e-05, + "loss": 0.7764, + "num_input_tokens_seen": 76032800, + "step": 130955 + }, + { + "epoch": 19.505510872803097, + "grad_norm": 0.0517578125, + "learning_rate": 5.5862623389463234e-05, + "loss": 0.7991, + "num_input_tokens_seen": 76035616, + "step": 130960 + }, + { + "epoch": 19.506255585344057, + "grad_norm": 0.0478515625, + "learning_rate": 5.5694643860103585e-05, + "loss": 0.793, + "num_input_tokens_seen": 76038432, + "step": 130965 + }, + { + "epoch": 19.507000297885018, + "grad_norm": 0.044189453125, + "learning_rate": 5.5526916799593094e-05, + "loss": 0.7859, + "num_input_tokens_seen": 76041120, + "step": 130970 + }, + { + "epoch": 19.507745010425975, + "grad_norm": 0.045654296875, + "learning_rate": 5.5359442210766163e-05, + "loss": 0.7969, + "num_input_tokens_seen": 76043904, + "step": 130975 + }, + { + "epoch": 19.508489722966935, + "grad_norm": 0.03955078125, + "learning_rate": 5.5192220096453855e-05, + "loss": 0.7967, + "num_input_tokens_seen": 76046880, + "step": 130980 + }, + { + "epoch": 19.509234435507892, + "grad_norm": 0.058837890625, + "learning_rate": 5.5025250459478924e-05, + "loss": 0.7764, + "num_input_tokens_seen": 76049600, + "step": 130985 + }, + { + "epoch": 19.509979148048853, + "grad_norm": 0.0947265625, + "learning_rate": 5.4858533302662436e-05, + "loss": 0.7788, + "num_input_tokens_seen": 76052480, + "step": 130990 + }, + { + "epoch": 19.510723860589813, + "grad_norm": 0.037841796875, + "learning_rate": 5.469206862882214e-05, + "loss": 0.7893, + "num_input_tokens_seen": 76055552, + "step": 130995 + }, + { + "epoch": 19.51146857313077, + "grad_norm": 0.035888671875, + "learning_rate": 5.4525856440769124e-05, + "loss": 0.8087, + "num_input_tokens_seen": 76058464, + "step": 131000 + }, + { + "epoch": 19.51221328567173, + "grad_norm": 0.072265625, + "learning_rate": 5.4359896741312804e-05, + "loss": 0.8263, + "num_input_tokens_seen": 76061408, + "step": 131005 + }, + { + "epoch": 19.51295799821269, + "grad_norm": 0.0595703125, + "learning_rate": 5.4194189533254275e-05, + "loss": 0.8044, + "num_input_tokens_seen": 76064512, + "step": 131010 + }, + { + "epoch": 19.51370271075365, + "grad_norm": 0.05419921875, + "learning_rate": 5.4028734819396295e-05, + "loss": 0.7855, + "num_input_tokens_seen": 76067456, + "step": 131015 + }, + { + "epoch": 19.51444742329461, + "grad_norm": 0.09326171875, + "learning_rate": 5.3863532602531626e-05, + "loss": 0.7743, + "num_input_tokens_seen": 76070144, + "step": 131020 + }, + { + "epoch": 19.515192135835566, + "grad_norm": 0.059814453125, + "learning_rate": 5.369858288545304e-05, + "loss": 0.799, + "num_input_tokens_seen": 76072576, + "step": 131025 + }, + { + "epoch": 19.515936848376526, + "grad_norm": 0.0274658203125, + "learning_rate": 5.353388567094497e-05, + "loss": 0.7833, + "num_input_tokens_seen": 76075552, + "step": 131030 + }, + { + "epoch": 19.516681560917487, + "grad_norm": 0.04345703125, + "learning_rate": 5.336944096179186e-05, + "loss": 0.8025, + "num_input_tokens_seen": 76078080, + "step": 131035 + }, + { + "epoch": 19.517426273458444, + "grad_norm": 0.0478515625, + "learning_rate": 5.320524876076982e-05, + "loss": 0.7932, + "num_input_tokens_seen": 76081152, + "step": 131040 + }, + { + "epoch": 19.518170985999404, + "grad_norm": 0.06005859375, + "learning_rate": 5.304130907065496e-05, + "loss": 0.7928, + "num_input_tokens_seen": 76084192, + "step": 131045 + }, + { + "epoch": 19.518915698540365, + "grad_norm": 0.03271484375, + "learning_rate": 5.2877621894215075e-05, + "loss": 0.7981, + "num_input_tokens_seen": 76087776, + "step": 131050 + }, + { + "epoch": 19.519660411081322, + "grad_norm": 0.048095703125, + "learning_rate": 5.271418723421628e-05, + "loss": 0.8027, + "num_input_tokens_seen": 76090912, + "step": 131055 + }, + { + "epoch": 19.520405123622282, + "grad_norm": 0.04931640625, + "learning_rate": 5.2551005093419697e-05, + "loss": 0.7871, + "num_input_tokens_seen": 76093824, + "step": 131060 + }, + { + "epoch": 19.52114983616324, + "grad_norm": 0.047119140625, + "learning_rate": 5.238807547458146e-05, + "loss": 0.8118, + "num_input_tokens_seen": 76096736, + "step": 131065 + }, + { + "epoch": 19.5218945487042, + "grad_norm": 0.043701171875, + "learning_rate": 5.2225398380456033e-05, + "loss": 0.8319, + "num_input_tokens_seen": 76099552, + "step": 131070 + }, + { + "epoch": 19.52263926124516, + "grad_norm": 0.0478515625, + "learning_rate": 5.2062973813789546e-05, + "loss": 0.8269, + "num_input_tokens_seen": 76102336, + "step": 131075 + }, + { + "epoch": 19.523383973786117, + "grad_norm": 0.053466796875, + "learning_rate": 5.190080177732648e-05, + "loss": 0.801, + "num_input_tokens_seen": 76105024, + "step": 131080 + }, + { + "epoch": 19.524128686327078, + "grad_norm": 0.09814453125, + "learning_rate": 5.17388822738063e-05, + "loss": 0.7957, + "num_input_tokens_seen": 76107840, + "step": 131085 + }, + { + "epoch": 19.52487339886804, + "grad_norm": 0.03662109375, + "learning_rate": 5.157721530596515e-05, + "loss": 0.8122, + "num_input_tokens_seen": 76110944, + "step": 131090 + }, + { + "epoch": 19.525618111408996, + "grad_norm": 0.1005859375, + "learning_rate": 5.1415800876534187e-05, + "loss": 0.7871, + "num_input_tokens_seen": 76113824, + "step": 131095 + }, + { + "epoch": 19.526362823949956, + "grad_norm": 0.031982421875, + "learning_rate": 5.1254638988241227e-05, + "loss": 0.7989, + "num_input_tokens_seen": 76117120, + "step": 131100 + }, + { + "epoch": 19.527107536490913, + "grad_norm": 0.11767578125, + "learning_rate": 5.109372964380742e-05, + "loss": 0.7757, + "num_input_tokens_seen": 76119840, + "step": 131105 + }, + { + "epoch": 19.527852249031874, + "grad_norm": 0.048095703125, + "learning_rate": 5.093307284595061e-05, + "loss": 0.7879, + "num_input_tokens_seen": 76122784, + "step": 131110 + }, + { + "epoch": 19.528596961572834, + "grad_norm": 0.036865234375, + "learning_rate": 5.0772668597386935e-05, + "loss": 0.7931, + "num_input_tokens_seen": 76125408, + "step": 131115 + }, + { + "epoch": 19.52934167411379, + "grad_norm": 0.048583984375, + "learning_rate": 5.061251690082424e-05, + "loss": 0.7885, + "num_input_tokens_seen": 76128384, + "step": 131120 + }, + { + "epoch": 19.53008638665475, + "grad_norm": 0.0284423828125, + "learning_rate": 5.045261775897036e-05, + "loss": 0.7954, + "num_input_tokens_seen": 76131200, + "step": 131125 + }, + { + "epoch": 19.530831099195712, + "grad_norm": 0.029052734375, + "learning_rate": 5.02929711745248e-05, + "loss": 0.8041, + "num_input_tokens_seen": 76134112, + "step": 131130 + }, + { + "epoch": 19.53157581173667, + "grad_norm": 0.03515625, + "learning_rate": 5.013357715018707e-05, + "loss": 0.7929, + "num_input_tokens_seen": 76137248, + "step": 131135 + }, + { + "epoch": 19.53232052427763, + "grad_norm": 0.039794921875, + "learning_rate": 4.997443568864668e-05, + "loss": 0.8057, + "num_input_tokens_seen": 76140224, + "step": 131140 + }, + { + "epoch": 19.533065236818587, + "grad_norm": 0.037841796875, + "learning_rate": 4.981554679259315e-05, + "loss": 0.791, + "num_input_tokens_seen": 76143264, + "step": 131145 + }, + { + "epoch": 19.533809949359547, + "grad_norm": 0.035888671875, + "learning_rate": 4.9656910464712656e-05, + "loss": 0.7956, + "num_input_tokens_seen": 76146368, + "step": 131150 + }, + { + "epoch": 19.534554661900508, + "grad_norm": 0.05517578125, + "learning_rate": 4.949852670768306e-05, + "loss": 0.7954, + "num_input_tokens_seen": 76149024, + "step": 131155 + }, + { + "epoch": 19.535299374441465, + "grad_norm": 0.04833984375, + "learning_rate": 4.9340395524180565e-05, + "loss": 0.8006, + "num_input_tokens_seen": 76151904, + "step": 131160 + }, + { + "epoch": 19.536044086982425, + "grad_norm": 0.04052734375, + "learning_rate": 4.918251691687636e-05, + "loss": 0.7687, + "num_input_tokens_seen": 76154592, + "step": 131165 + }, + { + "epoch": 19.536788799523386, + "grad_norm": 0.044677734375, + "learning_rate": 4.9024890888439975e-05, + "loss": 0.8052, + "num_input_tokens_seen": 76157440, + "step": 131170 + }, + { + "epoch": 19.537533512064343, + "grad_norm": 0.06396484375, + "learning_rate": 4.8867517441530946e-05, + "loss": 0.7823, + "num_input_tokens_seen": 76160192, + "step": 131175 + }, + { + "epoch": 19.538278224605303, + "grad_norm": 0.039794921875, + "learning_rate": 4.871039657881049e-05, + "loss": 0.798, + "num_input_tokens_seen": 76163264, + "step": 131180 + }, + { + "epoch": 19.53902293714626, + "grad_norm": 0.06396484375, + "learning_rate": 4.8553528302931466e-05, + "loss": 0.7939, + "num_input_tokens_seen": 76166240, + "step": 131185 + }, + { + "epoch": 19.53976764968722, + "grad_norm": 0.032958984375, + "learning_rate": 4.839691261654511e-05, + "loss": 0.7969, + "num_input_tokens_seen": 76168992, + "step": 131190 + }, + { + "epoch": 19.54051236222818, + "grad_norm": 0.04345703125, + "learning_rate": 4.8240549522295946e-05, + "loss": 0.7869, + "num_input_tokens_seen": 76172000, + "step": 131195 + }, + { + "epoch": 19.54125707476914, + "grad_norm": 0.03271484375, + "learning_rate": 4.8084439022826884e-05, + "loss": 0.8034, + "num_input_tokens_seen": 76174880, + "step": 131200 + }, + { + "epoch": 19.5420017873101, + "grad_norm": 0.072265625, + "learning_rate": 4.79285811207758e-05, + "loss": 0.8132, + "num_input_tokens_seen": 76177856, + "step": 131205 + }, + { + "epoch": 19.542746499851056, + "grad_norm": 0.0517578125, + "learning_rate": 4.777297581877393e-05, + "loss": 0.7972, + "num_input_tokens_seen": 76180896, + "step": 131210 + }, + { + "epoch": 19.543491212392016, + "grad_norm": 0.039794921875, + "learning_rate": 4.7617623119450835e-05, + "loss": 0.7834, + "num_input_tokens_seen": 76183872, + "step": 131215 + }, + { + "epoch": 19.544235924932977, + "grad_norm": 0.1201171875, + "learning_rate": 4.746252302543108e-05, + "loss": 0.7926, + "num_input_tokens_seen": 76186656, + "step": 131220 + }, + { + "epoch": 19.544980637473934, + "grad_norm": 0.061767578125, + "learning_rate": 4.7307675539334236e-05, + "loss": 0.804, + "num_input_tokens_seen": 76189568, + "step": 131225 + }, + { + "epoch": 19.545725350014894, + "grad_norm": 0.05322265625, + "learning_rate": 4.715308066377821e-05, + "loss": 0.786, + "num_input_tokens_seen": 76192320, + "step": 131230 + }, + { + "epoch": 19.546470062555855, + "grad_norm": 0.044677734375, + "learning_rate": 4.6998738401372585e-05, + "loss": 0.7877, + "num_input_tokens_seen": 76194880, + "step": 131235 + }, + { + "epoch": 19.547214775096812, + "grad_norm": 0.03515625, + "learning_rate": 4.6844648754726936e-05, + "loss": 0.7968, + "num_input_tokens_seen": 76197696, + "step": 131240 + }, + { + "epoch": 19.547959487637772, + "grad_norm": 0.05029296875, + "learning_rate": 4.6690811726440847e-05, + "loss": 0.8083, + "num_input_tokens_seen": 76200416, + "step": 131245 + }, + { + "epoch": 19.54870420017873, + "grad_norm": 0.044677734375, + "learning_rate": 4.653722731911724e-05, + "loss": 0.7999, + "num_input_tokens_seen": 76203200, + "step": 131250 + }, + { + "epoch": 19.54944891271969, + "grad_norm": 0.04345703125, + "learning_rate": 4.638389553534905e-05, + "loss": 0.7861, + "num_input_tokens_seen": 76205984, + "step": 131255 + }, + { + "epoch": 19.55019362526065, + "grad_norm": 0.1337890625, + "learning_rate": 4.623081637772752e-05, + "loss": 0.8009, + "num_input_tokens_seen": 76208992, + "step": 131260 + }, + { + "epoch": 19.550938337801608, + "grad_norm": 0.059326171875, + "learning_rate": 4.607798984883726e-05, + "loss": 0.8239, + "num_input_tokens_seen": 76211776, + "step": 131265 + }, + { + "epoch": 19.551683050342568, + "grad_norm": 0.078125, + "learning_rate": 4.5925415951261205e-05, + "loss": 0.7906, + "num_input_tokens_seen": 76214688, + "step": 131270 + }, + { + "epoch": 19.55242776288353, + "grad_norm": 0.04443359375, + "learning_rate": 4.577309468757729e-05, + "loss": 0.798, + "num_input_tokens_seen": 76217632, + "step": 131275 + }, + { + "epoch": 19.553172475424486, + "grad_norm": 0.0576171875, + "learning_rate": 4.562102606035678e-05, + "loss": 0.7962, + "num_input_tokens_seen": 76220320, + "step": 131280 + }, + { + "epoch": 19.553917187965446, + "grad_norm": 0.0269775390625, + "learning_rate": 4.54692100721693e-05, + "loss": 0.8098, + "num_input_tokens_seen": 76223456, + "step": 131285 + }, + { + "epoch": 19.554661900506403, + "grad_norm": 0.0380859375, + "learning_rate": 4.5317646725581135e-05, + "loss": 0.7889, + "num_input_tokens_seen": 76225984, + "step": 131290 + }, + { + "epoch": 19.555406613047364, + "grad_norm": 0.03955078125, + "learning_rate": 4.5166336023153564e-05, + "loss": 0.8008, + "num_input_tokens_seen": 76228896, + "step": 131295 + }, + { + "epoch": 19.556151325588324, + "grad_norm": 0.0771484375, + "learning_rate": 4.5015277967439534e-05, + "loss": 0.7854, + "num_input_tokens_seen": 76231488, + "step": 131300 + }, + { + "epoch": 19.55689603812928, + "grad_norm": 0.035888671875, + "learning_rate": 4.486447256099368e-05, + "loss": 0.7928, + "num_input_tokens_seen": 76234560, + "step": 131305 + }, + { + "epoch": 19.55764075067024, + "grad_norm": 0.07958984375, + "learning_rate": 4.4713919806363966e-05, + "loss": 0.7985, + "num_input_tokens_seen": 76237760, + "step": 131310 + }, + { + "epoch": 19.558385463211202, + "grad_norm": 0.059326171875, + "learning_rate": 4.456361970609002e-05, + "loss": 0.7917, + "num_input_tokens_seen": 76240672, + "step": 131315 + }, + { + "epoch": 19.55913017575216, + "grad_norm": 0.03955078125, + "learning_rate": 4.4413572262714805e-05, + "loss": 0.7884, + "num_input_tokens_seen": 76243616, + "step": 131320 + }, + { + "epoch": 19.55987488829312, + "grad_norm": 0.19921875, + "learning_rate": 4.426377747877297e-05, + "loss": 0.7969, + "num_input_tokens_seen": 76246720, + "step": 131325 + }, + { + "epoch": 19.560619600834077, + "grad_norm": 0.04248046875, + "learning_rate": 4.411423535679415e-05, + "loss": 0.7839, + "num_input_tokens_seen": 76249664, + "step": 131330 + }, + { + "epoch": 19.561364313375037, + "grad_norm": 0.04052734375, + "learning_rate": 4.3964945899304665e-05, + "loss": 0.7891, + "num_input_tokens_seen": 76252768, + "step": 131335 + }, + { + "epoch": 19.562109025915998, + "grad_norm": 0.043212890625, + "learning_rate": 4.3815909108825824e-05, + "loss": 0.7818, + "num_input_tokens_seen": 76255552, + "step": 131340 + }, + { + "epoch": 19.562853738456955, + "grad_norm": 0.051025390625, + "learning_rate": 4.366712498787728e-05, + "loss": 0.7851, + "num_input_tokens_seen": 76258560, + "step": 131345 + }, + { + "epoch": 19.563598450997915, + "grad_norm": 0.039794921875, + "learning_rate": 4.3518593538972026e-05, + "loss": 0.7849, + "num_input_tokens_seen": 76261408, + "step": 131350 + }, + { + "epoch": 19.564343163538872, + "grad_norm": 0.048095703125, + "learning_rate": 4.3370314764618056e-05, + "loss": 0.8107, + "num_input_tokens_seen": 76264576, + "step": 131355 + }, + { + "epoch": 19.565087876079833, + "grad_norm": 0.047119140625, + "learning_rate": 4.3222288667321695e-05, + "loss": 0.7983, + "num_input_tokens_seen": 76267616, + "step": 131360 + }, + { + "epoch": 19.565832588620793, + "grad_norm": 0.05419921875, + "learning_rate": 4.307451524958428e-05, + "loss": 0.8085, + "num_input_tokens_seen": 76270368, + "step": 131365 + }, + { + "epoch": 19.56657730116175, + "grad_norm": 0.041259765625, + "learning_rate": 4.2926994513900474e-05, + "loss": 0.8108, + "num_input_tokens_seen": 76273216, + "step": 131370 + }, + { + "epoch": 19.56732201370271, + "grad_norm": 0.0498046875, + "learning_rate": 4.277972646276329e-05, + "loss": 0.7927, + "num_input_tokens_seen": 76276064, + "step": 131375 + }, + { + "epoch": 19.56806672624367, + "grad_norm": 0.039306640625, + "learning_rate": 4.26327110986624e-05, + "loss": 0.7903, + "num_input_tokens_seen": 76278688, + "step": 131380 + }, + { + "epoch": 19.56881143878463, + "grad_norm": 0.0478515625, + "learning_rate": 4.248594842407749e-05, + "loss": 0.7922, + "num_input_tokens_seen": 76281344, + "step": 131385 + }, + { + "epoch": 19.56955615132559, + "grad_norm": 0.041015625, + "learning_rate": 4.233943844149157e-05, + "loss": 0.8094, + "num_input_tokens_seen": 76284544, + "step": 131390 + }, + { + "epoch": 19.570300863866546, + "grad_norm": 0.03369140625, + "learning_rate": 4.2193181153377666e-05, + "loss": 0.8092, + "num_input_tokens_seen": 76287360, + "step": 131395 + }, + { + "epoch": 19.571045576407506, + "grad_norm": 0.038330078125, + "learning_rate": 4.2047176562208795e-05, + "loss": 0.7939, + "num_input_tokens_seen": 76290144, + "step": 131400 + }, + { + "epoch": 19.571790288948467, + "grad_norm": 0.06396484375, + "learning_rate": 4.190142467044966e-05, + "loss": 0.7869, + "num_input_tokens_seen": 76293056, + "step": 131405 + }, + { + "epoch": 19.572535001489424, + "grad_norm": 0.0289306640625, + "learning_rate": 4.175592548056495e-05, + "loss": 0.8099, + "num_input_tokens_seen": 76295936, + "step": 131410 + }, + { + "epoch": 19.573279714030384, + "grad_norm": 0.031982421875, + "learning_rate": 4.16106789950077e-05, + "loss": 0.7868, + "num_input_tokens_seen": 76298592, + "step": 131415 + }, + { + "epoch": 19.574024426571345, + "grad_norm": 0.03955078125, + "learning_rate": 4.146568521623761e-05, + "loss": 0.7867, + "num_input_tokens_seen": 76301408, + "step": 131420 + }, + { + "epoch": 19.574769139112302, + "grad_norm": 0.130859375, + "learning_rate": 4.1320944146701065e-05, + "loss": 0.8055, + "num_input_tokens_seen": 76304224, + "step": 131425 + }, + { + "epoch": 19.575513851653263, + "grad_norm": 0.039794921875, + "learning_rate": 4.117645578884277e-05, + "loss": 0.7956, + "num_input_tokens_seen": 76307072, + "step": 131430 + }, + { + "epoch": 19.57625856419422, + "grad_norm": 0.030517578125, + "learning_rate": 4.103222014510577e-05, + "loss": 0.7854, + "num_input_tokens_seen": 76310240, + "step": 131435 + }, + { + "epoch": 19.57700327673518, + "grad_norm": 0.031982421875, + "learning_rate": 4.088823721792478e-05, + "loss": 0.8, + "num_input_tokens_seen": 76312992, + "step": 131440 + }, + { + "epoch": 19.57774798927614, + "grad_norm": 0.0294189453125, + "learning_rate": 4.074450700973453e-05, + "loss": 0.8077, + "num_input_tokens_seen": 76316032, + "step": 131445 + }, + { + "epoch": 19.578492701817098, + "grad_norm": 0.03466796875, + "learning_rate": 4.060102952296141e-05, + "loss": 0.7957, + "num_input_tokens_seen": 76318976, + "step": 131450 + }, + { + "epoch": 19.579237414358058, + "grad_norm": 0.04931640625, + "learning_rate": 4.0457804760030136e-05, + "loss": 0.8013, + "num_input_tokens_seen": 76321728, + "step": 131455 + }, + { + "epoch": 19.57998212689902, + "grad_norm": 0.03662109375, + "learning_rate": 4.031483272335878e-05, + "loss": 0.7964, + "num_input_tokens_seen": 76324512, + "step": 131460 + }, + { + "epoch": 19.580726839439976, + "grad_norm": 0.051025390625, + "learning_rate": 4.017211341536542e-05, + "loss": 0.7896, + "num_input_tokens_seen": 76327648, + "step": 131465 + }, + { + "epoch": 19.581471551980936, + "grad_norm": 0.037353515625, + "learning_rate": 4.0029646838459774e-05, + "loss": 0.7839, + "num_input_tokens_seen": 76330336, + "step": 131470 + }, + { + "epoch": 19.582216264521893, + "grad_norm": 0.06494140625, + "learning_rate": 3.988743299504826e-05, + "loss": 0.7937, + "num_input_tokens_seen": 76333088, + "step": 131475 + }, + { + "epoch": 19.582960977062854, + "grad_norm": 0.042236328125, + "learning_rate": 3.9745471887533965e-05, + "loss": 0.786, + "num_input_tokens_seen": 76336288, + "step": 131480 + }, + { + "epoch": 19.583705689603814, + "grad_norm": 0.06005859375, + "learning_rate": 3.960376351831329e-05, + "loss": 0.7866, + "num_input_tokens_seen": 76339008, + "step": 131485 + }, + { + "epoch": 19.58445040214477, + "grad_norm": 0.049072265625, + "learning_rate": 3.946230788978433e-05, + "loss": 0.7968, + "num_input_tokens_seen": 76342080, + "step": 131490 + }, + { + "epoch": 19.58519511468573, + "grad_norm": 0.34375, + "learning_rate": 3.93211050043335e-05, + "loss": 0.7966, + "num_input_tokens_seen": 76344800, + "step": 131495 + }, + { + "epoch": 19.58593982722669, + "grad_norm": 0.03466796875, + "learning_rate": 3.9180154864347225e-05, + "loss": 0.7986, + "num_input_tokens_seen": 76347680, + "step": 131500 + }, + { + "epoch": 19.58668453976765, + "grad_norm": 0.03759765625, + "learning_rate": 3.9039457472206936e-05, + "loss": 0.8108, + "num_input_tokens_seen": 76350624, + "step": 131505 + }, + { + "epoch": 19.58742925230861, + "grad_norm": 0.03564453125, + "learning_rate": 3.8899012830287405e-05, + "loss": 0.7849, + "num_input_tokens_seen": 76353664, + "step": 131510 + }, + { + "epoch": 19.588173964849567, + "grad_norm": 0.0380859375, + "learning_rate": 3.875882094096505e-05, + "loss": 0.7906, + "num_input_tokens_seen": 76356608, + "step": 131515 + }, + { + "epoch": 19.588918677390527, + "grad_norm": 0.041748046875, + "learning_rate": 3.8618881806606326e-05, + "loss": 0.7754, + "num_input_tokens_seen": 76359328, + "step": 131520 + }, + { + "epoch": 19.589663389931488, + "grad_norm": 0.0458984375, + "learning_rate": 3.847919542957601e-05, + "loss": 0.8184, + "num_input_tokens_seen": 76362144, + "step": 131525 + }, + { + "epoch": 19.590408102472445, + "grad_norm": 0.0439453125, + "learning_rate": 3.833976181223386e-05, + "loss": 0.7985, + "num_input_tokens_seen": 76365408, + "step": 131530 + }, + { + "epoch": 19.591152815013405, + "grad_norm": 0.0458984375, + "learning_rate": 3.820058095693468e-05, + "loss": 0.788, + "num_input_tokens_seen": 76368128, + "step": 131535 + }, + { + "epoch": 19.591897527554362, + "grad_norm": 0.03564453125, + "learning_rate": 3.8061652866029915e-05, + "loss": 0.8037, + "num_input_tokens_seen": 76371200, + "step": 131540 + }, + { + "epoch": 19.592642240095323, + "grad_norm": 0.06396484375, + "learning_rate": 3.792297754186602e-05, + "loss": 0.7891, + "num_input_tokens_seen": 76373856, + "step": 131545 + }, + { + "epoch": 19.593386952636283, + "grad_norm": 0.0654296875, + "learning_rate": 3.7784554986787785e-05, + "loss": 0.8232, + "num_input_tokens_seen": 76376736, + "step": 131550 + }, + { + "epoch": 19.59413166517724, + "grad_norm": 0.05419921875, + "learning_rate": 3.764638520313168e-05, + "loss": 0.7996, + "num_input_tokens_seen": 76379744, + "step": 131555 + }, + { + "epoch": 19.5948763777182, + "grad_norm": 0.040283203125, + "learning_rate": 3.750846819323417e-05, + "loss": 0.791, + "num_input_tokens_seen": 76382336, + "step": 131560 + }, + { + "epoch": 19.59562109025916, + "grad_norm": 0.0252685546875, + "learning_rate": 3.737080395942172e-05, + "loss": 0.7846, + "num_input_tokens_seen": 76385216, + "step": 131565 + }, + { + "epoch": 19.59636580280012, + "grad_norm": 0.050537109375, + "learning_rate": 3.723339250402413e-05, + "loss": 0.7926, + "num_input_tokens_seen": 76387968, + "step": 131570 + }, + { + "epoch": 19.59711051534108, + "grad_norm": 0.04345703125, + "learning_rate": 3.709623382936122e-05, + "loss": 0.7984, + "num_input_tokens_seen": 76390816, + "step": 131575 + }, + { + "epoch": 19.597855227882036, + "grad_norm": 0.035400390625, + "learning_rate": 3.695932793774781e-05, + "loss": 0.7809, + "num_input_tokens_seen": 76393696, + "step": 131580 + }, + { + "epoch": 19.598599940422996, + "grad_norm": 0.07861328125, + "learning_rate": 3.682267483150037e-05, + "loss": 0.7935, + "num_input_tokens_seen": 76396544, + "step": 131585 + }, + { + "epoch": 19.599344652963957, + "grad_norm": 0.03076171875, + "learning_rate": 3.6686274512925386e-05, + "loss": 0.8054, + "num_input_tokens_seen": 76399456, + "step": 131590 + }, + { + "epoch": 19.600089365504914, + "grad_norm": 0.0242919921875, + "learning_rate": 3.655012698432769e-05, + "loss": 0.8127, + "num_input_tokens_seen": 76402176, + "step": 131595 + }, + { + "epoch": 19.600834078045875, + "grad_norm": 0.050537109375, + "learning_rate": 3.641423224800877e-05, + "loss": 0.7859, + "num_input_tokens_seen": 76404992, + "step": 131600 + }, + { + "epoch": 19.601578790586835, + "grad_norm": 0.03759765625, + "learning_rate": 3.627859030626179e-05, + "loss": 0.804, + "num_input_tokens_seen": 76407936, + "step": 131605 + }, + { + "epoch": 19.602323503127792, + "grad_norm": 0.06787109375, + "learning_rate": 3.614320116137992e-05, + "loss": 0.7947, + "num_input_tokens_seen": 76410528, + "step": 131610 + }, + { + "epoch": 19.603068215668753, + "grad_norm": 0.039306640625, + "learning_rate": 3.6008064815649665e-05, + "loss": 0.8407, + "num_input_tokens_seen": 76413056, + "step": 131615 + }, + { + "epoch": 19.60381292820971, + "grad_norm": 0.0277099609375, + "learning_rate": 3.587318127135419e-05, + "loss": 0.8119, + "num_input_tokens_seen": 76415776, + "step": 131620 + }, + { + "epoch": 19.60455764075067, + "grad_norm": 0.035400390625, + "learning_rate": 3.573855053077335e-05, + "loss": 0.8131, + "num_input_tokens_seen": 76418784, + "step": 131625 + }, + { + "epoch": 19.60530235329163, + "grad_norm": 0.059814453125, + "learning_rate": 3.5604172596180315e-05, + "loss": 0.7986, + "num_input_tokens_seen": 76421888, + "step": 131630 + }, + { + "epoch": 19.606047065832588, + "grad_norm": 0.051025390625, + "learning_rate": 3.547004746984495e-05, + "loss": 0.7951, + "num_input_tokens_seen": 76424768, + "step": 131635 + }, + { + "epoch": 19.606791778373548, + "grad_norm": 0.04541015625, + "learning_rate": 3.533617515403542e-05, + "loss": 0.7853, + "num_input_tokens_seen": 76427744, + "step": 131640 + }, + { + "epoch": 19.60753649091451, + "grad_norm": 0.054443359375, + "learning_rate": 3.520255565100994e-05, + "loss": 0.7903, + "num_input_tokens_seen": 76430496, + "step": 131645 + }, + { + "epoch": 19.608281203455466, + "grad_norm": 0.07080078125, + "learning_rate": 3.506918896302835e-05, + "loss": 0.8029, + "num_input_tokens_seen": 76433440, + "step": 131650 + }, + { + "epoch": 19.609025915996426, + "grad_norm": 0.080078125, + "learning_rate": 3.493607509234386e-05, + "loss": 0.7863, + "num_input_tokens_seen": 76436576, + "step": 131655 + }, + { + "epoch": 19.609770628537383, + "grad_norm": 0.0615234375, + "learning_rate": 3.4803214041204674e-05, + "loss": 0.7927, + "num_input_tokens_seen": 76439488, + "step": 131660 + }, + { + "epoch": 19.610515341078344, + "grad_norm": 0.054443359375, + "learning_rate": 3.4670605811855655e-05, + "loss": 0.7802, + "num_input_tokens_seen": 76442464, + "step": 131665 + }, + { + "epoch": 19.611260053619304, + "grad_norm": 0.03759765625, + "learning_rate": 3.453825040653502e-05, + "loss": 0.8119, + "num_input_tokens_seen": 76445664, + "step": 131670 + }, + { + "epoch": 19.61200476616026, + "grad_norm": 0.037353515625, + "learning_rate": 3.440614782748097e-05, + "loss": 0.8031, + "num_input_tokens_seen": 76448352, + "step": 131675 + }, + { + "epoch": 19.61274947870122, + "grad_norm": 0.043701171875, + "learning_rate": 3.427429807692506e-05, + "loss": 0.7914, + "num_input_tokens_seen": 76451200, + "step": 131680 + }, + { + "epoch": 19.613494191242182, + "grad_norm": 0.04296875, + "learning_rate": 3.414270115709383e-05, + "loss": 0.7966, + "num_input_tokens_seen": 76454240, + "step": 131685 + }, + { + "epoch": 19.61423890378314, + "grad_norm": 0.040283203125, + "learning_rate": 3.401135707021219e-05, + "loss": 0.8088, + "num_input_tokens_seen": 76456800, + "step": 131690 + }, + { + "epoch": 19.6149836163241, + "grad_norm": 0.032470703125, + "learning_rate": 3.388026581849668e-05, + "loss": 0.7885, + "num_input_tokens_seen": 76459808, + "step": 131695 + }, + { + "epoch": 19.615728328865057, + "grad_norm": 0.057373046875, + "learning_rate": 3.3749427404163864e-05, + "loss": 0.7992, + "num_input_tokens_seen": 76463168, + "step": 131700 + }, + { + "epoch": 19.616473041406017, + "grad_norm": 0.056884765625, + "learning_rate": 3.361884182942365e-05, + "loss": 0.8231, + "num_input_tokens_seen": 76466144, + "step": 131705 + }, + { + "epoch": 19.617217753946978, + "grad_norm": 0.0771484375, + "learning_rate": 3.348850909648093e-05, + "loss": 0.7858, + "num_input_tokens_seen": 76469280, + "step": 131710 + }, + { + "epoch": 19.617962466487935, + "grad_norm": 0.0537109375, + "learning_rate": 3.335842920754062e-05, + "loss": 0.7995, + "num_input_tokens_seen": 76471968, + "step": 131715 + }, + { + "epoch": 19.618707179028895, + "grad_norm": 0.06591796875, + "learning_rate": 3.3228602164795946e-05, + "loss": 0.7811, + "num_input_tokens_seen": 76474944, + "step": 131720 + }, + { + "epoch": 19.619451891569852, + "grad_norm": 0.05224609375, + "learning_rate": 3.30990279704435e-05, + "loss": 0.8109, + "num_input_tokens_seen": 76477888, + "step": 131725 + }, + { + "epoch": 19.620196604110813, + "grad_norm": 0.036865234375, + "learning_rate": 3.296970662667153e-05, + "loss": 0.8139, + "num_input_tokens_seen": 76480672, + "step": 131730 + }, + { + "epoch": 19.620941316651773, + "grad_norm": 0.05224609375, + "learning_rate": 3.284063813566495e-05, + "loss": 0.7852, + "num_input_tokens_seen": 76483552, + "step": 131735 + }, + { + "epoch": 19.62168602919273, + "grad_norm": 0.05126953125, + "learning_rate": 3.271182249960203e-05, + "loss": 0.8014, + "num_input_tokens_seen": 76486432, + "step": 131740 + }, + { + "epoch": 19.62243074173369, + "grad_norm": 0.053955078125, + "learning_rate": 3.258325972066267e-05, + "loss": 0.8098, + "num_input_tokens_seen": 76489504, + "step": 131745 + }, + { + "epoch": 19.62317545427465, + "grad_norm": 0.044921875, + "learning_rate": 3.245494980101515e-05, + "loss": 0.7826, + "num_input_tokens_seen": 76492128, + "step": 131750 + }, + { + "epoch": 19.62392016681561, + "grad_norm": 0.055419921875, + "learning_rate": 3.232689274283107e-05, + "loss": 0.8096, + "num_input_tokens_seen": 76494816, + "step": 131755 + }, + { + "epoch": 19.62466487935657, + "grad_norm": 0.052490234375, + "learning_rate": 3.219908854827036e-05, + "loss": 0.8123, + "num_input_tokens_seen": 76497888, + "step": 131760 + }, + { + "epoch": 19.625409591897526, + "grad_norm": 0.049560546875, + "learning_rate": 3.207153721949296e-05, + "loss": 0.7923, + "num_input_tokens_seen": 76500960, + "step": 131765 + }, + { + "epoch": 19.626154304438487, + "grad_norm": 0.09326171875, + "learning_rate": 3.194423875865548e-05, + "loss": 0.7903, + "num_input_tokens_seen": 76503808, + "step": 131770 + }, + { + "epoch": 19.626899016979447, + "grad_norm": 0.057373046875, + "learning_rate": 3.181719316790621e-05, + "loss": 0.792, + "num_input_tokens_seen": 76507008, + "step": 131775 + }, + { + "epoch": 19.627643729520404, + "grad_norm": 0.04296875, + "learning_rate": 3.1690400449393416e-05, + "loss": 0.8063, + "num_input_tokens_seen": 76510112, + "step": 131780 + }, + { + "epoch": 19.628388442061365, + "grad_norm": 0.07568359375, + "learning_rate": 3.1563860605257064e-05, + "loss": 0.8045, + "num_input_tokens_seen": 76513312, + "step": 131785 + }, + { + "epoch": 19.629133154602325, + "grad_norm": 0.05322265625, + "learning_rate": 3.1437573637637104e-05, + "loss": 0.8, + "num_input_tokens_seen": 76516096, + "step": 131790 + }, + { + "epoch": 19.629877867143282, + "grad_norm": 0.0712890625, + "learning_rate": 3.1311539548665165e-05, + "loss": 0.8193, + "num_input_tokens_seen": 76519008, + "step": 131795 + }, + { + "epoch": 19.630622579684243, + "grad_norm": 0.06494140625, + "learning_rate": 3.1185758340472876e-05, + "loss": 0.796, + "num_input_tokens_seen": 76521536, + "step": 131800 + }, + { + "epoch": 19.6313672922252, + "grad_norm": 0.059814453125, + "learning_rate": 3.106023001518188e-05, + "loss": 0.8027, + "num_input_tokens_seen": 76524352, + "step": 131805 + }, + { + "epoch": 19.63211200476616, + "grad_norm": 0.043212890625, + "learning_rate": 3.093495457491546e-05, + "loss": 0.7771, + "num_input_tokens_seen": 76527232, + "step": 131810 + }, + { + "epoch": 19.63285671730712, + "grad_norm": 0.04296875, + "learning_rate": 3.080993202178861e-05, + "loss": 0.7982, + "num_input_tokens_seen": 76530208, + "step": 131815 + }, + { + "epoch": 19.633601429848078, + "grad_norm": 0.03759765625, + "learning_rate": 3.0685162357912964e-05, + "loss": 0.7977, + "num_input_tokens_seen": 76533056, + "step": 131820 + }, + { + "epoch": 19.634346142389038, + "grad_norm": 0.060791015625, + "learning_rate": 3.056064558539851e-05, + "loss": 0.7866, + "num_input_tokens_seen": 76536000, + "step": 131825 + }, + { + "epoch": 19.63509085493, + "grad_norm": 0.05126953125, + "learning_rate": 3.0436381706348547e-05, + "loss": 0.786, + "num_input_tokens_seen": 76538720, + "step": 131830 + }, + { + "epoch": 19.635835567470956, + "grad_norm": 0.035400390625, + "learning_rate": 3.031237072285975e-05, + "loss": 0.8057, + "num_input_tokens_seen": 76541920, + "step": 131835 + }, + { + "epoch": 19.636580280011916, + "grad_norm": 0.058837890625, + "learning_rate": 3.018861263703043e-05, + "loss": 0.8175, + "num_input_tokens_seen": 76544992, + "step": 131840 + }, + { + "epoch": 19.637324992552873, + "grad_norm": 0.0888671875, + "learning_rate": 3.0065107450948923e-05, + "loss": 0.7949, + "num_input_tokens_seen": 76547808, + "step": 131845 + }, + { + "epoch": 19.638069705093834, + "grad_norm": 0.03662109375, + "learning_rate": 2.9941855166703557e-05, + "loss": 0.8091, + "num_input_tokens_seen": 76550880, + "step": 131850 + }, + { + "epoch": 19.638814417634794, + "grad_norm": 0.091796875, + "learning_rate": 2.981885578637433e-05, + "loss": 0.7755, + "num_input_tokens_seen": 76553664, + "step": 131855 + }, + { + "epoch": 19.63955913017575, + "grad_norm": 0.03759765625, + "learning_rate": 2.9696109312041252e-05, + "loss": 0.779, + "num_input_tokens_seen": 76556512, + "step": 131860 + }, + { + "epoch": 19.640303842716712, + "grad_norm": 0.050048828125, + "learning_rate": 2.957361574577766e-05, + "loss": 0.7892, + "num_input_tokens_seen": 76559264, + "step": 131865 + }, + { + "epoch": 19.64104855525767, + "grad_norm": 0.1796875, + "learning_rate": 2.9451375089651897e-05, + "loss": 0.8174, + "num_input_tokens_seen": 76562272, + "step": 131870 + }, + { + "epoch": 19.64179326779863, + "grad_norm": 0.059814453125, + "learning_rate": 2.9329387345730648e-05, + "loss": 0.7927, + "num_input_tokens_seen": 76565056, + "step": 131875 + }, + { + "epoch": 19.64253798033959, + "grad_norm": 0.03369140625, + "learning_rate": 2.920765251607227e-05, + "loss": 0.8041, + "num_input_tokens_seen": 76567776, + "step": 131880 + }, + { + "epoch": 19.643282692880547, + "grad_norm": 0.04833984375, + "learning_rate": 2.908617060273677e-05, + "loss": 0.7813, + "num_input_tokens_seen": 76570464, + "step": 131885 + }, + { + "epoch": 19.644027405421507, + "grad_norm": 0.06494140625, + "learning_rate": 2.8964941607774187e-05, + "loss": 0.781, + "num_input_tokens_seen": 76573088, + "step": 131890 + }, + { + "epoch": 19.644772117962468, + "grad_norm": 0.043701171875, + "learning_rate": 2.8843965533231206e-05, + "loss": 0.7948, + "num_input_tokens_seen": 76576128, + "step": 131895 + }, + { + "epoch": 19.645516830503425, + "grad_norm": 0.0380859375, + "learning_rate": 2.8723242381156198e-05, + "loss": 0.8216, + "num_input_tokens_seen": 76578976, + "step": 131900 + }, + { + "epoch": 19.646261543044385, + "grad_norm": 0.037841796875, + "learning_rate": 2.8602772153584197e-05, + "loss": 0.7744, + "num_input_tokens_seen": 76581696, + "step": 131905 + }, + { + "epoch": 19.647006255585342, + "grad_norm": 0.050537109375, + "learning_rate": 2.848255485255191e-05, + "loss": 0.7921, + "num_input_tokens_seen": 76584704, + "step": 131910 + }, + { + "epoch": 19.647750968126303, + "grad_norm": 0.0673828125, + "learning_rate": 2.836259048009271e-05, + "loss": 0.7968, + "num_input_tokens_seen": 76587872, + "step": 131915 + }, + { + "epoch": 19.648495680667263, + "grad_norm": 0.10986328125, + "learning_rate": 2.8242879038228307e-05, + "loss": 0.7907, + "num_input_tokens_seen": 76590784, + "step": 131920 + }, + { + "epoch": 19.64924039320822, + "grad_norm": 0.036865234375, + "learning_rate": 2.812342052898542e-05, + "loss": 0.785, + "num_input_tokens_seen": 76593632, + "step": 131925 + }, + { + "epoch": 19.64998510574918, + "grad_norm": 0.052734375, + "learning_rate": 2.8004214954380767e-05, + "loss": 0.7892, + "num_input_tokens_seen": 76596416, + "step": 131930 + }, + { + "epoch": 19.65072981829014, + "grad_norm": 0.03857421875, + "learning_rate": 2.788526231642774e-05, + "loss": 0.8034, + "num_input_tokens_seen": 76599648, + "step": 131935 + }, + { + "epoch": 19.6514745308311, + "grad_norm": 0.0576171875, + "learning_rate": 2.7766562617134726e-05, + "loss": 0.8077, + "num_input_tokens_seen": 76602784, + "step": 131940 + }, + { + "epoch": 19.65221924337206, + "grad_norm": 0.053466796875, + "learning_rate": 2.7648115858510123e-05, + "loss": 0.7923, + "num_input_tokens_seen": 76605536, + "step": 131945 + }, + { + "epoch": 19.652963955913016, + "grad_norm": 0.055908203125, + "learning_rate": 2.7529922042554e-05, + "loss": 0.8042, + "num_input_tokens_seen": 76608256, + "step": 131950 + }, + { + "epoch": 19.653708668453977, + "grad_norm": 0.03857421875, + "learning_rate": 2.7411981171259758e-05, + "loss": 0.7981, + "num_input_tokens_seen": 76611008, + "step": 131955 + }, + { + "epoch": 19.654453380994937, + "grad_norm": 0.03955078125, + "learning_rate": 2.72942932466258e-05, + "loss": 0.8045, + "num_input_tokens_seen": 76613952, + "step": 131960 + }, + { + "epoch": 19.655198093535894, + "grad_norm": 0.05029296875, + "learning_rate": 2.717685827063554e-05, + "loss": 0.7953, + "num_input_tokens_seen": 76616960, + "step": 131965 + }, + { + "epoch": 19.655942806076855, + "grad_norm": 0.051513671875, + "learning_rate": 2.7059676245274055e-05, + "loss": 0.7904, + "num_input_tokens_seen": 76619936, + "step": 131970 + }, + { + "epoch": 19.656687518617815, + "grad_norm": 0.060791015625, + "learning_rate": 2.6942747172523095e-05, + "loss": 0.7934, + "num_input_tokens_seen": 76622656, + "step": 131975 + }, + { + "epoch": 19.657432231158772, + "grad_norm": 0.0546875, + "learning_rate": 2.6826071054354417e-05, + "loss": 0.7876, + "num_input_tokens_seen": 76625536, + "step": 131980 + }, + { + "epoch": 19.658176943699733, + "grad_norm": 0.028564453125, + "learning_rate": 2.6709647892743105e-05, + "loss": 0.8014, + "num_input_tokens_seen": 76628512, + "step": 131985 + }, + { + "epoch": 19.65892165624069, + "grad_norm": 0.057373046875, + "learning_rate": 2.6593477689654255e-05, + "loss": 0.8084, + "num_input_tokens_seen": 76631456, + "step": 131990 + }, + { + "epoch": 19.65966636878165, + "grad_norm": 0.0419921875, + "learning_rate": 2.647756044704963e-05, + "loss": 0.7869, + "num_input_tokens_seen": 76634304, + "step": 131995 + }, + { + "epoch": 19.66041108132261, + "grad_norm": 0.02734375, + "learning_rate": 2.6361896166887664e-05, + "loss": 0.8025, + "num_input_tokens_seen": 76637280, + "step": 132000 + }, + { + "epoch": 19.661155793863568, + "grad_norm": 0.053955078125, + "learning_rate": 2.6246484851123462e-05, + "loss": 0.7931, + "num_input_tokens_seen": 76640160, + "step": 132005 + }, + { + "epoch": 19.66190050640453, + "grad_norm": 0.03857421875, + "learning_rate": 2.6131326501705464e-05, + "loss": 0.7942, + "num_input_tokens_seen": 76643168, + "step": 132010 + }, + { + "epoch": 19.662645218945485, + "grad_norm": 0.040283203125, + "learning_rate": 2.601642112058211e-05, + "loss": 0.8009, + "num_input_tokens_seen": 76645888, + "step": 132015 + }, + { + "epoch": 19.663389931486446, + "grad_norm": 0.04736328125, + "learning_rate": 2.5901768709690186e-05, + "loss": 0.8126, + "num_input_tokens_seen": 76648896, + "step": 132020 + }, + { + "epoch": 19.664134644027406, + "grad_norm": 0.039306640625, + "learning_rate": 2.5787369270969806e-05, + "loss": 0.7998, + "num_input_tokens_seen": 76652224, + "step": 132025 + }, + { + "epoch": 19.664879356568363, + "grad_norm": 0.0654296875, + "learning_rate": 2.5673222806352757e-05, + "loss": 0.8021, + "num_input_tokens_seen": 76655008, + "step": 132030 + }, + { + "epoch": 19.665624069109324, + "grad_norm": 0.057861328125, + "learning_rate": 2.5559329317767497e-05, + "loss": 0.8162, + "num_input_tokens_seen": 76657792, + "step": 132035 + }, + { + "epoch": 19.666368781650284, + "grad_norm": 0.04638671875, + "learning_rate": 2.544568880713749e-05, + "loss": 0.8248, + "num_input_tokens_seen": 76660512, + "step": 132040 + }, + { + "epoch": 19.66711349419124, + "grad_norm": 0.30859375, + "learning_rate": 2.533230127638286e-05, + "loss": 0.8092, + "num_input_tokens_seen": 76663488, + "step": 132045 + }, + { + "epoch": 19.667858206732202, + "grad_norm": 0.07861328125, + "learning_rate": 2.5219166727420417e-05, + "loss": 0.8225, + "num_input_tokens_seen": 76666144, + "step": 132050 + }, + { + "epoch": 19.66860291927316, + "grad_norm": 0.0478515625, + "learning_rate": 2.510628516216029e-05, + "loss": 0.8013, + "num_input_tokens_seen": 76668864, + "step": 132055 + }, + { + "epoch": 19.66934763181412, + "grad_norm": 0.04736328125, + "learning_rate": 2.499365658250929e-05, + "loss": 0.8133, + "num_input_tokens_seen": 76671936, + "step": 132060 + }, + { + "epoch": 19.67009234435508, + "grad_norm": 0.04052734375, + "learning_rate": 2.4881280990370902e-05, + "loss": 0.8022, + "num_input_tokens_seen": 76674880, + "step": 132065 + }, + { + "epoch": 19.670837056896037, + "grad_norm": 0.044921875, + "learning_rate": 2.476915838764193e-05, + "loss": 0.7997, + "num_input_tokens_seen": 76677600, + "step": 132070 + }, + { + "epoch": 19.671581769436997, + "grad_norm": 0.0751953125, + "learning_rate": 2.46572887762192e-05, + "loss": 0.8129, + "num_input_tokens_seen": 76680768, + "step": 132075 + }, + { + "epoch": 19.672326481977958, + "grad_norm": 0.051025390625, + "learning_rate": 2.4545672157991192e-05, + "loss": 0.8127, + "num_input_tokens_seen": 76683424, + "step": 132080 + }, + { + "epoch": 19.673071194518915, + "grad_norm": 0.08251953125, + "learning_rate": 2.4434308534843074e-05, + "loss": 0.798, + "num_input_tokens_seen": 76686432, + "step": 132085 + }, + { + "epoch": 19.673815907059875, + "grad_norm": 0.037109375, + "learning_rate": 2.4323197908658334e-05, + "loss": 0.8101, + "num_input_tokens_seen": 76689120, + "step": 132090 + }, + { + "epoch": 19.674560619600832, + "grad_norm": 0.064453125, + "learning_rate": 2.421234028131047e-05, + "loss": 0.794, + "num_input_tokens_seen": 76692096, + "step": 132095 + }, + { + "epoch": 19.675305332141793, + "grad_norm": 0.07861328125, + "learning_rate": 2.4101735654676324e-05, + "loss": 0.7861, + "num_input_tokens_seen": 76695040, + "step": 132100 + }, + { + "epoch": 19.676050044682754, + "grad_norm": 0.06298828125, + "learning_rate": 2.3991384030621066e-05, + "loss": 0.7856, + "num_input_tokens_seen": 76697952, + "step": 132105 + }, + { + "epoch": 19.67679475722371, + "grad_norm": 0.076171875, + "learning_rate": 2.388128541101153e-05, + "loss": 0.7895, + "num_input_tokens_seen": 76700512, + "step": 132110 + }, + { + "epoch": 19.67753946976467, + "grad_norm": 0.0419921875, + "learning_rate": 2.377143979770624e-05, + "loss": 0.7897, + "num_input_tokens_seen": 76703680, + "step": 132115 + }, + { + "epoch": 19.67828418230563, + "grad_norm": 0.05712890625, + "learning_rate": 2.366184719256037e-05, + "loss": 0.7806, + "num_input_tokens_seen": 76706400, + "step": 132120 + }, + { + "epoch": 19.67902889484659, + "grad_norm": 0.055419921875, + "learning_rate": 2.355250759742744e-05, + "loss": 0.8163, + "num_input_tokens_seen": 76709344, + "step": 132125 + }, + { + "epoch": 19.67977360738755, + "grad_norm": 0.09326171875, + "learning_rate": 2.3443421014154307e-05, + "loss": 0.7931, + "num_input_tokens_seen": 76712352, + "step": 132130 + }, + { + "epoch": 19.680518319928506, + "grad_norm": 0.03125, + "learning_rate": 2.3334587444581166e-05, + "loss": 0.7902, + "num_input_tokens_seen": 76715264, + "step": 132135 + }, + { + "epoch": 19.681263032469467, + "grad_norm": 0.0556640625, + "learning_rate": 2.3226006890549875e-05, + "loss": 0.8062, + "num_input_tokens_seen": 76718272, + "step": 132140 + }, + { + "epoch": 19.682007745010427, + "grad_norm": 0.0301513671875, + "learning_rate": 2.3117679353892307e-05, + "loss": 0.7842, + "num_input_tokens_seen": 76721408, + "step": 132145 + }, + { + "epoch": 19.682752457551384, + "grad_norm": 0.049560546875, + "learning_rate": 2.300960483644032e-05, + "loss": 0.7973, + "num_input_tokens_seen": 76724064, + "step": 132150 + }, + { + "epoch": 19.683497170092345, + "grad_norm": 0.034912109375, + "learning_rate": 2.2901783340019132e-05, + "loss": 0.7914, + "num_input_tokens_seen": 76726880, + "step": 132155 + }, + { + "epoch": 19.684241882633305, + "grad_norm": 0.07470703125, + "learning_rate": 2.2794214866450613e-05, + "loss": 0.7803, + "num_input_tokens_seen": 76729728, + "step": 132160 + }, + { + "epoch": 19.684986595174262, + "grad_norm": 0.05029296875, + "learning_rate": 2.2686899417551642e-05, + "loss": 0.8011, + "num_input_tokens_seen": 76732320, + "step": 132165 + }, + { + "epoch": 19.685731307715223, + "grad_norm": 0.06494140625, + "learning_rate": 2.257983699513577e-05, + "loss": 0.7909, + "num_input_tokens_seen": 76735424, + "step": 132170 + }, + { + "epoch": 19.68647602025618, + "grad_norm": 0.33203125, + "learning_rate": 2.2473027601009887e-05, + "loss": 0.8208, + "num_input_tokens_seen": 76738240, + "step": 132175 + }, + { + "epoch": 19.68722073279714, + "grad_norm": 0.05908203125, + "learning_rate": 2.2366471236980876e-05, + "loss": 0.7886, + "num_input_tokens_seen": 76741216, + "step": 132180 + }, + { + "epoch": 19.6879654453381, + "grad_norm": 0.050048828125, + "learning_rate": 2.2260167904847304e-05, + "loss": 0.8051, + "num_input_tokens_seen": 76743872, + "step": 132185 + }, + { + "epoch": 19.688710157879058, + "grad_norm": 0.055419921875, + "learning_rate": 2.21541176064044e-05, + "loss": 0.7857, + "num_input_tokens_seen": 76746752, + "step": 132190 + }, + { + "epoch": 19.68945487042002, + "grad_norm": 0.052734375, + "learning_rate": 2.2048320343445726e-05, + "loss": 0.7932, + "num_input_tokens_seen": 76749696, + "step": 132195 + }, + { + "epoch": 19.69019958296098, + "grad_norm": 0.03662109375, + "learning_rate": 2.194277611775819e-05, + "loss": 0.7979, + "num_input_tokens_seen": 76753024, + "step": 132200 + }, + { + "epoch": 19.690944295501936, + "grad_norm": 0.038330078125, + "learning_rate": 2.1837484931123697e-05, + "loss": 0.8052, + "num_input_tokens_seen": 76756160, + "step": 132205 + }, + { + "epoch": 19.691689008042896, + "grad_norm": 0.06396484375, + "learning_rate": 2.173244678532249e-05, + "loss": 0.7876, + "num_input_tokens_seen": 76759168, + "step": 132210 + }, + { + "epoch": 19.692433720583853, + "grad_norm": 0.03564453125, + "learning_rate": 2.162766168212815e-05, + "loss": 0.7938, + "num_input_tokens_seen": 76761760, + "step": 132215 + }, + { + "epoch": 19.693178433124814, + "grad_norm": 0.040283203125, + "learning_rate": 2.1523129623310932e-05, + "loss": 0.7989, + "num_input_tokens_seen": 76764672, + "step": 132220 + }, + { + "epoch": 19.693923145665774, + "grad_norm": 0.0234375, + "learning_rate": 2.141885061063775e-05, + "loss": 0.7875, + "num_input_tokens_seen": 76767328, + "step": 132225 + }, + { + "epoch": 19.69466785820673, + "grad_norm": 0.03955078125, + "learning_rate": 2.1314824645868867e-05, + "loss": 0.8013, + "num_input_tokens_seen": 76770016, + "step": 132230 + }, + { + "epoch": 19.695412570747692, + "grad_norm": 0.0634765625, + "learning_rate": 2.1211051730762876e-05, + "loss": 0.8032, + "num_input_tokens_seen": 76773312, + "step": 132235 + }, + { + "epoch": 19.69615728328865, + "grad_norm": 0.0751953125, + "learning_rate": 2.1107531867071703e-05, + "loss": 0.8172, + "num_input_tokens_seen": 76776512, + "step": 132240 + }, + { + "epoch": 19.69690199582961, + "grad_norm": 0.0556640625, + "learning_rate": 2.100426505654562e-05, + "loss": 0.8071, + "num_input_tokens_seen": 76779264, + "step": 132245 + }, + { + "epoch": 19.69764670837057, + "grad_norm": 0.0556640625, + "learning_rate": 2.090125130092657e-05, + "loss": 0.7977, + "num_input_tokens_seen": 76782240, + "step": 132250 + }, + { + "epoch": 19.698391420911527, + "grad_norm": 0.03955078125, + "learning_rate": 2.0798490601958154e-05, + "loss": 0.8103, + "num_input_tokens_seen": 76785056, + "step": 132255 + }, + { + "epoch": 19.699136133452487, + "grad_norm": 0.046142578125, + "learning_rate": 2.0695982961375646e-05, + "loss": 0.7941, + "num_input_tokens_seen": 76787744, + "step": 132260 + }, + { + "epoch": 19.699880845993448, + "grad_norm": 0.036376953125, + "learning_rate": 2.0593728380911003e-05, + "loss": 0.8014, + "num_input_tokens_seen": 76790976, + "step": 132265 + }, + { + "epoch": 19.700625558534405, + "grad_norm": 0.029296875, + "learning_rate": 2.0491726862289505e-05, + "loss": 0.7889, + "num_input_tokens_seen": 76793760, + "step": 132270 + }, + { + "epoch": 19.701370271075366, + "grad_norm": 0.041015625, + "learning_rate": 2.0389978407234775e-05, + "loss": 0.8015, + "num_input_tokens_seen": 76796640, + "step": 132275 + }, + { + "epoch": 19.702114983616323, + "grad_norm": 0.04541015625, + "learning_rate": 2.028848301746877e-05, + "loss": 0.8146, + "num_input_tokens_seen": 76799872, + "step": 132280 + }, + { + "epoch": 19.702859696157283, + "grad_norm": 0.0458984375, + "learning_rate": 2.0187240694703455e-05, + "loss": 0.8047, + "num_input_tokens_seen": 76802656, + "step": 132285 + }, + { + "epoch": 19.703604408698244, + "grad_norm": 0.06884765625, + "learning_rate": 2.0086251440649128e-05, + "loss": 0.8068, + "num_input_tokens_seen": 76805568, + "step": 132290 + }, + { + "epoch": 19.7043491212392, + "grad_norm": 0.052490234375, + "learning_rate": 1.9985515257012752e-05, + "loss": 0.8089, + "num_input_tokens_seen": 76808480, + "step": 132295 + }, + { + "epoch": 19.70509383378016, + "grad_norm": 0.0517578125, + "learning_rate": 1.988503214549797e-05, + "loss": 0.8217, + "num_input_tokens_seen": 76811328, + "step": 132300 + }, + { + "epoch": 19.70583854632112, + "grad_norm": 0.042724609375, + "learning_rate": 1.9784802107798427e-05, + "loss": 0.8218, + "num_input_tokens_seen": 76814400, + "step": 132305 + }, + { + "epoch": 19.70658325886208, + "grad_norm": 0.04833984375, + "learning_rate": 1.9684825145611096e-05, + "loss": 0.8165, + "num_input_tokens_seen": 76817088, + "step": 132310 + }, + { + "epoch": 19.70732797140304, + "grad_norm": 0.0517578125, + "learning_rate": 1.9585101260621297e-05, + "loss": 0.7985, + "num_input_tokens_seen": 76819840, + "step": 132315 + }, + { + "epoch": 19.708072683943996, + "grad_norm": 0.036376953125, + "learning_rate": 1.9485630454517677e-05, + "loss": 0.8068, + "num_input_tokens_seen": 76822880, + "step": 132320 + }, + { + "epoch": 19.708817396484957, + "grad_norm": 0.0498046875, + "learning_rate": 1.938641272897723e-05, + "loss": 0.7961, + "num_input_tokens_seen": 76826176, + "step": 132325 + }, + { + "epoch": 19.709562109025917, + "grad_norm": 0.04248046875, + "learning_rate": 1.9287448085678615e-05, + "loss": 0.7964, + "num_input_tokens_seen": 76829184, + "step": 132330 + }, + { + "epoch": 19.710306821566874, + "grad_norm": 0.05029296875, + "learning_rate": 1.9188736526293825e-05, + "loss": 0.795, + "num_input_tokens_seen": 76832064, + "step": 132335 + }, + { + "epoch": 19.711051534107835, + "grad_norm": 0.044189453125, + "learning_rate": 1.9090278052488195e-05, + "loss": 0.7929, + "num_input_tokens_seen": 76835040, + "step": 132340 + }, + { + "epoch": 19.711796246648795, + "grad_norm": 0.09619140625, + "learning_rate": 1.899207266592706e-05, + "loss": 0.7976, + "num_input_tokens_seen": 76837856, + "step": 132345 + }, + { + "epoch": 19.712540959189752, + "grad_norm": 0.059814453125, + "learning_rate": 1.889412036826743e-05, + "loss": 0.7883, + "num_input_tokens_seen": 76840672, + "step": 132350 + }, + { + "epoch": 19.713285671730713, + "grad_norm": 0.043701171875, + "learning_rate": 1.879642116116631e-05, + "loss": 0.7955, + "num_input_tokens_seen": 76843520, + "step": 132355 + }, + { + "epoch": 19.71403038427167, + "grad_norm": 0.07470703125, + "learning_rate": 1.8698975046274047e-05, + "loss": 0.7925, + "num_input_tokens_seen": 76846528, + "step": 132360 + }, + { + "epoch": 19.71477509681263, + "grad_norm": 0.034423828125, + "learning_rate": 1.860178202523599e-05, + "loss": 0.7984, + "num_input_tokens_seen": 76849344, + "step": 132365 + }, + { + "epoch": 19.71551980935359, + "grad_norm": 0.095703125, + "learning_rate": 1.8504842099694163e-05, + "loss": 0.807, + "num_input_tokens_seen": 76852704, + "step": 132370 + }, + { + "epoch": 19.716264521894548, + "grad_norm": 0.03857421875, + "learning_rate": 1.8408155271287253e-05, + "loss": 0.797, + "num_input_tokens_seen": 76855424, + "step": 132375 + }, + { + "epoch": 19.71700923443551, + "grad_norm": 0.046630859375, + "learning_rate": 1.8311721541647284e-05, + "loss": 0.7989, + "num_input_tokens_seen": 76858496, + "step": 132380 + }, + { + "epoch": 19.717753946976465, + "grad_norm": 0.04248046875, + "learning_rate": 1.8215540912404627e-05, + "loss": 0.7834, + "num_input_tokens_seen": 76860960, + "step": 132385 + }, + { + "epoch": 19.718498659517426, + "grad_norm": 0.035400390625, + "learning_rate": 1.8119613385182975e-05, + "loss": 0.7986, + "num_input_tokens_seen": 76863904, + "step": 132390 + }, + { + "epoch": 19.719243372058386, + "grad_norm": 0.059814453125, + "learning_rate": 1.8023938961604368e-05, + "loss": 0.7848, + "num_input_tokens_seen": 76866688, + "step": 132395 + }, + { + "epoch": 19.719988084599343, + "grad_norm": 0.057861328125, + "learning_rate": 1.7928517643282515e-05, + "loss": 0.7828, + "num_input_tokens_seen": 76869728, + "step": 132400 + }, + { + "epoch": 19.720732797140304, + "grad_norm": 0.041015625, + "learning_rate": 1.783334943183279e-05, + "loss": 0.7998, + "num_input_tokens_seen": 76872864, + "step": 132405 + }, + { + "epoch": 19.721477509681264, + "grad_norm": 0.0732421875, + "learning_rate": 1.7738434328860575e-05, + "loss": 0.7935, + "num_input_tokens_seen": 76875840, + "step": 132410 + }, + { + "epoch": 19.72222222222222, + "grad_norm": 0.039306640625, + "learning_rate": 1.7643772335971253e-05, + "loss": 0.8105, + "num_input_tokens_seen": 76878848, + "step": 132415 + }, + { + "epoch": 19.722966934763182, + "grad_norm": 0.045654296875, + "learning_rate": 1.754936345476188e-05, + "loss": 0.8126, + "num_input_tokens_seen": 76881792, + "step": 132420 + }, + { + "epoch": 19.72371164730414, + "grad_norm": 0.05419921875, + "learning_rate": 1.745520768682951e-05, + "loss": 0.795, + "num_input_tokens_seen": 76884832, + "step": 132425 + }, + { + "epoch": 19.7244563598451, + "grad_norm": 0.07470703125, + "learning_rate": 1.736130503376454e-05, + "loss": 0.78, + "num_input_tokens_seen": 76887616, + "step": 132430 + }, + { + "epoch": 19.72520107238606, + "grad_norm": 0.0439453125, + "learning_rate": 1.7267655497150702e-05, + "loss": 0.7985, + "num_input_tokens_seen": 76890816, + "step": 132435 + }, + { + "epoch": 19.725945784927017, + "grad_norm": 0.042724609375, + "learning_rate": 1.7174259078573396e-05, + "loss": 0.8014, + "num_input_tokens_seen": 76894112, + "step": 132440 + }, + { + "epoch": 19.726690497467978, + "grad_norm": 0.038330078125, + "learning_rate": 1.7081115779608023e-05, + "loss": 0.8194, + "num_input_tokens_seen": 76897056, + "step": 132445 + }, + { + "epoch": 19.727435210008938, + "grad_norm": 0.04150390625, + "learning_rate": 1.6988225601829997e-05, + "loss": 0.7935, + "num_input_tokens_seen": 76899872, + "step": 132450 + }, + { + "epoch": 19.728179922549895, + "grad_norm": 0.0458984375, + "learning_rate": 1.689558854680806e-05, + "loss": 0.824, + "num_input_tokens_seen": 76902944, + "step": 132455 + }, + { + "epoch": 19.728924635090856, + "grad_norm": 0.051513671875, + "learning_rate": 1.6803204616105958e-05, + "loss": 0.8126, + "num_input_tokens_seen": 76905504, + "step": 132460 + }, + { + "epoch": 19.729669347631813, + "grad_norm": 0.0654296875, + "learning_rate": 1.671107381128578e-05, + "loss": 0.8116, + "num_input_tokens_seen": 76908640, + "step": 132465 + }, + { + "epoch": 19.730414060172773, + "grad_norm": 0.044189453125, + "learning_rate": 1.661919613390461e-05, + "loss": 0.7993, + "num_input_tokens_seen": 76911808, + "step": 132470 + }, + { + "epoch": 19.731158772713734, + "grad_norm": 0.040283203125, + "learning_rate": 1.6527571585512878e-05, + "loss": 0.7987, + "num_input_tokens_seen": 76914688, + "step": 132475 + }, + { + "epoch": 19.73190348525469, + "grad_norm": 0.04638671875, + "learning_rate": 1.6436200167659344e-05, + "loss": 0.797, + "num_input_tokens_seen": 76917536, + "step": 132480 + }, + { + "epoch": 19.73264819779565, + "grad_norm": 0.0625, + "learning_rate": 1.634508188188777e-05, + "loss": 0.7745, + "num_input_tokens_seen": 76920320, + "step": 132485 + }, + { + "epoch": 19.73339291033661, + "grad_norm": 0.04296875, + "learning_rate": 1.625421672973859e-05, + "loss": 0.8221, + "num_input_tokens_seen": 76923392, + "step": 132490 + }, + { + "epoch": 19.73413762287757, + "grad_norm": 0.05224609375, + "learning_rate": 1.6163604712743917e-05, + "loss": 0.8115, + "num_input_tokens_seen": 76926112, + "step": 132495 + }, + { + "epoch": 19.73488233541853, + "grad_norm": 0.09814453125, + "learning_rate": 1.6073245832435856e-05, + "loss": 0.7813, + "num_input_tokens_seen": 76929344, + "step": 132500 + }, + { + "epoch": 19.735627047959486, + "grad_norm": 0.0400390625, + "learning_rate": 1.5983140090343185e-05, + "loss": 0.8128, + "num_input_tokens_seen": 76932352, + "step": 132505 + }, + { + "epoch": 19.736371760500447, + "grad_norm": 0.04833984375, + "learning_rate": 1.5893287487984687e-05, + "loss": 0.7942, + "num_input_tokens_seen": 76935040, + "step": 132510 + }, + { + "epoch": 19.737116473041407, + "grad_norm": 0.0556640625, + "learning_rate": 1.5803688026880814e-05, + "loss": 0.7745, + "num_input_tokens_seen": 76937920, + "step": 132515 + }, + { + "epoch": 19.737861185582364, + "grad_norm": 0.035400390625, + "learning_rate": 1.571434170854369e-05, + "loss": 0.7983, + "num_input_tokens_seen": 76940544, + "step": 132520 + }, + { + "epoch": 19.738605898123325, + "grad_norm": 0.0732421875, + "learning_rate": 1.5625248534483772e-05, + "loss": 0.8014, + "num_input_tokens_seen": 76943488, + "step": 132525 + }, + { + "epoch": 19.73935061066428, + "grad_norm": 0.04345703125, + "learning_rate": 1.553640850620652e-05, + "loss": 0.7877, + "num_input_tokens_seen": 76946048, + "step": 132530 + }, + { + "epoch": 19.740095323205242, + "grad_norm": 0.06640625, + "learning_rate": 1.5447821625210745e-05, + "loss": 0.7953, + "num_input_tokens_seen": 76948960, + "step": 132535 + }, + { + "epoch": 19.740840035746203, + "grad_norm": 0.06201171875, + "learning_rate": 1.5359487892995237e-05, + "loss": 0.8251, + "num_input_tokens_seen": 76952064, + "step": 132540 + }, + { + "epoch": 19.74158474828716, + "grad_norm": 0.041259765625, + "learning_rate": 1.527140731105214e-05, + "loss": 0.8091, + "num_input_tokens_seen": 76954720, + "step": 132545 + }, + { + "epoch": 19.74232946082812, + "grad_norm": 0.04248046875, + "learning_rate": 1.5183579880868602e-05, + "loss": 0.8012, + "num_input_tokens_seen": 76957664, + "step": 132550 + }, + { + "epoch": 19.74307417336908, + "grad_norm": 0.051513671875, + "learning_rate": 1.5096005603926764e-05, + "loss": 0.7874, + "num_input_tokens_seen": 76960704, + "step": 132555 + }, + { + "epoch": 19.743818885910038, + "grad_norm": 0.040283203125, + "learning_rate": 1.5008684481710443e-05, + "loss": 0.7903, + "num_input_tokens_seen": 76963584, + "step": 132560 + }, + { + "epoch": 19.744563598451, + "grad_norm": 0.05859375, + "learning_rate": 1.4921616515690128e-05, + "loss": 0.8017, + "num_input_tokens_seen": 76966336, + "step": 132565 + }, + { + "epoch": 19.74530831099196, + "grad_norm": 0.032470703125, + "learning_rate": 1.483480170733964e-05, + "loss": 0.8037, + "num_input_tokens_seen": 76969216, + "step": 132570 + }, + { + "epoch": 19.746053023532916, + "grad_norm": 0.036865234375, + "learning_rate": 1.4748240058126138e-05, + "loss": 0.7893, + "num_input_tokens_seen": 76972640, + "step": 132575 + }, + { + "epoch": 19.746797736073876, + "grad_norm": 0.050537109375, + "learning_rate": 1.4661931569510123e-05, + "loss": 0.8006, + "num_input_tokens_seen": 76975488, + "step": 132580 + }, + { + "epoch": 19.747542448614833, + "grad_norm": 0.0830078125, + "learning_rate": 1.4575876242950425e-05, + "loss": 0.7832, + "num_input_tokens_seen": 76978272, + "step": 132585 + }, + { + "epoch": 19.748287161155794, + "grad_norm": 0.06005859375, + "learning_rate": 1.4490074079899218e-05, + "loss": 0.795, + "num_input_tokens_seen": 76981152, + "step": 132590 + }, + { + "epoch": 19.749031873696755, + "grad_norm": 0.04931640625, + "learning_rate": 1.4404525081810338e-05, + "loss": 0.7943, + "num_input_tokens_seen": 76983936, + "step": 132595 + }, + { + "epoch": 19.74977658623771, + "grad_norm": 0.099609375, + "learning_rate": 1.4319229250124299e-05, + "loss": 0.7965, + "num_input_tokens_seen": 76986944, + "step": 132600 + }, + { + "epoch": 19.750521298778672, + "grad_norm": 0.02734375, + "learning_rate": 1.4234186586284947e-05, + "loss": 0.7809, + "num_input_tokens_seen": 76990464, + "step": 132605 + }, + { + "epoch": 19.75126601131963, + "grad_norm": 0.0498046875, + "learning_rate": 1.4149397091727799e-05, + "loss": 0.7934, + "num_input_tokens_seen": 76993312, + "step": 132610 + }, + { + "epoch": 19.75201072386059, + "grad_norm": 0.052978515625, + "learning_rate": 1.4064860767885046e-05, + "loss": 0.8022, + "num_input_tokens_seen": 76996032, + "step": 132615 + }, + { + "epoch": 19.75275543640155, + "grad_norm": 0.06103515625, + "learning_rate": 1.3980577616187206e-05, + "loss": 0.795, + "num_input_tokens_seen": 76998880, + "step": 132620 + }, + { + "epoch": 19.753500148942507, + "grad_norm": 0.04443359375, + "learning_rate": 1.3896547638054812e-05, + "loss": 0.7852, + "num_input_tokens_seen": 77002016, + "step": 132625 + }, + { + "epoch": 19.754244861483468, + "grad_norm": 0.02734375, + "learning_rate": 1.381277083491006e-05, + "loss": 0.7964, + "num_input_tokens_seen": 77004672, + "step": 132630 + }, + { + "epoch": 19.754989574024428, + "grad_norm": 0.06396484375, + "learning_rate": 1.3729247208166816e-05, + "loss": 0.8281, + "num_input_tokens_seen": 77007584, + "step": 132635 + }, + { + "epoch": 19.755734286565385, + "grad_norm": 0.046142578125, + "learning_rate": 1.3645976759235623e-05, + "loss": 0.7974, + "num_input_tokens_seen": 77010528, + "step": 132640 + }, + { + "epoch": 19.756478999106346, + "grad_norm": 0.044189453125, + "learning_rate": 1.356295948952535e-05, + "loss": 0.8173, + "num_input_tokens_seen": 77013440, + "step": 132645 + }, + { + "epoch": 19.757223711647303, + "grad_norm": 0.03759765625, + "learning_rate": 1.3480195400436545e-05, + "loss": 0.7838, + "num_input_tokens_seen": 77016320, + "step": 132650 + }, + { + "epoch": 19.757968424188263, + "grad_norm": 0.030517578125, + "learning_rate": 1.3397684493369754e-05, + "loss": 0.7925, + "num_input_tokens_seen": 77019328, + "step": 132655 + }, + { + "epoch": 19.758713136729224, + "grad_norm": 0.06689453125, + "learning_rate": 1.3315426769715532e-05, + "loss": 0.7894, + "num_input_tokens_seen": 77022368, + "step": 132660 + }, + { + "epoch": 19.75945784927018, + "grad_norm": 0.03564453125, + "learning_rate": 1.3233422230867764e-05, + "loss": 0.7892, + "num_input_tokens_seen": 77025600, + "step": 132665 + }, + { + "epoch": 19.76020256181114, + "grad_norm": 0.056640625, + "learning_rate": 1.315167087820701e-05, + "loss": 0.7915, + "num_input_tokens_seen": 77028384, + "step": 132670 + }, + { + "epoch": 19.7609472743521, + "grad_norm": 0.0361328125, + "learning_rate": 1.3070172713118833e-05, + "loss": 0.7995, + "num_input_tokens_seen": 77030944, + "step": 132675 + }, + { + "epoch": 19.76169198689306, + "grad_norm": 0.06884765625, + "learning_rate": 1.2988927736977129e-05, + "loss": 0.8098, + "num_input_tokens_seen": 77033696, + "step": 132680 + }, + { + "epoch": 19.76243669943402, + "grad_norm": 0.0281982421875, + "learning_rate": 1.2907935951154136e-05, + "loss": 0.7901, + "num_input_tokens_seen": 77036576, + "step": 132685 + }, + { + "epoch": 19.763181411974976, + "grad_norm": 0.0595703125, + "learning_rate": 1.2827197357020425e-05, + "loss": 0.7947, + "num_input_tokens_seen": 77039584, + "step": 132690 + }, + { + "epoch": 19.763926124515937, + "grad_norm": 0.039794921875, + "learning_rate": 1.2746711955939905e-05, + "loss": 0.7844, + "num_input_tokens_seen": 77042560, + "step": 132695 + }, + { + "epoch": 19.764670837056897, + "grad_norm": 0.06298828125, + "learning_rate": 1.2666479749269821e-05, + "loss": 0.7794, + "num_input_tokens_seen": 77045440, + "step": 132700 + }, + { + "epoch": 19.765415549597854, + "grad_norm": 0.28125, + "learning_rate": 1.2586500738367423e-05, + "loss": 0.832, + "num_input_tokens_seen": 77048192, + "step": 132705 + }, + { + "epoch": 19.766160262138815, + "grad_norm": 0.138671875, + "learning_rate": 1.2506774924583297e-05, + "loss": 0.7791, + "num_input_tokens_seen": 77051072, + "step": 132710 + }, + { + "epoch": 19.766904974679775, + "grad_norm": 0.33203125, + "learning_rate": 1.2427302309266363e-05, + "loss": 0.8311, + "num_input_tokens_seen": 77053888, + "step": 132715 + }, + { + "epoch": 19.767649687220732, + "grad_norm": 0.17578125, + "learning_rate": 1.2348082893755551e-05, + "loss": 0.7698, + "num_input_tokens_seen": 77056800, + "step": 132720 + }, + { + "epoch": 19.768394399761693, + "grad_norm": 0.041015625, + "learning_rate": 1.2269116679391455e-05, + "loss": 0.7986, + "num_input_tokens_seen": 77059744, + "step": 132725 + }, + { + "epoch": 19.76913911230265, + "grad_norm": 0.05126953125, + "learning_rate": 1.2190403667509675e-05, + "loss": 0.7904, + "num_input_tokens_seen": 77062720, + "step": 132730 + }, + { + "epoch": 19.76988382484361, + "grad_norm": 0.09033203125, + "learning_rate": 1.2111943859435815e-05, + "loss": 0.7792, + "num_input_tokens_seen": 77065632, + "step": 132735 + }, + { + "epoch": 19.77062853738457, + "grad_norm": 0.02880859375, + "learning_rate": 1.2033737256498811e-05, + "loss": 0.8013, + "num_input_tokens_seen": 77068416, + "step": 132740 + }, + { + "epoch": 19.771373249925528, + "grad_norm": 0.037841796875, + "learning_rate": 1.195578386001761e-05, + "loss": 0.7965, + "num_input_tokens_seen": 77071424, + "step": 132745 + }, + { + "epoch": 19.77211796246649, + "grad_norm": 0.03857421875, + "learning_rate": 1.1878083671311157e-05, + "loss": 0.7998, + "num_input_tokens_seen": 77074336, + "step": 132750 + }, + { + "epoch": 19.772862675007445, + "grad_norm": 0.052734375, + "learning_rate": 1.1800636691691735e-05, + "loss": 0.8012, + "num_input_tokens_seen": 77077376, + "step": 132755 + }, + { + "epoch": 19.773607387548406, + "grad_norm": 0.0302734375, + "learning_rate": 1.172344292246663e-05, + "loss": 0.8114, + "num_input_tokens_seen": 77080032, + "step": 132760 + }, + { + "epoch": 19.774352100089367, + "grad_norm": 0.055908203125, + "learning_rate": 1.1646502364939803e-05, + "loss": 0.7954, + "num_input_tokens_seen": 77082720, + "step": 132765 + }, + { + "epoch": 19.775096812630323, + "grad_norm": 0.080078125, + "learning_rate": 1.1569815020411878e-05, + "loss": 0.7888, + "num_input_tokens_seen": 77085888, + "step": 132770 + }, + { + "epoch": 19.775841525171284, + "grad_norm": 0.0478515625, + "learning_rate": 1.149338089018015e-05, + "loss": 0.8131, + "num_input_tokens_seen": 77088960, + "step": 132775 + }, + { + "epoch": 19.776586237712245, + "grad_norm": 0.0458984375, + "learning_rate": 1.1417199975531922e-05, + "loss": 0.8213, + "num_input_tokens_seen": 77092064, + "step": 132780 + }, + { + "epoch": 19.7773309502532, + "grad_norm": 0.03564453125, + "learning_rate": 1.134127227775783e-05, + "loss": 0.7941, + "num_input_tokens_seen": 77094880, + "step": 132785 + }, + { + "epoch": 19.778075662794162, + "grad_norm": 0.040771484375, + "learning_rate": 1.1265597798138515e-05, + "loss": 0.8025, + "num_input_tokens_seen": 77097568, + "step": 132790 + }, + { + "epoch": 19.77882037533512, + "grad_norm": 0.061767578125, + "learning_rate": 1.119017653795462e-05, + "loss": 0.792, + "num_input_tokens_seen": 77100416, + "step": 132795 + }, + { + "epoch": 19.77956508787608, + "grad_norm": 0.056884765625, + "learning_rate": 1.1115008498476797e-05, + "loss": 0.7817, + "num_input_tokens_seen": 77103360, + "step": 132800 + }, + { + "epoch": 19.78030980041704, + "grad_norm": 0.061279296875, + "learning_rate": 1.1040093680977358e-05, + "loss": 0.7977, + "num_input_tokens_seen": 77106272, + "step": 132805 + }, + { + "epoch": 19.781054512957997, + "grad_norm": 0.046875, + "learning_rate": 1.096543208672196e-05, + "loss": 0.7774, + "num_input_tokens_seen": 77109024, + "step": 132810 + }, + { + "epoch": 19.781799225498958, + "grad_norm": 0.03955078125, + "learning_rate": 1.089102371697126e-05, + "loss": 0.7912, + "num_input_tokens_seen": 77111808, + "step": 132815 + }, + { + "epoch": 19.782543938039918, + "grad_norm": 0.05029296875, + "learning_rate": 1.081686857298092e-05, + "loss": 0.8034, + "num_input_tokens_seen": 77114816, + "step": 132820 + }, + { + "epoch": 19.783288650580875, + "grad_norm": 0.04296875, + "learning_rate": 1.0742966656006603e-05, + "loss": 0.788, + "num_input_tokens_seen": 77117728, + "step": 132825 + }, + { + "epoch": 19.784033363121836, + "grad_norm": 0.050537109375, + "learning_rate": 1.0669317967295643e-05, + "loss": 0.7847, + "num_input_tokens_seen": 77120416, + "step": 132830 + }, + { + "epoch": 19.784778075662793, + "grad_norm": 0.05322265625, + "learning_rate": 1.0595922508092047e-05, + "loss": 0.7831, + "num_input_tokens_seen": 77123648, + "step": 132835 + }, + { + "epoch": 19.785522788203753, + "grad_norm": 0.048095703125, + "learning_rate": 1.052278027963649e-05, + "loss": 0.8007, + "num_input_tokens_seen": 77126464, + "step": 132840 + }, + { + "epoch": 19.786267500744714, + "grad_norm": 0.03564453125, + "learning_rate": 1.0449891283162982e-05, + "loss": 0.7916, + "num_input_tokens_seen": 77129312, + "step": 132845 + }, + { + "epoch": 19.78701221328567, + "grad_norm": 0.033203125, + "learning_rate": 1.0377255519903871e-05, + "loss": 0.7927, + "num_input_tokens_seen": 77132032, + "step": 132850 + }, + { + "epoch": 19.78775692582663, + "grad_norm": 0.08984375, + "learning_rate": 1.0304872991086511e-05, + "loss": 0.7899, + "num_input_tokens_seen": 77134880, + "step": 132855 + }, + { + "epoch": 19.788501638367592, + "grad_norm": 0.041748046875, + "learning_rate": 1.0232743697933255e-05, + "loss": 0.7937, + "num_input_tokens_seen": 77137472, + "step": 132860 + }, + { + "epoch": 19.78924635090855, + "grad_norm": 0.111328125, + "learning_rate": 1.0160867641663129e-05, + "loss": 0.8061, + "num_input_tokens_seen": 77140320, + "step": 132865 + }, + { + "epoch": 19.78999106344951, + "grad_norm": 0.04296875, + "learning_rate": 1.0089244823490161e-05, + "loss": 0.7952, + "num_input_tokens_seen": 77143104, + "step": 132870 + }, + { + "epoch": 19.790735775990466, + "grad_norm": 0.05517578125, + "learning_rate": 1.0017875244623386e-05, + "loss": 0.7957, + "num_input_tokens_seen": 77146240, + "step": 132875 + }, + { + "epoch": 19.791480488531427, + "grad_norm": 0.046875, + "learning_rate": 9.94675890627017e-06, + "loss": 0.8091, + "num_input_tokens_seen": 77149344, + "step": 132880 + }, + { + "epoch": 19.792225201072387, + "grad_norm": 0.0615234375, + "learning_rate": 9.87589580963122e-06, + "loss": 0.793, + "num_input_tokens_seen": 77152352, + "step": 132885 + }, + { + "epoch": 19.792969913613344, + "grad_norm": 0.05712890625, + "learning_rate": 9.805285955903908e-06, + "loss": 0.7913, + "num_input_tokens_seen": 77155168, + "step": 132890 + }, + { + "epoch": 19.793714626154305, + "grad_norm": 0.05615234375, + "learning_rate": 9.734929346280618e-06, + "loss": 0.798, + "num_input_tokens_seen": 77157920, + "step": 132895 + }, + { + "epoch": 19.794459338695262, + "grad_norm": 0.051513671875, + "learning_rate": 9.664825981950398e-06, + "loss": 0.7885, + "num_input_tokens_seen": 77160896, + "step": 132900 + }, + { + "epoch": 19.795204051236222, + "grad_norm": 0.037353515625, + "learning_rate": 9.594975864097299e-06, + "loss": 0.7921, + "num_input_tokens_seen": 77163904, + "step": 132905 + }, + { + "epoch": 19.795948763777183, + "grad_norm": 0.039306640625, + "learning_rate": 9.525378993902045e-06, + "loss": 0.7833, + "num_input_tokens_seen": 77166688, + "step": 132910 + }, + { + "epoch": 19.79669347631814, + "grad_norm": 0.051025390625, + "learning_rate": 9.45603537254036e-06, + "loss": 0.7916, + "num_input_tokens_seen": 77169760, + "step": 132915 + }, + { + "epoch": 19.7974381888591, + "grad_norm": 0.05419921875, + "learning_rate": 9.386945001181312e-06, + "loss": 0.797, + "num_input_tokens_seen": 77172704, + "step": 132920 + }, + { + "epoch": 19.79818290140006, + "grad_norm": 0.040283203125, + "learning_rate": 9.31810788099563e-06, + "loss": 0.801, + "num_input_tokens_seen": 77175680, + "step": 132925 + }, + { + "epoch": 19.798927613941018, + "grad_norm": 0.0439453125, + "learning_rate": 9.24952401314405e-06, + "loss": 0.8083, + "num_input_tokens_seen": 77178784, + "step": 132930 + }, + { + "epoch": 19.79967232648198, + "grad_norm": 0.037841796875, + "learning_rate": 9.181193398787313e-06, + "loss": 0.809, + "num_input_tokens_seen": 77181536, + "step": 132935 + }, + { + "epoch": 19.800417039022935, + "grad_norm": 0.03662109375, + "learning_rate": 9.113116039076164e-06, + "loss": 0.7916, + "num_input_tokens_seen": 77184512, + "step": 132940 + }, + { + "epoch": 19.801161751563896, + "grad_norm": 0.041748046875, + "learning_rate": 9.045291935164679e-06, + "loss": 0.8111, + "num_input_tokens_seen": 77187200, + "step": 132945 + }, + { + "epoch": 19.801906464104857, + "grad_norm": 0.07080078125, + "learning_rate": 8.977721088195277e-06, + "loss": 0.7867, + "num_input_tokens_seen": 77189952, + "step": 132950 + }, + { + "epoch": 19.802651176645814, + "grad_norm": 0.06494140625, + "learning_rate": 8.910403499312046e-06, + "loss": 0.7985, + "num_input_tokens_seen": 77192896, + "step": 132955 + }, + { + "epoch": 19.803395889186774, + "grad_norm": 0.036376953125, + "learning_rate": 8.843339169650744e-06, + "loss": 0.7884, + "num_input_tokens_seen": 77195872, + "step": 132960 + }, + { + "epoch": 19.804140601727735, + "grad_norm": 0.034423828125, + "learning_rate": 8.77652810034546e-06, + "loss": 0.8095, + "num_input_tokens_seen": 77198752, + "step": 132965 + }, + { + "epoch": 19.80488531426869, + "grad_norm": 0.042724609375, + "learning_rate": 8.709970292523627e-06, + "loss": 0.7838, + "num_input_tokens_seen": 77201600, + "step": 132970 + }, + { + "epoch": 19.805630026809652, + "grad_norm": 0.044921875, + "learning_rate": 8.643665747311014e-06, + "loss": 0.7999, + "num_input_tokens_seen": 77204512, + "step": 132975 + }, + { + "epoch": 19.80637473935061, + "grad_norm": 0.045654296875, + "learning_rate": 8.577614465826722e-06, + "loss": 0.8013, + "num_input_tokens_seen": 77207136, + "step": 132980 + }, + { + "epoch": 19.80711945189157, + "grad_norm": 0.035888671875, + "learning_rate": 8.511816449186526e-06, + "loss": 0.7742, + "num_input_tokens_seen": 77210112, + "step": 132985 + }, + { + "epoch": 19.80786416443253, + "grad_norm": 0.076171875, + "learning_rate": 8.446271698504537e-06, + "loss": 0.7891, + "num_input_tokens_seen": 77213088, + "step": 132990 + }, + { + "epoch": 19.808608876973487, + "grad_norm": 0.038818359375, + "learning_rate": 8.380980214883204e-06, + "loss": 0.8008, + "num_input_tokens_seen": 77215968, + "step": 132995 + }, + { + "epoch": 19.809353589514448, + "grad_norm": 0.0286865234375, + "learning_rate": 8.315941999429977e-06, + "loss": 0.8189, + "num_input_tokens_seen": 77218912, + "step": 133000 + }, + { + "epoch": 19.81009830205541, + "grad_norm": 0.031494140625, + "learning_rate": 8.251157053242308e-06, + "loss": 0.8062, + "num_input_tokens_seen": 77221760, + "step": 133005 + }, + { + "epoch": 19.810843014596365, + "grad_norm": 0.026611328125, + "learning_rate": 8.186625377414324e-06, + "loss": 0.7995, + "num_input_tokens_seen": 77224672, + "step": 133010 + }, + { + "epoch": 19.811587727137326, + "grad_norm": 0.0274658203125, + "learning_rate": 8.12234697303682e-06, + "loss": 0.789, + "num_input_tokens_seen": 77227712, + "step": 133015 + }, + { + "epoch": 19.812332439678283, + "grad_norm": 0.044921875, + "learning_rate": 8.058321841193927e-06, + "loss": 0.8123, + "num_input_tokens_seen": 77230528, + "step": 133020 + }, + { + "epoch": 19.813077152219243, + "grad_norm": 0.031005859375, + "learning_rate": 7.994549982968113e-06, + "loss": 0.8023, + "num_input_tokens_seen": 77233344, + "step": 133025 + }, + { + "epoch": 19.813821864760204, + "grad_norm": 0.054443359375, + "learning_rate": 7.931031399438514e-06, + "loss": 0.8015, + "num_input_tokens_seen": 77236384, + "step": 133030 + }, + { + "epoch": 19.81456657730116, + "grad_norm": 0.041748046875, + "learning_rate": 7.867766091677608e-06, + "loss": 0.8002, + "num_input_tokens_seen": 77239520, + "step": 133035 + }, + { + "epoch": 19.81531128984212, + "grad_norm": 0.047607421875, + "learning_rate": 7.804754060751206e-06, + "loss": 0.7945, + "num_input_tokens_seen": 77242432, + "step": 133040 + }, + { + "epoch": 19.816056002383082, + "grad_norm": 0.0390625, + "learning_rate": 7.741995307728455e-06, + "loss": 0.7995, + "num_input_tokens_seen": 77245280, + "step": 133045 + }, + { + "epoch": 19.81680071492404, + "grad_norm": 0.055419921875, + "learning_rate": 7.679489833665176e-06, + "loss": 0.8007, + "num_input_tokens_seen": 77248320, + "step": 133050 + }, + { + "epoch": 19.817545427465, + "grad_norm": 0.045166015625, + "learning_rate": 7.6172376396205215e-06, + "loss": 0.7891, + "num_input_tokens_seen": 77251424, + "step": 133055 + }, + { + "epoch": 19.818290140005956, + "grad_norm": 0.03369140625, + "learning_rate": 7.555238726645319e-06, + "loss": 0.8031, + "num_input_tokens_seen": 77254432, + "step": 133060 + }, + { + "epoch": 19.819034852546917, + "grad_norm": 0.0703125, + "learning_rate": 7.493493095785397e-06, + "loss": 0.7738, + "num_input_tokens_seen": 77257600, + "step": 133065 + }, + { + "epoch": 19.819779565087877, + "grad_norm": 0.0291748046875, + "learning_rate": 7.432000748086586e-06, + "loss": 0.7967, + "num_input_tokens_seen": 77260512, + "step": 133070 + }, + { + "epoch": 19.820524277628834, + "grad_norm": 0.040771484375, + "learning_rate": 7.370761684584726e-06, + "loss": 0.7935, + "num_input_tokens_seen": 77263840, + "step": 133075 + }, + { + "epoch": 19.821268990169795, + "grad_norm": 0.12060546875, + "learning_rate": 7.309775906317317e-06, + "loss": 0.7974, + "num_input_tokens_seen": 77266752, + "step": 133080 + }, + { + "epoch": 19.822013702710755, + "grad_norm": 0.040771484375, + "learning_rate": 7.249043414313538e-06, + "loss": 0.8106, + "num_input_tokens_seen": 77269568, + "step": 133085 + }, + { + "epoch": 19.822758415251712, + "grad_norm": 0.037841796875, + "learning_rate": 7.188564209599235e-06, + "loss": 0.8006, + "num_input_tokens_seen": 77272416, + "step": 133090 + }, + { + "epoch": 19.823503127792673, + "grad_norm": 0.043701171875, + "learning_rate": 7.128338293195257e-06, + "loss": 0.784, + "num_input_tokens_seen": 77274912, + "step": 133095 + }, + { + "epoch": 19.82424784033363, + "grad_norm": 0.03515625, + "learning_rate": 7.06836566612079e-06, + "loss": 0.8047, + "num_input_tokens_seen": 77277632, + "step": 133100 + }, + { + "epoch": 19.82499255287459, + "grad_norm": 0.08984375, + "learning_rate": 7.008646329386692e-06, + "loss": 0.7671, + "num_input_tokens_seen": 77280864, + "step": 133105 + }, + { + "epoch": 19.82573726541555, + "grad_norm": 0.04150390625, + "learning_rate": 6.949180284005485e-06, + "loss": 0.7903, + "num_input_tokens_seen": 77283680, + "step": 133110 + }, + { + "epoch": 19.826481977956508, + "grad_norm": 0.0693359375, + "learning_rate": 6.889967530978036e-06, + "loss": 0.8127, + "num_input_tokens_seen": 77286752, + "step": 133115 + }, + { + "epoch": 19.82722669049747, + "grad_norm": 0.042724609375, + "learning_rate": 6.831008071306876e-06, + "loss": 0.7749, + "num_input_tokens_seen": 77289632, + "step": 133120 + }, + { + "epoch": 19.827971403038426, + "grad_norm": 0.052734375, + "learning_rate": 6.772301905987876e-06, + "loss": 0.8339, + "num_input_tokens_seen": 77292512, + "step": 133125 + }, + { + "epoch": 19.828716115579386, + "grad_norm": 0.038330078125, + "learning_rate": 6.71384903601191e-06, + "loss": 0.7903, + "num_input_tokens_seen": 77295232, + "step": 133130 + }, + { + "epoch": 19.829460828120347, + "grad_norm": 0.043212890625, + "learning_rate": 6.655649462366519e-06, + "loss": 0.7957, + "num_input_tokens_seen": 77298112, + "step": 133135 + }, + { + "epoch": 19.830205540661304, + "grad_norm": 0.06591796875, + "learning_rate": 6.597703186035919e-06, + "loss": 0.7986, + "num_input_tokens_seen": 77301248, + "step": 133140 + }, + { + "epoch": 19.830950253202264, + "grad_norm": 0.046875, + "learning_rate": 6.5400102079976595e-06, + "loss": 0.8137, + "num_input_tokens_seen": 77304096, + "step": 133145 + }, + { + "epoch": 19.831694965743225, + "grad_norm": 0.058349609375, + "learning_rate": 6.482570529227627e-06, + "loss": 0.7789, + "num_input_tokens_seen": 77306880, + "step": 133150 + }, + { + "epoch": 19.83243967828418, + "grad_norm": 0.053466796875, + "learning_rate": 6.425384150696711e-06, + "loss": 0.7903, + "num_input_tokens_seen": 77309664, + "step": 133155 + }, + { + "epoch": 19.833184390825142, + "grad_norm": 0.03955078125, + "learning_rate": 6.368451073369141e-06, + "loss": 0.7803, + "num_input_tokens_seen": 77312800, + "step": 133160 + }, + { + "epoch": 19.8339291033661, + "grad_norm": 0.031982421875, + "learning_rate": 6.311771298209145e-06, + "loss": 0.7858, + "num_input_tokens_seen": 77315648, + "step": 133165 + }, + { + "epoch": 19.83467381590706, + "grad_norm": 0.04638671875, + "learning_rate": 6.255344826170961e-06, + "loss": 0.7869, + "num_input_tokens_seen": 77318688, + "step": 133170 + }, + { + "epoch": 19.83541852844802, + "grad_norm": 0.03759765625, + "learning_rate": 6.1991716582104896e-06, + "loss": 0.8196, + "num_input_tokens_seen": 77321472, + "step": 133175 + }, + { + "epoch": 19.836163240988977, + "grad_norm": 0.047119140625, + "learning_rate": 6.143251795276971e-06, + "loss": 0.8144, + "num_input_tokens_seen": 77324384, + "step": 133180 + }, + { + "epoch": 19.836907953529938, + "grad_norm": 0.04638671875, + "learning_rate": 6.087585238312986e-06, + "loss": 0.7957, + "num_input_tokens_seen": 77327264, + "step": 133185 + }, + { + "epoch": 19.8376526660709, + "grad_norm": 0.0380859375, + "learning_rate": 6.032171988261114e-06, + "loss": 0.7954, + "num_input_tokens_seen": 77330272, + "step": 133190 + }, + { + "epoch": 19.838397378611855, + "grad_norm": 0.06005859375, + "learning_rate": 5.977012046055607e-06, + "loss": 0.803, + "num_input_tokens_seen": 77333216, + "step": 133195 + }, + { + "epoch": 19.839142091152816, + "grad_norm": 0.04150390625, + "learning_rate": 5.9221054126290526e-06, + "loss": 0.8105, + "num_input_tokens_seen": 77335840, + "step": 133200 + }, + { + "epoch": 19.839886803693773, + "grad_norm": 0.053466796875, + "learning_rate": 5.867452088910707e-06, + "loss": 0.7732, + "num_input_tokens_seen": 77338528, + "step": 133205 + }, + { + "epoch": 19.840631516234733, + "grad_norm": 0.03955078125, + "learning_rate": 5.813052075821501e-06, + "loss": 0.8061, + "num_input_tokens_seen": 77341472, + "step": 133210 + }, + { + "epoch": 19.841376228775694, + "grad_norm": 0.031005859375, + "learning_rate": 5.7589053742806985e-06, + "loss": 0.8085, + "num_input_tokens_seen": 77344320, + "step": 133215 + }, + { + "epoch": 19.84212094131665, + "grad_norm": 0.055419921875, + "learning_rate": 5.705011985204233e-06, + "loss": 0.7971, + "num_input_tokens_seen": 77347200, + "step": 133220 + }, + { + "epoch": 19.84286565385761, + "grad_norm": 0.03515625, + "learning_rate": 5.651371909501379e-06, + "loss": 0.7889, + "num_input_tokens_seen": 77349856, + "step": 133225 + }, + { + "epoch": 19.843610366398572, + "grad_norm": 0.05810546875, + "learning_rate": 5.597985148079743e-06, + "loss": 0.801, + "num_input_tokens_seen": 77352480, + "step": 133230 + }, + { + "epoch": 19.84435507893953, + "grad_norm": 0.032470703125, + "learning_rate": 5.544851701841935e-06, + "loss": 0.7975, + "num_input_tokens_seen": 77355168, + "step": 133235 + }, + { + "epoch": 19.84509979148049, + "grad_norm": 0.08349609375, + "learning_rate": 5.491971571682241e-06, + "loss": 0.7769, + "num_input_tokens_seen": 77358240, + "step": 133240 + }, + { + "epoch": 19.845844504021446, + "grad_norm": 0.051513671875, + "learning_rate": 5.439344758496611e-06, + "loss": 0.802, + "num_input_tokens_seen": 77361120, + "step": 133245 + }, + { + "epoch": 19.846589216562407, + "grad_norm": 0.056396484375, + "learning_rate": 5.386971263172668e-06, + "loss": 0.7851, + "num_input_tokens_seen": 77364000, + "step": 133250 + }, + { + "epoch": 19.847333929103367, + "grad_norm": 0.050048828125, + "learning_rate": 5.334851086596371e-06, + "loss": 0.8137, + "num_input_tokens_seen": 77366752, + "step": 133255 + }, + { + "epoch": 19.848078641644324, + "grad_norm": 0.0301513671875, + "learning_rate": 5.282984229648679e-06, + "loss": 0.8088, + "num_input_tokens_seen": 77369728, + "step": 133260 + }, + { + "epoch": 19.848823354185285, + "grad_norm": 0.0537109375, + "learning_rate": 5.231370693203896e-06, + "loss": 0.8094, + "num_input_tokens_seen": 77372896, + "step": 133265 + }, + { + "epoch": 19.849568066726242, + "grad_norm": 0.043701171875, + "learning_rate": 5.180010478136321e-06, + "loss": 0.7803, + "num_input_tokens_seen": 77375840, + "step": 133270 + }, + { + "epoch": 19.850312779267203, + "grad_norm": 0.03759765625, + "learning_rate": 5.128903585311927e-06, + "loss": 0.7895, + "num_input_tokens_seen": 77378688, + "step": 133275 + }, + { + "epoch": 19.851057491808163, + "grad_norm": 0.03662109375, + "learning_rate": 5.078050015595026e-06, + "loss": 0.8288, + "num_input_tokens_seen": 77381760, + "step": 133280 + }, + { + "epoch": 19.85180220434912, + "grad_norm": 0.054931640625, + "learning_rate": 5.027449769843262e-06, + "loss": 0.7944, + "num_input_tokens_seen": 77384352, + "step": 133285 + }, + { + "epoch": 19.85254691689008, + "grad_norm": 0.043701171875, + "learning_rate": 4.977102848912617e-06, + "loss": 0.8058, + "num_input_tokens_seen": 77387264, + "step": 133290 + }, + { + "epoch": 19.85329162943104, + "grad_norm": 0.05615234375, + "learning_rate": 4.92700925365408e-06, + "loss": 0.7952, + "num_input_tokens_seen": 77389888, + "step": 133295 + }, + { + "epoch": 19.854036341971998, + "grad_norm": 0.034423828125, + "learning_rate": 4.8771689849136375e-06, + "loss": 0.8003, + "num_input_tokens_seen": 77392928, + "step": 133300 + }, + { + "epoch": 19.85478105451296, + "grad_norm": 0.0791015625, + "learning_rate": 4.827582043532286e-06, + "loss": 0.7923, + "num_input_tokens_seen": 77395936, + "step": 133305 + }, + { + "epoch": 19.855525767053916, + "grad_norm": 0.0294189453125, + "learning_rate": 4.778248430349352e-06, + "loss": 0.8104, + "num_input_tokens_seen": 77398848, + "step": 133310 + }, + { + "epoch": 19.856270479594876, + "grad_norm": 0.0556640625, + "learning_rate": 4.729168146197504e-06, + "loss": 0.7907, + "num_input_tokens_seen": 77401664, + "step": 133315 + }, + { + "epoch": 19.857015192135837, + "grad_norm": 0.0634765625, + "learning_rate": 4.680341191904413e-06, + "loss": 0.8159, + "num_input_tokens_seen": 77404608, + "step": 133320 + }, + { + "epoch": 19.857759904676794, + "grad_norm": 0.0228271484375, + "learning_rate": 4.631767568297751e-06, + "loss": 0.7935, + "num_input_tokens_seen": 77407552, + "step": 133325 + }, + { + "epoch": 19.858504617217754, + "grad_norm": 0.0537109375, + "learning_rate": 4.5834472761968615e-06, + "loss": 0.7761, + "num_input_tokens_seen": 77410624, + "step": 133330 + }, + { + "epoch": 19.859249329758715, + "grad_norm": 0.076171875, + "learning_rate": 4.535380316417758e-06, + "loss": 0.8052, + "num_input_tokens_seen": 77413344, + "step": 133335 + }, + { + "epoch": 19.85999404229967, + "grad_norm": 0.03125, + "learning_rate": 4.4875666897714605e-06, + "loss": 0.7839, + "num_input_tokens_seen": 77415936, + "step": 133340 + }, + { + "epoch": 19.860738754840632, + "grad_norm": 0.046142578125, + "learning_rate": 4.440006397068985e-06, + "loss": 0.7902, + "num_input_tokens_seen": 77418944, + "step": 133345 + }, + { + "epoch": 19.86148346738159, + "grad_norm": 0.05517578125, + "learning_rate": 4.392699439109693e-06, + "loss": 0.7902, + "num_input_tokens_seen": 77422016, + "step": 133350 + }, + { + "epoch": 19.86222817992255, + "grad_norm": 0.04296875, + "learning_rate": 4.345645816696275e-06, + "loss": 0.7973, + "num_input_tokens_seen": 77424672, + "step": 133355 + }, + { + "epoch": 19.86297289246351, + "grad_norm": 0.047607421875, + "learning_rate": 4.298845530623096e-06, + "loss": 0.7786, + "num_input_tokens_seen": 77427616, + "step": 133360 + }, + { + "epoch": 19.863717605004467, + "grad_norm": 0.0771484375, + "learning_rate": 4.25229858167786e-06, + "loss": 0.7882, + "num_input_tokens_seen": 77430368, + "step": 133365 + }, + { + "epoch": 19.864462317545428, + "grad_norm": 0.045654296875, + "learning_rate": 4.206004970649934e-06, + "loss": 0.7972, + "num_input_tokens_seen": 77433312, + "step": 133370 + }, + { + "epoch": 19.86520703008639, + "grad_norm": 0.033203125, + "learning_rate": 4.159964698320362e-06, + "loss": 0.7941, + "num_input_tokens_seen": 77436128, + "step": 133375 + }, + { + "epoch": 19.865951742627345, + "grad_norm": 0.045166015625, + "learning_rate": 4.1141777654685185e-06, + "loss": 0.8006, + "num_input_tokens_seen": 77438944, + "step": 133380 + }, + { + "epoch": 19.866696455168306, + "grad_norm": 0.032470703125, + "learning_rate": 4.068644172865454e-06, + "loss": 0.7883, + "num_input_tokens_seen": 77441824, + "step": 133385 + }, + { + "epoch": 19.867441167709263, + "grad_norm": 0.0322265625, + "learning_rate": 4.023363921280554e-06, + "loss": 0.7825, + "num_input_tokens_seen": 77444576, + "step": 133390 + }, + { + "epoch": 19.868185880250223, + "grad_norm": 0.049072265625, + "learning_rate": 3.978337011479871e-06, + "loss": 0.7748, + "num_input_tokens_seen": 77447872, + "step": 133395 + }, + { + "epoch": 19.868930592791184, + "grad_norm": 0.03369140625, + "learning_rate": 3.933563444224464e-06, + "loss": 0.8046, + "num_input_tokens_seen": 77450976, + "step": 133400 + }, + { + "epoch": 19.86967530533214, + "grad_norm": 0.03173828125, + "learning_rate": 3.889043220270394e-06, + "loss": 0.8025, + "num_input_tokens_seen": 77453856, + "step": 133405 + }, + { + "epoch": 19.8704200178731, + "grad_norm": 0.0252685546875, + "learning_rate": 3.8447763403703924e-06, + "loss": 0.7879, + "num_input_tokens_seen": 77456736, + "step": 133410 + }, + { + "epoch": 19.87116473041406, + "grad_norm": 0.040283203125, + "learning_rate": 3.80076280527053e-06, + "loss": 0.8064, + "num_input_tokens_seen": 77459488, + "step": 133415 + }, + { + "epoch": 19.87190944295502, + "grad_norm": 0.0654296875, + "learning_rate": 3.75700261571521e-06, + "loss": 0.7985, + "num_input_tokens_seen": 77462464, + "step": 133420 + }, + { + "epoch": 19.87265415549598, + "grad_norm": 0.05615234375, + "learning_rate": 3.713495772443842e-06, + "loss": 0.783, + "num_input_tokens_seen": 77465280, + "step": 133425 + }, + { + "epoch": 19.873398868036936, + "grad_norm": 0.10986328125, + "learning_rate": 3.670242276190838e-06, + "loss": 0.8052, + "num_input_tokens_seen": 77467936, + "step": 133430 + }, + { + "epoch": 19.874143580577897, + "grad_norm": 0.042236328125, + "learning_rate": 3.6272421276889455e-06, + "loss": 0.7802, + "num_input_tokens_seen": 77470816, + "step": 133435 + }, + { + "epoch": 19.874888293118858, + "grad_norm": 0.039306640625, + "learning_rate": 3.5844953276609193e-06, + "loss": 0.8064, + "num_input_tokens_seen": 77473472, + "step": 133440 + }, + { + "epoch": 19.875633005659815, + "grad_norm": 0.30078125, + "learning_rate": 3.5420018768328453e-06, + "loss": 0.8578, + "num_input_tokens_seen": 77476864, + "step": 133445 + }, + { + "epoch": 19.876377718200775, + "grad_norm": 0.05712890625, + "learning_rate": 3.4997617759208174e-06, + "loss": 0.8129, + "num_input_tokens_seen": 77479744, + "step": 133450 + }, + { + "epoch": 19.877122430741732, + "grad_norm": 0.0537109375, + "learning_rate": 3.4577750256392648e-06, + "loss": 0.8076, + "num_input_tokens_seen": 77482400, + "step": 133455 + }, + { + "epoch": 19.877867143282693, + "grad_norm": 0.052490234375, + "learning_rate": 3.4160416266959536e-06, + "loss": 0.7845, + "num_input_tokens_seen": 77485600, + "step": 133460 + }, + { + "epoch": 19.878611855823653, + "grad_norm": 0.04931640625, + "learning_rate": 3.3745615797969863e-06, + "loss": 0.792, + "num_input_tokens_seen": 77488544, + "step": 133465 + }, + { + "epoch": 19.87935656836461, + "grad_norm": 0.044677734375, + "learning_rate": 3.333334885641803e-06, + "loss": 0.8049, + "num_input_tokens_seen": 77491360, + "step": 133470 + }, + { + "epoch": 19.88010128090557, + "grad_norm": 0.2197265625, + "learning_rate": 3.2923615449298447e-06, + "loss": 0.8291, + "num_input_tokens_seen": 77494112, + "step": 133475 + }, + { + "epoch": 19.88084599344653, + "grad_norm": 0.0703125, + "learning_rate": 3.2516415583505595e-06, + "loss": 0.8075, + "num_input_tokens_seen": 77497120, + "step": 133480 + }, + { + "epoch": 19.881590705987488, + "grad_norm": 0.056640625, + "learning_rate": 3.211174926591731e-06, + "loss": 0.804, + "num_input_tokens_seen": 77500000, + "step": 133485 + }, + { + "epoch": 19.88233541852845, + "grad_norm": 0.064453125, + "learning_rate": 3.1709616503394766e-06, + "loss": 0.7872, + "num_input_tokens_seen": 77502688, + "step": 133490 + }, + { + "epoch": 19.883080131069406, + "grad_norm": 0.037841796875, + "learning_rate": 3.1310017302715875e-06, + "loss": 0.7859, + "num_input_tokens_seen": 77505312, + "step": 133495 + }, + { + "epoch": 19.883824843610366, + "grad_norm": 0.045654296875, + "learning_rate": 3.0912951670625245e-06, + "loss": 0.7942, + "num_input_tokens_seen": 77508352, + "step": 133500 + }, + { + "epoch": 19.884569556151327, + "grad_norm": 0.0732421875, + "learning_rate": 3.051841961385082e-06, + "loss": 0.7944, + "num_input_tokens_seen": 77511584, + "step": 133505 + }, + { + "epoch": 19.885314268692284, + "grad_norm": 0.030029296875, + "learning_rate": 3.0126421139037295e-06, + "loss": 0.8159, + "num_input_tokens_seen": 77514592, + "step": 133510 + }, + { + "epoch": 19.886058981233244, + "grad_norm": 0.048828125, + "learning_rate": 2.9736956252812697e-06, + "loss": 0.803, + "num_input_tokens_seen": 77517408, + "step": 133515 + }, + { + "epoch": 19.886803693774205, + "grad_norm": 0.07080078125, + "learning_rate": 2.9350024961755095e-06, + "loss": 0.8085, + "num_input_tokens_seen": 77520256, + "step": 133520 + }, + { + "epoch": 19.88754840631516, + "grad_norm": 0.033447265625, + "learning_rate": 2.8965627272425906e-06, + "loss": 0.7915, + "num_input_tokens_seen": 77523360, + "step": 133525 + }, + { + "epoch": 19.888293118856122, + "grad_norm": 0.0693359375, + "learning_rate": 2.8583763191269984e-06, + "loss": 0.7953, + "num_input_tokens_seen": 77526016, + "step": 133530 + }, + { + "epoch": 19.88903783139708, + "grad_norm": 0.047607421875, + "learning_rate": 2.8204432724798776e-06, + "loss": 0.7945, + "num_input_tokens_seen": 77529248, + "step": 133535 + }, + { + "epoch": 19.88978254393804, + "grad_norm": 0.056396484375, + "learning_rate": 2.7827635879357215e-06, + "loss": 0.7898, + "num_input_tokens_seen": 77532096, + "step": 133540 + }, + { + "epoch": 19.890527256479, + "grad_norm": 0.056640625, + "learning_rate": 2.7453372661373486e-06, + "loss": 0.7835, + "num_input_tokens_seen": 77534720, + "step": 133545 + }, + { + "epoch": 19.891271969019957, + "grad_norm": 0.0810546875, + "learning_rate": 2.7081643077125906e-06, + "loss": 0.7729, + "num_input_tokens_seen": 77537536, + "step": 133550 + }, + { + "epoch": 19.892016681560918, + "grad_norm": 0.06103515625, + "learning_rate": 2.671244713289278e-06, + "loss": 0.7942, + "num_input_tokens_seen": 77540608, + "step": 133555 + }, + { + "epoch": 19.89276139410188, + "grad_norm": 0.060302734375, + "learning_rate": 2.634578483495242e-06, + "loss": 0.7992, + "num_input_tokens_seen": 77543360, + "step": 133560 + }, + { + "epoch": 19.893506106642835, + "grad_norm": 0.035888671875, + "learning_rate": 2.598165618946657e-06, + "loss": 0.7803, + "num_input_tokens_seen": 77546464, + "step": 133565 + }, + { + "epoch": 19.894250819183796, + "grad_norm": 0.045654296875, + "learning_rate": 2.562006120258031e-06, + "loss": 0.8122, + "num_input_tokens_seen": 77549376, + "step": 133570 + }, + { + "epoch": 19.894995531724753, + "grad_norm": 0.0255126953125, + "learning_rate": 2.5260999880422074e-06, + "loss": 0.7947, + "num_input_tokens_seen": 77552128, + "step": 133575 + }, + { + "epoch": 19.895740244265713, + "grad_norm": 0.044189453125, + "learning_rate": 2.4904472229053676e-06, + "loss": 0.8036, + "num_input_tokens_seen": 77555104, + "step": 133580 + }, + { + "epoch": 19.896484956806674, + "grad_norm": 0.04541015625, + "learning_rate": 2.4550478254503627e-06, + "loss": 0.7935, + "num_input_tokens_seen": 77557920, + "step": 133585 + }, + { + "epoch": 19.89722966934763, + "grad_norm": 0.0419921875, + "learning_rate": 2.4199017962750478e-06, + "loss": 0.7992, + "num_input_tokens_seen": 77560992, + "step": 133590 + }, + { + "epoch": 19.89797438188859, + "grad_norm": 0.036865234375, + "learning_rate": 2.385009135970617e-06, + "loss": 0.8029, + "num_input_tokens_seen": 77564032, + "step": 133595 + }, + { + "epoch": 19.898719094429552, + "grad_norm": 0.0400390625, + "learning_rate": 2.350369845129929e-06, + "loss": 0.7891, + "num_input_tokens_seen": 77567296, + "step": 133600 + }, + { + "epoch": 19.89946380697051, + "grad_norm": 0.035400390625, + "learning_rate": 2.3159839243358516e-06, + "loss": 0.7932, + "num_input_tokens_seen": 77570112, + "step": 133605 + }, + { + "epoch": 19.90020851951147, + "grad_norm": 0.042236328125, + "learning_rate": 2.2818513741712507e-06, + "loss": 0.8027, + "num_input_tokens_seen": 77572928, + "step": 133610 + }, + { + "epoch": 19.900953232052427, + "grad_norm": 0.09228515625, + "learning_rate": 2.247972195210668e-06, + "loss": 0.8028, + "num_input_tokens_seen": 77575520, + "step": 133615 + }, + { + "epoch": 19.901697944593387, + "grad_norm": 0.039306640625, + "learning_rate": 2.2143463880286427e-06, + "loss": 0.7899, + "num_input_tokens_seen": 77578272, + "step": 133620 + }, + { + "epoch": 19.902442657134348, + "grad_norm": 0.036376953125, + "learning_rate": 2.1809739531913894e-06, + "loss": 0.8015, + "num_input_tokens_seen": 77581344, + "step": 133625 + }, + { + "epoch": 19.903187369675305, + "grad_norm": 0.054931640625, + "learning_rate": 2.1478548912634566e-06, + "loss": 0.8172, + "num_input_tokens_seen": 77584352, + "step": 133630 + }, + { + "epoch": 19.903932082216265, + "grad_norm": 0.0498046875, + "learning_rate": 2.114989202806061e-06, + "loss": 0.8096, + "num_input_tokens_seen": 77587264, + "step": 133635 + }, + { + "epoch": 19.904676794757222, + "grad_norm": 0.041259765625, + "learning_rate": 2.0823768883704297e-06, + "loss": 0.79, + "num_input_tokens_seen": 77590112, + "step": 133640 + }, + { + "epoch": 19.905421507298183, + "grad_norm": 0.0673828125, + "learning_rate": 2.050017948511118e-06, + "loss": 0.7912, + "num_input_tokens_seen": 77592800, + "step": 133645 + }, + { + "epoch": 19.906166219839143, + "grad_norm": 0.08984375, + "learning_rate": 2.017912383772691e-06, + "loss": 0.7753, + "num_input_tokens_seen": 77595840, + "step": 133650 + }, + { + "epoch": 19.9069109323801, + "grad_norm": 0.05029296875, + "learning_rate": 1.9860601946997125e-06, + "loss": 0.794, + "num_input_tokens_seen": 77598752, + "step": 133655 + }, + { + "epoch": 19.90765564492106, + "grad_norm": 0.052001953125, + "learning_rate": 1.9544613818284207e-06, + "loss": 0.7807, + "num_input_tokens_seen": 77601632, + "step": 133660 + }, + { + "epoch": 19.90840035746202, + "grad_norm": 0.055908203125, + "learning_rate": 1.9231159456917222e-06, + "loss": 0.8026, + "num_input_tokens_seen": 77604320, + "step": 133665 + }, + { + "epoch": 19.909145070002978, + "grad_norm": 0.045166015625, + "learning_rate": 1.8920238868225248e-06, + "loss": 0.8069, + "num_input_tokens_seen": 77607040, + "step": 133670 + }, + { + "epoch": 19.90988978254394, + "grad_norm": 0.05419921875, + "learning_rate": 1.8611852057437427e-06, + "loss": 0.8105, + "num_input_tokens_seen": 77609856, + "step": 133675 + }, + { + "epoch": 19.910634495084896, + "grad_norm": 0.044921875, + "learning_rate": 1.8305999029766262e-06, + "loss": 0.8026, + "num_input_tokens_seen": 77612608, + "step": 133680 + }, + { + "epoch": 19.911379207625856, + "grad_norm": 0.05712890625, + "learning_rate": 1.8002679790374286e-06, + "loss": 0.8042, + "num_input_tokens_seen": 77615584, + "step": 133685 + }, + { + "epoch": 19.912123920166817, + "grad_norm": 0.05615234375, + "learning_rate": 1.7701894344390733e-06, + "loss": 0.8014, + "num_input_tokens_seen": 77618272, + "step": 133690 + }, + { + "epoch": 19.912868632707774, + "grad_norm": 0.059814453125, + "learning_rate": 1.7403642696911524e-06, + "loss": 0.8065, + "num_input_tokens_seen": 77621504, + "step": 133695 + }, + { + "epoch": 19.913613345248734, + "grad_norm": 0.03564453125, + "learning_rate": 1.7107924852949317e-06, + "loss": 0.7958, + "num_input_tokens_seen": 77624064, + "step": 133700 + }, + { + "epoch": 19.914358057789695, + "grad_norm": 0.0380859375, + "learning_rate": 1.681474081751677e-06, + "loss": 0.7867, + "num_input_tokens_seen": 77627168, + "step": 133705 + }, + { + "epoch": 19.915102770330652, + "grad_norm": 0.059326171875, + "learning_rate": 1.6524090595576579e-06, + "loss": 0.8044, + "num_input_tokens_seen": 77630048, + "step": 133710 + }, + { + "epoch": 19.915847482871612, + "grad_norm": 0.06298828125, + "learning_rate": 1.6235974192008174e-06, + "loss": 0.8056, + "num_input_tokens_seen": 77632768, + "step": 133715 + }, + { + "epoch": 19.91659219541257, + "grad_norm": 0.048095703125, + "learning_rate": 1.5950391611707636e-06, + "loss": 0.8162, + "num_input_tokens_seen": 77635968, + "step": 133720 + }, + { + "epoch": 19.91733690795353, + "grad_norm": 0.0498046875, + "learning_rate": 1.5667342859487787e-06, + "loss": 0.799, + "num_input_tokens_seen": 77638752, + "step": 133725 + }, + { + "epoch": 19.91808162049449, + "grad_norm": 0.06298828125, + "learning_rate": 1.5386827940128133e-06, + "loss": 0.804, + "num_input_tokens_seen": 77641728, + "step": 133730 + }, + { + "epoch": 19.918826333035447, + "grad_norm": 0.0299072265625, + "learning_rate": 1.5108846858358227e-06, + "loss": 0.794, + "num_input_tokens_seen": 77644672, + "step": 133735 + }, + { + "epoch": 19.919571045576408, + "grad_norm": 0.041748046875, + "learning_rate": 1.4833399618907617e-06, + "loss": 0.7974, + "num_input_tokens_seen": 77647808, + "step": 133740 + }, + { + "epoch": 19.92031575811737, + "grad_norm": 0.0908203125, + "learning_rate": 1.4560486226389279e-06, + "loss": 0.7914, + "num_input_tokens_seen": 77650848, + "step": 133745 + }, + { + "epoch": 19.921060470658325, + "grad_norm": 0.06640625, + "learning_rate": 1.4290106685449498e-06, + "loss": 0.7939, + "num_input_tokens_seen": 77653664, + "step": 133750 + }, + { + "epoch": 19.921805183199286, + "grad_norm": 0.0257568359375, + "learning_rate": 1.4022261000617986e-06, + "loss": 0.8008, + "num_input_tokens_seen": 77656352, + "step": 133755 + }, + { + "epoch": 19.922549895740243, + "grad_norm": 0.0654296875, + "learning_rate": 1.3756949176457755e-06, + "loss": 0.8131, + "num_input_tokens_seen": 77659264, + "step": 133760 + }, + { + "epoch": 19.923294608281203, + "grad_norm": 0.029296875, + "learning_rate": 1.3494171217431904e-06, + "loss": 0.7999, + "num_input_tokens_seen": 77662144, + "step": 133765 + }, + { + "epoch": 19.924039320822164, + "grad_norm": 0.0849609375, + "learning_rate": 1.3233927127986876e-06, + "loss": 0.797, + "num_input_tokens_seen": 77664960, + "step": 133770 + }, + { + "epoch": 19.92478403336312, + "grad_norm": 0.053955078125, + "learning_rate": 1.2976216912519155e-06, + "loss": 0.7787, + "num_input_tokens_seen": 77667936, + "step": 133775 + }, + { + "epoch": 19.92552874590408, + "grad_norm": 0.0301513671875, + "learning_rate": 1.272104057535861e-06, + "loss": 0.7894, + "num_input_tokens_seen": 77670816, + "step": 133780 + }, + { + "epoch": 19.92627345844504, + "grad_norm": 0.064453125, + "learning_rate": 1.2468398120851764e-06, + "loss": 0.7892, + "num_input_tokens_seen": 77673696, + "step": 133785 + }, + { + "epoch": 19.927018170986, + "grad_norm": 0.03564453125, + "learning_rate": 1.221828955324522e-06, + "loss": 0.7809, + "num_input_tokens_seen": 77676608, + "step": 133790 + }, + { + "epoch": 19.92776288352696, + "grad_norm": 0.06494140625, + "learning_rate": 1.1970714876768928e-06, + "loss": 0.8027, + "num_input_tokens_seen": 77679392, + "step": 133795 + }, + { + "epoch": 19.928507596067917, + "grad_norm": 0.042236328125, + "learning_rate": 1.172567409561953e-06, + "loss": 0.7993, + "num_input_tokens_seen": 77682176, + "step": 133800 + }, + { + "epoch": 19.929252308608877, + "grad_norm": 0.04541015625, + "learning_rate": 1.1483167213910405e-06, + "loss": 0.7929, + "num_input_tokens_seen": 77684736, + "step": 133805 + }, + { + "epoch": 19.929997021149838, + "grad_norm": 0.03125, + "learning_rate": 1.1243194235754926e-06, + "loss": 0.7978, + "num_input_tokens_seen": 77687584, + "step": 133810 + }, + { + "epoch": 19.930741733690795, + "grad_norm": 0.05126953125, + "learning_rate": 1.100575516519986e-06, + "loss": 0.8087, + "num_input_tokens_seen": 77690336, + "step": 133815 + }, + { + "epoch": 19.931486446231755, + "grad_norm": 0.036865234375, + "learning_rate": 1.077085000625866e-06, + "loss": 0.7733, + "num_input_tokens_seen": 77693280, + "step": 133820 + }, + { + "epoch": 19.932231158772712, + "grad_norm": 0.048095703125, + "learning_rate": 1.0538478762911475e-06, + "loss": 0.7958, + "num_input_tokens_seen": 77696096, + "step": 133825 + }, + { + "epoch": 19.932975871313673, + "grad_norm": 0.03857421875, + "learning_rate": 1.0308641439071841e-06, + "loss": 0.8035, + "num_input_tokens_seen": 77698912, + "step": 133830 + }, + { + "epoch": 19.933720583854633, + "grad_norm": 0.05029296875, + "learning_rate": 1.008133803861999e-06, + "loss": 0.7822, + "num_input_tokens_seen": 77701600, + "step": 133835 + }, + { + "epoch": 19.93446529639559, + "grad_norm": 0.0615234375, + "learning_rate": 9.856568565402845e-07, + "loss": 0.8144, + "num_input_tokens_seen": 77704448, + "step": 133840 + }, + { + "epoch": 19.93521000893655, + "grad_norm": 0.039306640625, + "learning_rate": 9.634333023217366e-07, + "loss": 0.7913, + "num_input_tokens_seen": 77707424, + "step": 133845 + }, + { + "epoch": 19.93595472147751, + "grad_norm": 0.047607421875, + "learning_rate": 9.414631415810559e-07, + "loss": 0.7898, + "num_input_tokens_seen": 77710112, + "step": 133850 + }, + { + "epoch": 19.93669943401847, + "grad_norm": 0.048828125, + "learning_rate": 9.197463746912771e-07, + "loss": 0.8033, + "num_input_tokens_seen": 77712992, + "step": 133855 + }, + { + "epoch": 19.93744414655943, + "grad_norm": 0.045654296875, + "learning_rate": 8.982830020171084e-07, + "loss": 0.7878, + "num_input_tokens_seen": 77715936, + "step": 133860 + }, + { + "epoch": 19.938188859100386, + "grad_norm": 0.042236328125, + "learning_rate": 8.770730239215929e-07, + "loss": 0.7996, + "num_input_tokens_seen": 77718880, + "step": 133865 + }, + { + "epoch": 19.938933571641346, + "grad_norm": 0.0289306640625, + "learning_rate": 8.561164407644428e-07, + "loss": 0.8063, + "num_input_tokens_seen": 77721952, + "step": 133870 + }, + { + "epoch": 19.939678284182307, + "grad_norm": 0.05419921875, + "learning_rate": 8.35413252898709e-07, + "loss": 0.7995, + "num_input_tokens_seen": 77724736, + "step": 133875 + }, + { + "epoch": 19.940422996723264, + "grad_norm": 0.0458984375, + "learning_rate": 8.149634606741119e-07, + "loss": 0.8197, + "num_input_tokens_seen": 77727648, + "step": 133880 + }, + { + "epoch": 19.941167709264224, + "grad_norm": 0.07470703125, + "learning_rate": 7.947670644353754e-07, + "loss": 0.7888, + "num_input_tokens_seen": 77730688, + "step": 133885 + }, + { + "epoch": 19.941912421805185, + "grad_norm": 0.037109375, + "learning_rate": 7.748240645255589e-07, + "loss": 0.7872, + "num_input_tokens_seen": 77733472, + "step": 133890 + }, + { + "epoch": 19.942657134346142, + "grad_norm": 0.054443359375, + "learning_rate": 7.551344612793942e-07, + "loss": 0.7917, + "num_input_tokens_seen": 77736416, + "step": 133895 + }, + { + "epoch": 19.943401846887102, + "grad_norm": 0.052734375, + "learning_rate": 7.356982550316137e-07, + "loss": 0.8016, + "num_input_tokens_seen": 77739104, + "step": 133900 + }, + { + "epoch": 19.94414655942806, + "grad_norm": 0.051025390625, + "learning_rate": 7.165154461102885e-07, + "loss": 0.7889, + "num_input_tokens_seen": 77741952, + "step": 133905 + }, + { + "epoch": 19.94489127196902, + "grad_norm": 0.0654296875, + "learning_rate": 6.975860348368278e-07, + "loss": 0.7958, + "num_input_tokens_seen": 77744832, + "step": 133910 + }, + { + "epoch": 19.94563598450998, + "grad_norm": 0.08154296875, + "learning_rate": 6.789100215343069e-07, + "loss": 0.8002, + "num_input_tokens_seen": 77747552, + "step": 133915 + }, + { + "epoch": 19.946380697050937, + "grad_norm": 0.07958984375, + "learning_rate": 6.604874065174737e-07, + "loss": 0.8337, + "num_input_tokens_seen": 77750432, + "step": 133920 + }, + { + "epoch": 19.947125409591898, + "grad_norm": 0.0361328125, + "learning_rate": 6.423181900960805e-07, + "loss": 0.8061, + "num_input_tokens_seen": 77753344, + "step": 133925 + }, + { + "epoch": 19.947870122132855, + "grad_norm": 0.080078125, + "learning_rate": 6.244023725782144e-07, + "loss": 0.7879, + "num_input_tokens_seen": 77756224, + "step": 133930 + }, + { + "epoch": 19.948614834673815, + "grad_norm": 0.06494140625, + "learning_rate": 6.06739954266966e-07, + "loss": 0.775, + "num_input_tokens_seen": 77758944, + "step": 133935 + }, + { + "epoch": 19.949359547214776, + "grad_norm": 0.04150390625, + "learning_rate": 5.893309354604304e-07, + "loss": 0.791, + "num_input_tokens_seen": 77761824, + "step": 133940 + }, + { + "epoch": 19.950104259755733, + "grad_norm": 0.04052734375, + "learning_rate": 5.721753164517063e-07, + "loss": 0.7898, + "num_input_tokens_seen": 77764672, + "step": 133945 + }, + { + "epoch": 19.950848972296694, + "grad_norm": 0.031982421875, + "learning_rate": 5.55273097530562e-07, + "loss": 0.7974, + "num_input_tokens_seen": 77767776, + "step": 133950 + }, + { + "epoch": 19.951593684837654, + "grad_norm": 0.0517578125, + "learning_rate": 5.386242789851004e-07, + "loss": 0.7885, + "num_input_tokens_seen": 77770720, + "step": 133955 + }, + { + "epoch": 19.95233839737861, + "grad_norm": 0.0419921875, + "learning_rate": 5.222288610934322e-07, + "loss": 0.818, + "num_input_tokens_seen": 77773504, + "step": 133960 + }, + { + "epoch": 19.95308310991957, + "grad_norm": 0.03857421875, + "learning_rate": 5.060868441336685e-07, + "loss": 0.774, + "num_input_tokens_seen": 77776800, + "step": 133965 + }, + { + "epoch": 19.95382782246053, + "grad_norm": 0.05810546875, + "learning_rate": 4.901982283805895e-07, + "loss": 0.8278, + "num_input_tokens_seen": 77779776, + "step": 133970 + }, + { + "epoch": 19.95457253500149, + "grad_norm": 0.051513671875, + "learning_rate": 4.7456301409898317e-07, + "loss": 0.821, + "num_input_tokens_seen": 77782720, + "step": 133975 + }, + { + "epoch": 19.95531724754245, + "grad_norm": 0.05908203125, + "learning_rate": 4.591812015553032e-07, + "loss": 0.8058, + "num_input_tokens_seen": 77785600, + "step": 133980 + }, + { + "epoch": 19.956061960083407, + "grad_norm": 0.042236328125, + "learning_rate": 4.440527910093417e-07, + "loss": 0.774, + "num_input_tokens_seen": 77788832, + "step": 133985 + }, + { + "epoch": 19.956806672624367, + "grad_norm": 0.057861328125, + "learning_rate": 4.2917778271589487e-07, + "loss": 0.8118, + "num_input_tokens_seen": 77791776, + "step": 133990 + }, + { + "epoch": 19.957551385165328, + "grad_norm": 0.06201171875, + "learning_rate": 4.145561769264283e-07, + "loss": 0.7913, + "num_input_tokens_seen": 77794720, + "step": 133995 + }, + { + "epoch": 19.958296097706285, + "grad_norm": 0.09619140625, + "learning_rate": 4.001879738890768e-07, + "loss": 0.8092, + "num_input_tokens_seen": 77797728, + "step": 134000 + }, + { + "epoch": 19.959040810247245, + "grad_norm": 0.09326171875, + "learning_rate": 3.8607317384531377e-07, + "loss": 0.7951, + "num_input_tokens_seen": 77800512, + "step": 134005 + }, + { + "epoch": 19.959785522788202, + "grad_norm": 0.03125, + "learning_rate": 3.722117770332822e-07, + "loss": 0.8184, + "num_input_tokens_seen": 77803136, + "step": 134010 + }, + { + "epoch": 19.960530235329163, + "grad_norm": 0.042236328125, + "learning_rate": 3.586037836877942e-07, + "loss": 0.8001, + "num_input_tokens_seen": 77806112, + "step": 134015 + }, + { + "epoch": 19.961274947870123, + "grad_norm": 0.05908203125, + "learning_rate": 3.4524919403866593e-07, + "loss": 0.8042, + "num_input_tokens_seen": 77808960, + "step": 134020 + }, + { + "epoch": 19.96201966041108, + "grad_norm": 0.049072265625, + "learning_rate": 3.321480083123829e-07, + "loss": 0.789, + "num_input_tokens_seen": 77811744, + "step": 134025 + }, + { + "epoch": 19.96276437295204, + "grad_norm": 0.03466796875, + "learning_rate": 3.1930022672876923e-07, + "loss": 0.802, + "num_input_tokens_seen": 77814848, + "step": 134030 + }, + { + "epoch": 19.963509085493, + "grad_norm": 0.041259765625, + "learning_rate": 3.0670584950598376e-07, + "loss": 0.808, + "num_input_tokens_seen": 77817952, + "step": 134035 + }, + { + "epoch": 19.96425379803396, + "grad_norm": 0.054931640625, + "learning_rate": 2.9436487685718935e-07, + "loss": 0.794, + "num_input_tokens_seen": 77820832, + "step": 134040 + }, + { + "epoch": 19.96499851057492, + "grad_norm": 0.037841796875, + "learning_rate": 2.8227730898888745e-07, + "loss": 0.8008, + "num_input_tokens_seen": 77823936, + "step": 134045 + }, + { + "epoch": 19.965743223115876, + "grad_norm": 0.0257568359375, + "learning_rate": 2.704431461059142e-07, + "loss": 0.7845, + "num_input_tokens_seen": 77826912, + "step": 134050 + }, + { + "epoch": 19.966487935656836, + "grad_norm": 0.031494140625, + "learning_rate": 2.588623884097751e-07, + "loss": 0.8049, + "num_input_tokens_seen": 77829888, + "step": 134055 + }, + { + "epoch": 19.967232648197797, + "grad_norm": 0.06689453125, + "learning_rate": 2.4753503609531434e-07, + "loss": 0.788, + "num_input_tokens_seen": 77832704, + "step": 134060 + }, + { + "epoch": 19.967977360738754, + "grad_norm": 0.04931640625, + "learning_rate": 2.364610893540453e-07, + "loss": 0.8117, + "num_input_tokens_seen": 77835648, + "step": 134065 + }, + { + "epoch": 19.968722073279714, + "grad_norm": 0.1103515625, + "learning_rate": 2.2564054837082015e-07, + "loss": 0.8061, + "num_input_tokens_seen": 77838688, + "step": 134070 + }, + { + "epoch": 19.969466785820675, + "grad_norm": 0.04638671875, + "learning_rate": 2.1507341333215635e-07, + "loss": 0.7868, + "num_input_tokens_seen": 77841632, + "step": 134075 + }, + { + "epoch": 19.970211498361632, + "grad_norm": 0.09326171875, + "learning_rate": 2.0475968441291404e-07, + "loss": 0.7998, + "num_input_tokens_seen": 77844544, + "step": 134080 + }, + { + "epoch": 19.970956210902592, + "grad_norm": 0.0810546875, + "learning_rate": 1.94699361791284e-07, + "loss": 0.7966, + "num_input_tokens_seen": 77847648, + "step": 134085 + }, + { + "epoch": 19.97170092344355, + "grad_norm": 0.042236328125, + "learning_rate": 1.848924456337997e-07, + "loss": 0.7938, + "num_input_tokens_seen": 77850432, + "step": 134090 + }, + { + "epoch": 19.97244563598451, + "grad_norm": 0.05810546875, + "learning_rate": 1.7533893610699457e-07, + "loss": 0.7942, + "num_input_tokens_seen": 77853440, + "step": 134095 + }, + { + "epoch": 19.97319034852547, + "grad_norm": 0.0390625, + "learning_rate": 1.6603883337407143e-07, + "loss": 0.7989, + "num_input_tokens_seen": 77856192, + "step": 134100 + }, + { + "epoch": 19.973935061066427, + "grad_norm": 0.05322265625, + "learning_rate": 1.5699213758990638e-07, + "loss": 0.7751, + "num_input_tokens_seen": 77859392, + "step": 134105 + }, + { + "epoch": 19.974679773607388, + "grad_norm": 0.053955078125, + "learning_rate": 1.4819884890771017e-07, + "loss": 0.8192, + "num_input_tokens_seen": 77862304, + "step": 134110 + }, + { + "epoch": 19.97542448614835, + "grad_norm": 0.04052734375, + "learning_rate": 1.3965896747736295e-07, + "loss": 0.8138, + "num_input_tokens_seen": 77865344, + "step": 134115 + }, + { + "epoch": 19.976169198689306, + "grad_norm": 0.05517578125, + "learning_rate": 1.3137249344208344e-07, + "loss": 0.7975, + "num_input_tokens_seen": 77868320, + "step": 134120 + }, + { + "epoch": 19.976913911230266, + "grad_norm": 0.0361328125, + "learning_rate": 1.2333942694175982e-07, + "loss": 0.7987, + "num_input_tokens_seen": 77871456, + "step": 134125 + }, + { + "epoch": 19.977658623771223, + "grad_norm": 0.051025390625, + "learning_rate": 1.1555976811294942e-07, + "loss": 0.802, + "num_input_tokens_seen": 77874656, + "step": 134130 + }, + { + "epoch": 19.978403336312184, + "grad_norm": 0.04541015625, + "learning_rate": 1.0803351708721375e-07, + "loss": 0.8094, + "num_input_tokens_seen": 77877536, + "step": 134135 + }, + { + "epoch": 19.979148048853144, + "grad_norm": 0.0712890625, + "learning_rate": 1.0076067398945287e-07, + "loss": 0.8023, + "num_input_tokens_seen": 77880704, + "step": 134140 + }, + { + "epoch": 19.9798927613941, + "grad_norm": 0.04150390625, + "learning_rate": 9.37412389462322e-08, + "loss": 0.7932, + "num_input_tokens_seen": 77883840, + "step": 134145 + }, + { + "epoch": 19.98063747393506, + "grad_norm": 0.0238037109375, + "learning_rate": 8.697521207245984e-08, + "loss": 0.7978, + "num_input_tokens_seen": 77886720, + "step": 134150 + }, + { + "epoch": 19.98138218647602, + "grad_norm": 0.056640625, + "learning_rate": 8.046259348304385e-08, + "loss": 0.7722, + "num_input_tokens_seen": 77889664, + "step": 134155 + }, + { + "epoch": 19.98212689901698, + "grad_norm": 0.0498046875, + "learning_rate": 7.420338329122699e-08, + "loss": 0.7916, + "num_input_tokens_seen": 77892448, + "step": 134160 + }, + { + "epoch": 19.98287161155794, + "grad_norm": 0.048583984375, + "learning_rate": 6.819758159859468e-08, + "loss": 0.7972, + "num_input_tokens_seen": 77895072, + "step": 134165 + }, + { + "epoch": 19.983616324098897, + "grad_norm": 0.048828125, + "learning_rate": 6.244518851006297e-08, + "loss": 0.8127, + "num_input_tokens_seen": 77897792, + "step": 134170 + }, + { + "epoch": 19.984361036639857, + "grad_norm": 0.04736328125, + "learning_rate": 5.6946204120555954e-08, + "loss": 0.8112, + "num_input_tokens_seen": 77900672, + "step": 134175 + }, + { + "epoch": 19.985105749180818, + "grad_norm": 0.0947265625, + "learning_rate": 5.1700628523332346e-08, + "loss": 0.8235, + "num_input_tokens_seen": 77903488, + "step": 134180 + }, + { + "epoch": 19.985850461721775, + "grad_norm": 0.039794921875, + "learning_rate": 4.670846180832022e-08, + "loss": 0.7935, + "num_input_tokens_seen": 77906336, + "step": 134185 + }, + { + "epoch": 19.986595174262735, + "grad_norm": 0.037353515625, + "learning_rate": 4.1969704057120967e-08, + "loss": 0.8051, + "num_input_tokens_seen": 77908960, + "step": 134190 + }, + { + "epoch": 19.987339886803692, + "grad_norm": 0.034912109375, + "learning_rate": 3.7484355353001316e-08, + "loss": 0.8039, + "num_input_tokens_seen": 77911872, + "step": 134195 + }, + { + "epoch": 19.988084599344653, + "grad_norm": 0.03662109375, + "learning_rate": 3.3252415770901324e-08, + "loss": 0.8109, + "num_input_tokens_seen": 77914496, + "step": 134200 + }, + { + "epoch": 19.988829311885613, + "grad_norm": 0.039794921875, + "learning_rate": 2.9273885380765028e-08, + "loss": 0.7852, + "num_input_tokens_seen": 77917472, + "step": 134205 + }, + { + "epoch": 19.98957402442657, + "grad_norm": 0.037841796875, + "learning_rate": 2.5548764250871158e-08, + "loss": 0.8004, + "num_input_tokens_seen": 77920352, + "step": 134210 + }, + { + "epoch": 19.99031873696753, + "grad_norm": 0.039306640625, + "learning_rate": 2.2077052442837086e-08, + "loss": 0.785, + "num_input_tokens_seen": 77923168, + "step": 134215 + }, + { + "epoch": 19.99106344950849, + "grad_norm": 0.04736328125, + "learning_rate": 1.885875001828019e-08, + "loss": 0.803, + "num_input_tokens_seen": 77926368, + "step": 134220 + }, + { + "epoch": 19.99180816204945, + "grad_norm": 0.060791015625, + "learning_rate": 1.5893857028825842e-08, + "loss": 0.8026, + "num_input_tokens_seen": 77929440, + "step": 134225 + }, + { + "epoch": 19.99255287459041, + "grad_norm": 0.08544921875, + "learning_rate": 1.3182373526099411e-08, + "loss": 0.8162, + "num_input_tokens_seen": 77932192, + "step": 134230 + }, + { + "epoch": 19.993297587131366, + "grad_norm": 0.0439453125, + "learning_rate": 1.072429955506493e-08, + "loss": 0.7904, + "num_input_tokens_seen": 77935136, + "step": 134235 + }, + { + "epoch": 19.994042299672326, + "grad_norm": 0.038330078125, + "learning_rate": 8.519635157355765e-09, + "loss": 0.81, + "num_input_tokens_seen": 77938048, + "step": 134240 + }, + { + "epoch": 19.994787012213287, + "grad_norm": 0.0458984375, + "learning_rate": 6.5683803712746065e-09, + "loss": 0.8076, + "num_input_tokens_seen": 77940864, + "step": 134245 + }, + { + "epoch": 19.995531724754244, + "grad_norm": 0.03076171875, + "learning_rate": 4.8705352284628135e-09, + "loss": 0.7787, + "num_input_tokens_seen": 77943904, + "step": 134250 + }, + { + "epoch": 19.996276437295204, + "grad_norm": 0.05078125, + "learning_rate": 3.4260997572310714e-09, + "loss": 0.8164, + "num_input_tokens_seen": 77946912, + "step": 134255 + }, + { + "epoch": 19.997021149836165, + "grad_norm": 0.06640625, + "learning_rate": 2.2350739842247333e-09, + "loss": 0.7934, + "num_input_tokens_seen": 77949824, + "step": 134260 + }, + { + "epoch": 19.997765862377122, + "grad_norm": 0.0546875, + "learning_rate": 1.2974579277624798e-09, + "loss": 0.7855, + "num_input_tokens_seen": 77953024, + "step": 134265 + }, + { + "epoch": 19.998510574918082, + "grad_norm": 0.05810546875, + "learning_rate": 6.132516028323209e-10, + "loss": 0.8183, + "num_input_tokens_seen": 77956032, + "step": 134270 + }, + { + "epoch": 19.99925528745904, + "grad_norm": 0.040283203125, + "learning_rate": 1.8245502275693325e-10, + "loss": 0.7991, + "num_input_tokens_seen": 77959072, + "step": 134275 + }, + { + "epoch": 20.0, + "grad_norm": 0.0693359375, + "learning_rate": 5.068195862989455e-12, + "loss": 0.8063, + "num_input_tokens_seen": 77961608, + "step": 134280 + }, + { + "epoch": 20.0, + "eval_loss": 0.7992360591888428, + "eval_runtime": 70.5641, + "eval_samples_per_second": 42.288, + "eval_steps_per_second": 10.572, + "num_input_tokens_seen": 77961608, + "step": 134280 + }, + { + "epoch": 20.0, + "num_input_tokens_seen": 77961608, + "step": 134280, + "total_flos": 3.5105759720879555e+18, + "train_loss": 0.8024387844978857, + "train_runtime": 29897.2604, + "train_samples_per_second": 17.964, + "train_steps_per_second": 4.491 + } + ], + "logging_steps": 5, + "max_steps": 134280, + "num_input_tokens_seen": 77961608, + "num_train_epochs": 20, + "save_steps": 6714, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.5105759720879555e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}